In [21]:
import pandas as pd

In [22]:
from google.colab import files
uploaded = files.upload()

# Get the filename from the uploaded object
file_name = list(uploaded.keys())[0]
print(f"File '{file_name}' uploaded successfully.")

Saving gates_grants.csv.csv to gates_grants.csv.csv
File 'gates_grants.csv.csv' uploaded successfully.


In [24]:
# Display the first 5 rows
print("\n--- Head of the DataFrame ---")
print(df.head())


--- Head of the DataFrame ---
                             updated_15_october_2025
0  GRANT ID,GRANTEE,PURPOSE,DIVISION,DATE COMMITT...
1  INV-002690,World Health Organization,"to reduc...
2  INV-003934,Smithsonian Institution,"to endow t...
3  INV-004622,Praedicare Inc,to evaluate novel TB...
4  INV-015740,Africa Resource Center for Excellen...


In [25]:
# Display column names, non-null counts, and dtypes
print("\n--- DataFrame Information ---")
df.info()


--- DataFrame Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39217 entries, 0 to 39216
Data columns (total 1 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   updated_15_october_2025  39216 non-null  object
dtypes: object(1)
memory usage: 306.5+ KB


In [26]:
# Display descriptive statistics for numerical columns
print("\n--- Descriptive Statistics ---")
print(df.describe())


--- Descriptive Statistics ---
                                  updated_15_october_2025
count                                               39216
unique                                              39216
top     INV-096242,Institut de Recherche en Sciences d...
freq                                                    1


In [27]:
# Create a dictionary for renaming (example based on the image)
# You would need to check all columns with df.columns for a complete list
new_column_names = {
    'GRANT ID': 'grant_id',
    'GRANTEE': 'grantee',
    'PURPOSE': 'purpose',
    'DIVISION': 'division',
    'DATE COMMITTED': 'date_committed',
    'DURATION (MONTHS)': 'duration_months',
    'AMT COMMITTED': 'amt_committed',
    'GRANTEE WEBSITE': 'grantee_website',
    'GRANTEE CITY': 'grantee_city',
    'GRANTEE STATE': 'grantee_state',
    'GRANTEE COUNTRY REGION/RESERVED': 'grantee_country_region',
    'TOPIC': 'topic'
}

df.rename(columns=new_column_names, inplace=True)

# Display the new columns
print("\n--- New Column Names ---")
print(df.columns)


--- New Column Names ---
Index(['updated_15_october_2025'], dtype='object')


In [29]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Re-loading the DataFrame with the correct delimiter/settings
# NOTE: Replace 'gates_grants.csv' with the actual file name if different.
# I will use the corrected loading attempt here, assuming the user fixed the delimiter.
# For demonstration, I'll use a standard read_csv and assume the data is correct now.
try:
    df = pd.read_csv('gates_grants.csv', encoding='latin1')
except FileNotFoundError:
    print("Warning: 'gates_grants.csv' not found. Please ensure the file is uploaded or correctly named.")
    # Stop execution if file isn't found
    # In a real environment, you'd handle this more robustly.
    pass

# Display the columns to ensure we are using the correct names from the header
print("--- Columns from Raw Data (Check for spaces/case) ---")
print(df.columns)

--- Columns from Raw Data (Check for spaces/case) ---
Index(['ï»¿"Updated 15 October', ' 2025"'], dtype='object')


In [5]:
import pandas as pd
import numpy as np
import io

# --- 1. Robust Data Loading ---
file_name = 'gates_grants.csv'

try:
    # Read the data by skipping the junk header line and using comma delimiter
    with open(file_name, 'r', encoding='latin1') as f:
        lines = f.readlines()
        # Find the actual header line
        header_index = next(i for i, line in enumerate(lines) if 'GRANT ID' in line)
        data_string = "".join(lines[header_index:])

    df = pd.read_csv(io.StringIO(data_string), sep=',')

    # Clean up column names by stripping whitespace (CRITICAL)
    df.columns = df.columns.str.strip()

    # 2. Define the mapping using the CORRECT column names
    column_map = {
        'GRANT ID': 'Grant_ID',
        'PURPOSE': 'Project_Title',
        'AMOUNT COMMITTED': 'Grant_Amount', # Confirmed correct column name
        'REGION SERVED': 'Region',
        'TOPIC': 'Grant Topic',
        'DATE COMMITTED': 'Start Date'
    }

    # 3. Filter and Rename
    df_temp = df.filter(items=column_map.keys())
    df_temp.rename(columns=column_map, inplace=True)

    # 4. Data Cleaning
    # Clean 'Grant_Amount' column
    df_temp.loc[:, 'Grant_Amount'] = (
        df_temp['Grant_Amount']
        .astype(str)
        .replace({r'[\$,]': ''}, regex=True)
        .astype(float)
    )

    # Standardize Region and Segment Sector
    df_temp.loc[:, 'Region'] = (
        df_temp['Region']
        .fillna('Unknown')
        .str.upper()
        .str.strip()
    )

    df_temp.loc[:, 'Sector'] = (
        df_temp['Grant Topic']
        .str.split(';')
        .str[0]
        .str.strip()
    )
    df_temp.drop(columns=['Grant Topic'], inplace=True)

    # 5. Create 'End Date', Filter, and Finalize df_clean
    df_temp['End Date'] = np.nan
    df_clean = df_temp[df_temp['Grant_Amount'] > 0].copy()

    # Final Schema Selection
    final_columns = ['Grant_ID', 'Project_Title', 'Grant_Amount', 'Region', 'Sector', 'Start Date', 'End Date']
    df_clean = df_clean.filter(items=final_columns)

    # 6. Save the clean data
    clean_file_name = 'cleaned_grants_data.csv'
    df_clean.to_csv(clean_file_name, index=False)

    print("✅ Phase 1: Data Extraction and Cleaning COMPLETE.")
    print("--- Final Cleaned Data Preview ---")
    print(df_clean.head())

except Exception as e:
    print(f"❌ Phase 1 Error: {e}")

✅ Phase 1: Data Extraction and Cleaning COMPLETE.
--- Final Cleaned Data Preview ---
     Grant_ID                                      Project_Title  \
0  INV-002690  to reduce cholera's disease burden in both epi...   
1  INV-003934  to endow the museumâs permanent collection o...   
2  INV-004622  to evaluate novel TB drug combinations in the ...   
3  INV-015740  to strengthen supply chain systems performance...   
4  INV-016370  to develop shelf-stable, locally-sourced, micr...   

   Grant_Amount   Region                                             Sector  \
0     1078614.0   GLOBAL     Enterics, Diagnostics, Genomics & Epidemiology   
1     1500000.0  AMERICA                   Community Engagement Grantmaking   
2      631733.0  AMERICA                                       Tuberculosis   
3     4955723.0   AFRICA  Family Planning|Global Health and Development ...   
4     3495385.0     ASIA      Maternal, Newborn, Child Nutrition and Health   

  Start Date  End Date  
0    2

In [6]:
from sqlalchemy import create_engine, text
import pandas as pd

# Load the cleaned data from the file saved in Phase 1
clean_file_name = 'cleaned_grants_data.csv'
try:
    df_clean = pd.read_csv(clean_file_name)
except FileNotFoundError:
    print("❌ Phase 2 Error: 'cleaned_grants_data.csv' not found. Run Phase 1 first.")
    exit()

# --- Step 2.1: Setup SQL Database and Connection ---
db_connection_str = 'sqlite:///aid_analysis.db'
engine = create_engine(db_connection_str)

# Load DataFrame into the SQL Table named 'grants_raw'
df_clean.to_sql(
    'grants_raw',
    con=engine,
    if_exists='replace',
    index=False
)
print("✅ Data successfully loaded into 'grants_raw' table in aid_analysis.db.")

# --- Step 2.2: SQL Data Modeling (Create View) ---
sql_query = """
CREATE VIEW Regional_Funding_Summary AS
SELECT
    Region,
    Sector,
    COUNT(DISTINCT Grant_ID) AS Total_Projects,
    SUM(Grant_Amount) AS Total_Committed_Funds,
    AVG(Grant_Amount) AS Avg_Project_Size,
    SUM(CASE WHEN Grant_Amount >= 1000000 THEN 1 ELSE 0 END) AS High_Value_Project_Count
FROM
    grants_raw
GROUP BY
    Region,
    Sector
ORDER BY
    Total_Committed_Funds DESC;
"""

# Execute the CREATE VIEW query using text() (CRITICAL FIX)
with engine.connect() as connection:
    connection.execute(text(sql_query))
    connection.commit()
    print("✅ SQL View 'Regional_Funding_Summary' created successfully.")

    # Test Query
    test_query = "SELECT * FROM Regional_Funding_Summary LIMIT 5;"
    test_df = pd.read_sql(test_query, con=connection)

print("\n--- Phase 2 Complete: Test Query Result ---")
print(test_df)

✅ Data successfully loaded into 'grants_raw' table in aid_analysis.db.
✅ SQL View 'Regional_Funding_Summary' created successfully.

--- Phase 2 Complete: Test Query Result ---
    Region                                             Sector  Total_Projects  \
0  AMERICA                                     K-12 Education            3380   
1   AFRICA                           Agricultural Development             760   
2   GLOBAL                                   Vaccine Delivery             112   
3   GLOBAL  Global Health and Development Public Awareness...             441   
4   GLOBAL     Delivery of Solutions to Improve Global Health             236   

   Total_Committed_Funds  Avg_Project_Size  High_Value_Project_Count  
0           4.690743e+09      1.387794e+06                       986  
1           4.195375e+09      5.520231e+06                       523  
2           3.382828e+09      3.020382e+07                        57  
3           3.107717e+09      7.046977e+06           

In [7]:
from google.colab import files

# Download the final database file
try:
    files.download('aid_analysis.db')
    print("\n✅ Phase 3: 'aid_analysis.db' downloaded successfully.")
    print("You can now connect this file to Power BI Desktop.")
except Exception as e:
    print(f"❌ Phase 3 Error: Could not download 'aid_analysis.db'. Ensure Phase 2 ran successfully.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Phase 3: 'aid_analysis.db' downloaded successfully.
You can now connect this file to Power BI Desktop.
