In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import cdist

In [27]:
# Read the student-level dataset from its Stata (.dta) file and preview the first rows
stata_source = 'data/springfield_fixed.dta'
df = pd.read_stata(stata_source)
df.head()

Unnamed: 0,SchoolID,student_id,Lat,Lon,TreatedSchool,SES,female,baseline_grades,cell_phone_user,final_grades
0,1.0,1.0,73.860489,76.908592,0.0,5.058456,1.0,78.0,1.0,76.0
1,1.0,2.0,73.860489,76.908592,0.0,5.414133,0.0,66.0,1.0,70.0
2,1.0,3.0,73.860489,76.908592,0.0,4.853347,1.0,70.0,1.0,70.0
3,1.0,4.0,73.860489,76.908592,0.0,3.902148,0.0,68.0,1.0,72.0
4,1.0,5.0,73.860489,76.908592,0.0,5.37835,0.0,73.0,1.0,73.0


In [32]:
# Attach each school's enrolment (number of non-missing student_id values in the
# school) to every one of its student rows, then display the first 700 rows
enrolment_by_row = df.groupby("SchoolID")["student_id"].transform("count")
df["StudentCount"] = enrolment_by_row
df.head(700)

Unnamed: 0,SchoolID,student_id,Lat,Lon,TreatedSchool,SES,female,baseline_grades,cell_phone_user,final_grades,StudentCount
0,1.0,1.0,73.860489,76.908592,0.0,5.058456,1.0,78.0,1.0,76.0,50
1,1.0,2.0,73.860489,76.908592,0.0,5.414133,0.0,66.0,1.0,70.0,50
2,1.0,3.0,73.860489,76.908592,0.0,4.853347,1.0,70.0,1.0,70.0,50
3,1.0,4.0,73.860489,76.908592,0.0,3.902148,0.0,68.0,1.0,72.0,50
4,1.0,5.0,73.860489,76.908592,0.0,5.378350,0.0,73.0,1.0,73.0,50
...,...,...,...,...,...,...,...,...,...,...,...
695,10.0,68.0,11.538972,35.308525,1.0,-4.816131,0.0,49.0,1.0,43.0,92
696,10.0,69.0,11.538972,35.308525,1.0,-3.960096,0.0,48.0,1.0,53.0,92
697,10.0,70.0,11.538972,35.308525,1.0,-5.339378,0.0,57.0,0.0,64.0,92
698,10.0,71.0,11.538972,35.308525,1.0,-4.406677,1.0,62.0,0.0,61.0,92


In [39]:
def calculate_and_save_spillover_effects(df, output_path, ranges=((0, 10), (10, 20))):
    """
    Calculate spillover exposure for schools by distance band and save to Stata format.

    For every school, and for every distance band, this sums the enrolment
    (StudentCount) and the number of cell phone users of the schools whose
    distance from it falls inside the band. The school-level totals are then
    attached to every student row of that school and the result is written to
    a Stata file.

    Distances are straight-line (Euclidean, the scipy ``cdist`` default)
    distances computed directly on the Lat/Lon values, so the band limits are
    expressed in the same units as those columns.

    Each band is the half-open interval (start, end]: a school exactly `end`
    away is included, one exactly `start` away is not. Because start >= 0, a
    school never counts itself (its own distance is 0); a *different* school
    at distance 0 is likewise excluded.

    Parameters:
    df: pandas DataFrame with columns SchoolID, student_id, Lat, Lon, TreatedSchool,
        SES, female, baseline_grades, cell_phone_user, final_grades, StudentCount.
        It is not modified. If it already contains spillover columns from an
        earlier run (e.g. because it was loaded from a previously saved
        output), they are discarded and recomputed.
    output_path: str, path where to save the Stata file
    ranges: iterable of (start, end) pairs defining the distance bands.
        Defaults to (0, 10) and (10, 20). Each band adds the columns
        nearby_pupils_<start><end> and nearby_cell_phone_users_<start><end>.

    Returns:
    pandas DataFrame with one row per student row of `df` whose SchoolID is
    matched, carrying the original columns plus the spillover columns.

    Raises:
    ValueError: if a band does not satisfy 0 <= start < end.
    """
    bands = [(start, end) for start, end in ranges]
    for start, end in bands:
        if not 0 <= start < end:
            raise ValueError(
                f"Invalid distance range ({start}, {end}): expected 0 <= start < end"
            )

    # Discard spillover columns left over from a previous run. Without this,
    # the merge below would emit `_x`/`_y` suffixed duplicates instead of
    # refreshing the values.
    spillover_cols = [
        f'{prefix}_{start}{end}'
        for start, end in bands
        for prefix in ('nearby_pupils', 'nearby_cell_phone_users')
    ]
    students = df.drop(columns=spillover_cols, errors='ignore')

    # One row per school: location, enrolment and total cell phone users
    schools_df = students.groupby('SchoolID').agg({
        'Lat': 'first',
        'Lon': 'first',
        'StudentCount': 'first',
        'cell_phone_user': 'sum'  # Total number of cell phone users in each school
    }).reset_index()

    # Pairwise school-to-school distance matrix (row i / column j follow the
    # row order of schools_df)
    coords = schools_df[['Lat', 'Lon']].to_numpy()
    distances = cdist(coords, coords)

    # Missing totals contribute nothing, matching the NaN-skipping behaviour
    # of a pandas sum
    pupils = schools_df['StudentCount'].fillna(0).to_numpy()
    phone_users = schools_df['cell_phone_user'].fillna(0).to_numpy()

    for start, end in bands:
        range_name = f"{start}{end}"

        # in_range[i, j] is True when school j lies in (start, end] of school i;
        # multiplying by a per-school total sums that total over the
        # in-range neighbours of every school at once.
        in_range = (distances > start) & (distances <= end)
        schools_df[f'nearby_pupils_{range_name}'] = in_range @ pupils
        schools_df[f'nearby_cell_phone_users_{range_name}'] = in_range @ phone_users

    # Attach the school-level totals to every student row; `validate` guards
    # against a duplicated SchoolID silently multiplying student rows.
    result_df = students.merge(
        schools_df.drop(columns=['Lat', 'Lon', 'StudentCount', 'cell_phone_user']),
        on='SchoolID',
        validate='many_to_one',
    )

    # Save to Stata format
    result_df.to_stata(output_path, write_index=False)

    return result_df

# Execute the function
# The destination lives in a single variable so the confirmation message always
# names the file that was actually written.
# NOTE(review): this is the same file the notebook loads `df` from, so the input
# data is overwritten in place — confirm this is intended.
output_path = 'data/springfield_fixed.dta'
result = calculate_and_save_spillover_effects(df, output_path)
print(f"Data has been processed and saved to {output_path}")

Data has been processed and saved to data/springfield_fixed.dta
