In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
import seaborn as sns
import dask.dataframe as dd

In [21]:
# set the path to the data
data_in = "ipumsi_00036.parquet"
path = "E:/680_temp/Daily_temp"

# read the data
ddf = dd.read_parquet(f"{path}\\{data_in}")

# rename the columns and assign to a new Dask DataFrame
ddf = ddf.rename(columns={'country': 'cntry', 'migrate5': 'MGRATE5', 'migratep': 'mgratep'})

# create a new variable
ddf = ddf.assign(migrate=None)

# dealing with the migration variable for Costa Rica
# if country is Costa Rica and migration status is provincial (20) or national (30), set migration to 1
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 188) & (ddf['MGRATE5'].isin([20, 30])), 1))

# if country is Costa Rica and MGRATE5 is 10, 11, or 12, set migration to 0
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 188) & (ddf['MGRATE5'].isin([10, 11, 12])), 0))

# deal with the migration variable for the Dominican Republic
# if country is the Dominican Republic and migration status is provincial (20) or national (30), set migration to 1
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 214) & (ddf['MGRATE5'].isin([20, 30])), 1))

# if country is the Dominican Republic and MGRATE5 is 10, 11, or 12, set migration to 0
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 214) & (ddf['MGRATE5'].isin([10, 11, 12])), 0))

# Deal with the migration variable for El Salvador

# Set migrate to 1 if country is El Salvador and mgratep shows migration on either Province level(20) or Country level(30)
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 222) & (ddf['mgratep'].isin([20, 30])), 1))

# Set migrate to missing if country is El Salvador and mgratep is 0, 98, or 99
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 222) & (ddf['mgratep'].isin([0, 98, 99])), None))

# Set migrate to 0 if country is El Salvador and mgratep is 10, 11, or 12
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 222) & (ddf['mgratep'].isin([10, 11, 12])), 0))

# Deal with the migration variable for Haiti

# Set migrate to 1 if country is Haiti and MGRATE5 shows migration on either Province level(20) or Country level(30)
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 332) & (ddf['MGRATE5'].isin([20, 30])), 1))

# No need to modify migrate variable if country is Haiti and MGRATE5 is 0, 98, or 99
# This step is actually unnecessary in Python, as we have already initialized all values to None

# Set migrate to 0 if country is Haiti and MGRATE5 is 10, 11, or 12
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 332) & (ddf['MGRATE5'].isin([10, 11, 12])), 0))

# Deal with the migration variable for Jamaica

# Set migrate to 1 if country is Jamaica and mgratep shows migration on either Parish level(20) or Country level(30)
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 388) & (ddf['mgratep'].isin([20, 30])), 1))

# No need to modify migrate variable if country is Jamaica and mgratep is 0, 98, or 99
# This step is actually unnecessary in Python, as we have already initialized all values to None

# Set migrate to 0 if country is Jamaica and mgratep is 10, 11, or 12
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 388) & (ddf['mgratep'].isin([10, 11, 12])), 0))

# Deal with the migration variable for Mexico

# Set migrate to 1 if country is Mexico and MGRATE5 shows migration on either State level(20) or Country level(30)
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 484) & (ddf['MGRATE5'].isin([20, 30])), 1))

# No need to modify migrate variable if country is Mexico and MGRATE5 is 0, 98, or 99
# This step is actually unnecessary in Python, as we have already initialized all values to None

# Set migrate to 0 if country is Mexico and MGRATE5 is 10, 11, or 12
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 484) & (ddf['MGRATE5'].isin([10, 11, 12])), 0))

# Deal with the migration variable for Nicaragua


# Set migrate to 1 if country is Nicaragua and MGRATE5 shows migration on either Department level(20) or Country level(30)
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 558) & (ddf['MGRATE5'].isin([20, 30])), 1))

# Set migrate to missing if country is Nicaragua and MGRATE5 is 0, 98, or 99
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 558) & (ddf['MGRATE5'].isin([0, 98, 99])), None))

# Set migrate to 0 if country is Nicaragua and MGRATE5 is 10, 11, or 12
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 558) & (ddf['MGRATE5'].isin([10, 11, 12])), 0))


In [22]:
import numpy as np

# Handling the migrate variable for Panama

# Set migrate to 1 if country is Panama and mgratep shows migration on either province or country level
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 591) & (ddf['mgratep'].isin([20, 30])), 1))

# Set migrate to NaN if country is Panama and mgratep indicates missing or unknown migration status
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 591) & (ddf['mgratep'].isin([0, 98, 99])), np.nan))

# Set migrate to 0 if country is Panama and mgratep indicates no significant migration
ddf = ddf.assign(migrate=ddf['migrate'].where((ddf['cntry'] == 591) & (ddf['mgratep'].isin([10, 11, 12])), 0))

# Dropping records where migrate is missing
ddf = ddf.dropna(subset=['migrate'])


In [25]:
import dask.dataframe as dd

# Generating the waveline variable


# Costa Rica
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 188) & (ddf['year'] == 2011), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 188) & (ddf['year'] == 2000), 0))

# Dominican Republic
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 214) & (ddf['year'] == 2010), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 214) & (ddf['year'] == 2002), 0))

# El Salvador
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 222) & (ddf['year'] == 2007), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 222) & (ddf['year'] == 1992), 0))

# Haiti
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 332) & (ddf['year'] == 2003), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 332) & (ddf['year'] == 1982), 0))

# Jamaica
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 388) & (ddf['year'] == 2001), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 388) & (ddf['year'] == 1991), 0))

# Mexico
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 484) & (ddf['year'] == 2010), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 484) & (ddf['year'] == 2000), 0))

# Nicaragua
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 558) & (ddf['year'] == 2005), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 558) & (ddf['year'] == 1995), 0))

# Panama
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 591) & (ddf['year'] == 2010), 1))
ddf = ddf.assign(wave=ddf['wave'].where((ddf['cntry'] == 591) & (ddf['year'] == 2000), 0))


In [32]:
# Generating the pre_dist variable

# Initialize pre_dist variable with missing values
ddf['pre_dist'] = np.nan

# Costa Rica
ddf = ddf.assign(pre_dist=ddf['pre_dist'].where((ddf['cntry'] == 188) & (ddf['migcr2'] != 9), 188000 + ddf['migcr2']))
ddf = ddf.assign(pre_dist=ddf['pre_dist'].where((ddf['cntry'] == 188) & (ddf['migrate'] == 0), ddf['geolev1']))

# Dominican Republic
ddf = ddf.assign(twodigit=ddf['migdo'] // 100)
ddf = ddf.assign(pre_dist=ddf['pre_dist'].where((ddf['cntry'] == 214) & (ddf['twodigit'] <= 50), 214000 + ddf['twodigit']))
ddf = ddf.assign(pre_dist=ddf['pre_dist'].where((ddf['cntry'] == 214) & (ddf['migrate'] == 0), ddf['geolev1']))


ValueError: Metadata inference failed in `add`.

Original error is below:
------------------------
TypeError("unsupported operand type(s) for +: 'int' and 'Categorical'")

Traceback:
---------
  File "d:\PYthon\lib\site-packages\dask\dataframe\utils.py", line 193, in raise_on_meta_error
    yield
  File "d:\PYthon\lib\site-packages\dask\dataframe\core.py", line 6487, in elemwise
    meta = partial_by_order(*parts, function=op, other=other)
  File "d:\PYthon\lib\site-packages\dask\utils.py", line 1327, in partial_by_order
    return function(*args2, **kwargs)
  File "d:\PYthon\lib\site-packages\pandas\core\ops\common.py", line 81, in new_method
    return method(self, other)
  File "d:\PYthon\lib\site-packages\pandas\core\arraylike.py", line 190, in __radd__
    return self._arith_method(other, roperator.radd)
  File "d:\PYthon\lib\site-packages\pandas\core\series.py", line 6112, in _arith_method
    return base.IndexOpsMixin._arith_method(self, other, op)
  File "d:\PYthon\lib\site-packages\pandas\core\base.py", line 1348, in _arith_method
    result = ops.arithmetic_op(lvalues, rvalues, op)
  File "d:\PYthon\lib\site-packages\pandas\core\ops\array_ops.py", line 224, in arithmetic_op
    res_values = op(left, right)
  File "d:\PYthon\lib\site-packages\pandas\core\roperator.py", line 11, in radd
    return right + left


In [None]:
# El Salvador
df.loc[(df['cntry'] == 222) & (df['migsv'] <= 14), 'pre_dist'] = 222000 + df['migsv']
df.loc[(df['cntry'] == 222) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']

# Haiti
df.loc[(df['cntry'] == 332) & (df['might2'] // 10 <= 10), 'pre_dist'] = 332000 + df['might2'] // 10
df.loc[(df['cntry'] == 332) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']

# Jamaica
df.loc[(df['cntry'] == 388) & (df['migjm'] <= 14), 'pre_dist'] = 388000 + df['migjm']
df.loc[(df['cntry'] == 388) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']

# Mexico
df.loc[(df['cntry'] == 484) & (df['migmx2'] <= 32), 'pre_dist'] = 484000 + df['migmx2']
df.loc[(df['cntry'] == 484) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']

# Nicaragua
df.loc[(df['cntry'] == 558) & (df['migni'] <= 97), 'pre_dist'] = 558000 + df['migni']
df.loc[(df['cntry'] == 558) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']

# Panama
df.loc[(df['cntry'] == 591) & (df['migpa'] // 100 <= 10), 'pre_dist'] = 591000 + df['migpa'] // 100
df.loc[(df['cntry'] == 591) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']


In [None]:
# Continuing with the DataFrame 'df' from the previous step

# El Salvador
df.loc[(df['cntry'] == 222) & (df['migsv'] <= 14), 'pre_dist'] = 222000 + df['migsv']
df.loc[(df['cntry'] == 222) & (df['migrate'] == 0), 'pre_dist'] = df['geolev1']

# Haiti
df.loc[(df['cntry'] == 332) & (df['might2'] // 10 <= 10), 'pre_dist'] = 332000 + (df['might2'] // 10)

# Jamaica
df.loc[(df['cntry'] == 388) & (df['migjm'] <= 14), 'pre_dist'] = 388000 + df['migjm']

# Mexico
df.loc[(df['cntry'] == 484) & (df['migmx2'] <= 32), 'pre_dist'] = 484000 + df['migmx2']

# Nicaragua
df.loc[(df['cntry'] == 558) & (df['migni'] <= 97), 'pre_dist'] = 558000 + df['migni']

# Panama
df.loc[(df['cntry'] == 591) & ((df['migpa'] // 100) <= 10), 'pre_dist'] = 591000 + (df['migpa'] // 100)

# Dropping rows based on specific 'pre_dist' values
drop_conditions = df['pre_dist'].isin([np.nan, 558097, 558090, 332010, 214012, 214020, 214028, 214029, 214030, 214031, 214032, 214050])
df = df[~drop_conditions]

# Dropping specified columns
df.drop(['sample', 'serial', 'ownershipd', 'pernum', 'related', 'resident'], axis=1, inplace=True)


In [None]:
# Assuming 'df' is your DataFrame and it already includes an 'age' column

# Drop rows where individuals are younger than 15 or older than 65
df = df[(df['age'] >= 15) & (df['age'] <= 65)]

# Initialize cohort variables
df['aff_cohort_15_25'] = 0
df['aff_cohort_26_35'] = 0
df['aff_cohort_36_45'] = 0
df['aff_cohort_46_55'] = 0
df['aff_cohort_56_65'] = 0

# Populate cohort variables based on age
df.loc[(df['age'] >= 15) & (df['age'] <= 25), 'aff_cohort_15_25'] = 1
df.loc[(df['age'] >= 26) & (df['age'] <= 35), 'aff_cohort_26_35'] = 1
df.loc[(df['age'] >= 36) & (df['age'] <= 45), 'aff_cohort_36_45'] = 1
df.loc[(df['age'] >= 46) & (df['age'] <= 55), 'aff_cohort_46_55'] = 1
df.loc[(df['age'] >= 56) & (df['age'] <= 65), 'aff_cohort_56_65'] = 1


In [None]:

# Assuming df_main is your main dataset

# Load 5-year historical climate data for wave 0 and wave 1
df_rain_wave0 = pd.read_csv('cruts_5_rain_aerpp_wave0.csv')  # Adjust path as necessary
df_rain_wave1 = pd.read_csv('cruts_5_rain_aerpp_wave1.csv')  # Adjust path as necessary

# Rename the district identifier column to match that of the main dataset
df_rain_wave0.rename(columns={'GEOLEV1': 'pre_dist'}, inplace=True)
df_rain_wave1.rename(columns={'GEOLEV1': 'pre_dist'}, inplace=True)

# Separate the main dataset into two based on wave
df_wave0 = df_main[df_main['wave'] == 0]
df_wave1 = df_main[df_main['wave'] == 1]

# Merge the climate data with the main dataset based on pre_dist
merged_wave0 = df_wave0.merge(df_rain_wave0, on='pre_dist', how='inner')
merged_wave1 = df_wave1.merge(df_rain_wave1, on='pre_dist', how='inner')

# Append the results to get a single dataset
df_final = pd.concat([merged_wave0, merged_wave1])

# Optionally, save the intermediate and final datasets
df_final.to_csv('final_dataset.csv', index=False)


In [None]:
# Load the dataset
df = pd.read_stata('pause_3_t.dta')  # Adjust the file path as necessary

# Drop irrelevant variables
df.drop(columns=['migcr2', 'migdo', 'migsv', 'might2', 'migjm', 'migmx2', 'migni', 'migpa', 'twodigit', '_merge', 'relate', 'MGRATE5', 'mgratep', 'country', 'ownership', 'electric', 'watsup', 'sewage', 'emp*'], inplace=True)

# Save the dataset
df.to_stata('pause_3_t_cleaned.dta', write_index=False)

# Clean micro-control variables
df['sex'].replace({2: 0, 9: np.nan}, inplace=True)  # Recode sex: 2 (female) to 0, and 9 (missing info) to NaN
df['yrschool'].replace({91: 3, 92: 8, 93: 8, 94: 11, 95: 3, 98: np.nan, 99: np.nan}, inplace=True)  # Clean yrschool
df['urban'].replace({1: 0, 2: 1, 9: np.nan}, inplace=True)  # Recode urban: 1 (rural) to 0, 2 (urban) to 1, and 9 (missing info) to NaN

# Drop rows with missing values in sex or yrschool
df.dropna(subset=['sex', 'yrschool'], inplace=True)

# Create a binary variable for primary education
df['primary_edu'] = (df['yrschool'] >= 6).astype(int)

# Save the cleaned dataset
df.to_stata('pause_3_t_cleaned.dta', write_index=False)
# Define province capitals for each country
costa_rica_capitals = [188001001, 188002001, 188003001, 188004001, 188005002, 188006001, 188007002]
dominican_republic_capitals = [101, 201, 301, 401, 501, 601, 701, 801, 901, 1001, 1101, 1201, 1301, 1401, 1501, 1601, 1701, 1801, 1901, 2001, 2101, 2201, 2301, 2401, 2501, 2601, 2701, 2801, 2901, 3001, 3101, 3201]
el_salvador_capitals = [222001001, 222002001, 222003001, 222004004, 222007002, 222008001, 222013006, 222006001, 222010001]
haiti_capitals = [332006005, 332006009, 332007004, 332003001, 332003006, 332009001, 332006001, 332006003, 332007002, 332007001]

# Replace capital values for each country
df.loc[(df['cntry'] == 188) & (df['geo2_cr'].isin(costa_rica_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 188) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 188) & (df['geo2_cr'] == 188001001), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 188) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0

df.loc[(df['cntry'] == 214) & (df['geo2_dox'].isin(dominican_republic_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 214) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 214) & (df['geo2_dox'] == 3201), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 214) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0

df.loc[(df['cntry'] == 222) & (df['geo2_sv'].isin(el_salvador_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 222) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 222) & (df['geo2_sv'] == 222006001), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 222) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0

df.loc[(df['cntry'] == 332) & (df['geo2_ht'].isin(haiti_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 332) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 332) & (df['geo2_ht'] == 332006001), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 332) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0

# Define province capitals for Mexico
mexico_capitals = [484001001, 484002002, 484015067, 484004002, 484007100, 484008018, 484005030, 484006002, 484010005, 484011015, 484012029, 484013048, 484014039, 484015100, 484016051, 484017007, 484019039, 484032053, 484031050, 484020067, 484029032, 484021114, 484028038, 484022014, 484024028, 484025006, 484026030]

# Replace capital values for Mexico
df.loc[(df['cntry'] == 484) & (df['geo2_mx'].isin(mexico_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 484) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 484) & (df['geolev1'] == 484009), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 484) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0

# Define province capitals for Nicaragua
nicaragua_capitals = [5010, 7510, 3045, 6510, 3570, 7015, 1035, 3540, 2005, 5525, 6010, 545, 8040, 8520, 9055]

# Replace capital values for Nicaragua
df.loc[(df['cntry'] == 558) & (df['geo2_nix'].isin(nicaragua_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 558) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 558) & (df['geo2_nix'] == 5525), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 558) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0

# Define province capitals for Panama
panama_capitals = [591004001, 591004002, 591002001, 591003001, 591006001, 591008001, 591008003]

# Replace capital values for Panama
df.loc[(df['cntry'] == 591) & (df['geo2_pa'].isin(panama_capitals)), 'capital'] = 1
df.loc[(df['cntry'] == 591) & (df['capital'].isna()), 'capital'] = 0
df.loc[(df['cntry'] == 591) & (df['geo2_pa'] == 591008001), 'capital_cntry'] = 1
df.loc[(df['cntry'] == 591) & (df['capital_cntry'].isna()), 'capital_cntry'] = 0


In [None]:
# Read macro data from Excel file
macro_df = pd.read_excel('macro_cntry.xlsx', header=0)

# Sort by country and year
macro_df.sort_values(by=['cntry', 'year'], inplace=True)

# Group by country and assign a unique number to each group
macro_df['ke'] = macro_df.groupby('cntry').cumcount() + 1

# Keep only the first row for each country
macro_df = macro_df[macro_df['ke'] == 1]

# Rename columns
macro_df.rename(columns={'GDP_percap': 'GDP_ti', 'Inflation': 'Inf_ti', 
                         'Dev_Assistance': 'Dev_ti', 'gini': 'gini_ti'}, inplace=True)

# Drop the 'ke' column
macro_df.drop(columns=['ke'], inplace=True)

# Define the list of macro variables
macro_vars = ['GDP_ti', 'Inf_ti', 'Dev_ti', 'gini_ti']

# Iterate over macro variables
for j in macro_vars:
    # Calculate quartiles
    macro_df[f'above_{j}'] = pd.qcut(macro_df[j], q=4, labels=False)
    # Create a binary variable
    macro_df[f'{j}b'] = 0
    macro_df.loc[macro_df[f'above_{j}'] <= 3, f'{j}b'] = 0
    macro_df.loc[macro_df[f'above_{j}'] == 4, f'{j}b'] = 1

# Reorder columns
macro_df = macro_df[['cntry'] + [f'{var}b' for var in macro_vars]]

# Save the macro data
macro_df.to_stata('macro_cntry_b.dta', write_index=False)

# Read the main dataset
main_df = pd.read_stata('pause_4_drought_win6.dta')

# Sort by country and year
main_df.sort_values(by=['cntry', 'year'], inplace=True)

# Merge the main dataset with the macro data
merged_df = pd.merge(main_df, macro_df, on='cntry', how='inner', validate='many_to_one')

# Keep only the matched rows
merged_df.dropna(subset=macro_vars, inplace=True)

# Drop unnecessary columns
merged_df.drop(columns=['_merge'] + [f'{var}b' for var in macro_vars] + ['Rem_ti', 'Rem_tib'], inplace=True)

# Save the merged dataset
merged_df.to_stata('pause_5_drought_win6_merged.dta', write_index=False)


In [None]:
# Load the dataset
df = pd.read_stata('pause_5_1216_t.dta')

# Create three-way interaction terms
df['int_wav_coh_1525'] = df['aff_area'] * df['wave'] * df['aff_cohort_15_25']
df['int_wav_coh_2635'] = df['aff_area'] * df['wave'] * df['aff_cohort_26_35']
df['int_wav_coh_3645'] = df['aff_area'] * df['wave'] * df['aff_cohort_36_45']
df['int_wav_coh_4655'] = df['aff_area'] * df['wave'] * df['aff_cohort_46_55']

# Create two-way interaction terms
df['int_wav'] = df['aff_area'] * df['wave']
df['int_coh_1525'] = df['aff_area'] * df['aff_cohort_15_25']
df['int_coh_2635'] = df['aff_area'] * df['aff_cohort_26_35']
df['int_coh_3645'] = df['aff_area'] * df['aff_cohort_36_45']
df['int_coh_4655'] = df['aff_area'] * df['aff_cohort_46_55']
df['wav_coh_1525'] = df['wave'] * df['aff_cohort_15_25']
df['wav_coh_2635'] = df['wave'] * df['aff_cohort_26_35']
df['wav_coh_3645'] = df['wave'] * df['aff_cohort_36_45']
df['wav_coh_4655'] = df['wave'] * df['aff_cohort_46_55']

# Save the dataset
df.to_stata('pause_5_1216_t.dta', write_index=False)

# Load the second dataset
df2 = pd.read_stata('z_withmex.dta')

# Rename the 'dist' column to 'geolev1'
df2.rename(columns={'dist': 'geolev1'}, inplace=True)

# Replace values in 'geolev1' column to match with the first dataset
df2['geolev1'].replace({591007: 591008, 591006: 591007, 591005: 591006, 591004: 591005,
                        591003: 591004, 591002: 591003, 591001: 591002}, inplace=True)

# Sort the dataframe by 'geolev1'
df2.sort_values(by='geolev1', inplace=True)

# Keep only specific rows based on country and year
df2 = df2[(df2['country'] == 188) & (df2['year'] == 2010) |
          (df2['country'] == 214) & (df2['year'] == 2010) |
          (df2['country'] == 222) & (df2['year'] == 2007) |
          (df2['country'] == 332) & (df2['year'] == 2003) |
          (df2['country'] == 388) & (df2['year'] == 2001) |
          (df2['country'] == 484) & (df2['year'] == 2010) |
          (df2['country'] == 558) & (df2['year'] == 2005) |
          (df2['country'] == 591) & (df2['year'] == 2010)]

# Save the modified dataset
df2.to_stata('z_readytomerge.dta', write_index=False)

import pandas as pd

# Read the dataset pause_5_1216_t
pause_df = pd.read_stata('pause_5_1216_t.dta')

# Sort by geolev1
pause_df.sort_values(by='geolev1', inplace=True)

# Drop the _merge column if it exists
pause_df.drop(columns='_merge', errors='ignore', inplace=True)

# Merge with z_readytomerge
z_ready_df = pd.read_stata('z_readytomerge.dta')
merged_df = pd.merge(pause_df, z_ready_df, on='geolev1', how='inner', validate='many_to_one')
merged_df = merged_df[merged_df['_merge'] == 3]

# Define a list of values for iteration
values = ["99", "95", "90"]

# Define a list of variables for interaction terms
interaction_vars = ['wave', 'aff_cohort_15_25', 'aff_cohort_26_35', 'aff_cohort_36_45', 'aff_cohort_46_55']

# Iterate over values and generate interaction terms
for i in values:
    for var in interaction_vars:
        merged_df[f'z{i}_{var}'] = merged_df[f'z_{i}'] * merged_df[var]
        merged_df[f'z{i}2_{var}'] = merged_df[f'z_{i}'] ** 2 * merged_df[var]

# Save the dataset
merged_df.to_stata('aerpp_ready_withmex.dta', write_index=False)


In [None]:
# Load the Excel files for Costa Rica and El Salvador
costa_rica_df = pd.read_excel("GRP predictions_Costa Rica.xlsx", sheet_name="admin1", header=0)
el_salvador_df = pd.read_excel("GRP predictions_El Salvador.xlsx", sheet_name="admin1", header=0)

# Drop unnecessary columns
costa_rica_df.drop(columns=["Official GDP1", "GDP2", "GDP3", "GDP4", "GDP5", "GDP6"], inplace=True)
el_salvador_df.drop(columns=["Official GDP1", "GDP2", "GDP3", "GDP4", "GDP5", "GDP6"], inplace=True)

# Define mapping for Costa Rica
costa_rica_mapping = {
    "San Jose": 188001,
    "Alajuela": 188002,
    "Cartago": 188003,
    "Heredia": 188004,
    "Guanacaste": 188005,
    "Puntarenas": 188006,
    "Limon": 188007
}

# Define mapping for El Salvador
el_salvador_mapping = {
    "Ahuachapan": 222001,
    "Santa Ana": 222002,
    "Sonsonate": 222003,
    "Chalatenango": 222004,
    "La Libertad": 222005,
    "San Salvador": 222006,
    "Cuscatlan": 222007,
    "La Paz": 222008,
    "Cabanas": 222009,
    "San Vicente": 222010,
    "Usulutan": 222011,
    "San Miguel": 222012,
    "Morazan": 222013,
    "La Union": 222014
}

# Apply the mappings to create the 'dist' column
costa_rica_df['dist'] = costa_rica_df['sub_level_name'].map(costa_rica_mapping)
el_salvador_df['dist'] = el_salvador_df['sub_level_name'].map(el_salvador_mapping)

# Save the modified DataFrames
costa_rica_df.to_excel("gdp_temp_cos.xlsx", index=False)
el_salvador_df.to_excel("gdp_temp_sv.xlsx", index=False)


In [None]:
import pandas as pd

# Load the Excel file for Nicaragua
nicaragua_df = pd.read_excel("GRP predictions_Nicaragua.xlsx", sheet_name="admin1", header=0)

# Drop unnecessary columns
nicaragua_df.drop(columns=["Official GDP1", "GDP2", "GDP3", "GDP4", "GDP5", "GDP6"], inplace=True)

# Define mapping for Nicaragua
nicaragua_mapping = {
    "Nueva Segovia": 558005,
    "Jinotega": 558010,
    "Madriz": 558020,
    "Chinandega": 558030,
    "Leon": 558035,
    "Esteli": 558035,
    "Matagalpa": 558040,
    "Boaco": 558050,
    "Managua": 558055,
    "Masaya": 558060,
    "Chontales": 558065,
    "Granada": 558070,
    "Carazo": 558075,
    "Rivas": 558080,
    "Rio San Juan": 558085,
    "Atlantico Norte": 558093,
    "Atlantico Sur": 558093
}

# Apply the mapping to create the 'dist' column
nicaragua_df['dist'] = nicaragua_df['sub_level_name'].map(nicaragua_mapping)

# Drop rows where 'dist' is NaN
nicaragua_df.dropna(subset=['dist'], inplace=True)

# Save the modified DataFrame
nicaragua_df.to_excel("gdp_temp_nic.xlsx", index=False)


In [None]:
# Load the Excel file for Panama
panama_df = pd.read_excel("GRP predictions_Panama.xlsx", sheet_name="admin1", header=0)

# Drop unnecessary columns
panama_df.drop(columns=["Official GDP1", "GDP2", "GDP3", "GDP4", "GDP5", "GDP6"], inplace=True)

# Define mapping for Panama
panama_mapping = {
    "Cocle": 591001,
    "Colon": 591002,
    "Bocas de Toro": 591003,
    "Chiriqui": 591003,
    "Ngabe Bugle": 591003,
    "Veraguas": 591003,
    "Embera": 591004,
    "Darien": 591004,
    "Herrera": 591005,
    "Los Santos": 591006,
    "Panama": 591007
}

# Apply the mapping to create the 'dist' column
panama_df['dist'] = panama_df['sub_level_name'].map(panama_mapping)

# Drop rows where 'dist' is NaN
panama_df.dropna(subset=['dist'], inplace=True)

# Save the modified DataFrame
panama_df.to_excel("gdp_temp_pan.xlsx", index=False)


In [None]:
# Load GDP data for Costa Rica
gdp_temp_cos = pd.read_stata("gdp_temp_cos.dta")

# Load GDP data for El Salvador
gdp_temp_sv = pd.read_stata("gdp_temp_sv.dta")

# Load GDP data for Nicaragua
gdp_temp_nic = pd.read_stata("gdp_temp_nic.dta")

# Load GDP data for Panama
gdp_temp_pan = pd.read_stata("gdp_temp_pan.dta")

# Append the GDP dataframes
gdp_temp = pd.concat([gdp_temp_cos, gdp_temp_sv, gdp_temp_nic, gdp_temp_pan], ignore_index=True)

# Sort the combined dataframe
gdp_temp.sort_values(by=['dist', 'year'], inplace=True)

# Save the combined dataframe
gdp_temp.to_stata("gdp_temp.dta", write_index=False)

# Load the weather data
merra_final_withmex = pd.read_stata("merra_final_withmex.dta")

# Filter data for specific countries and years
merra_final_withmex = merra_final_withmex[merra_final_withmex['country'].isin([188, 222, 558, 591])]
merra_final_withmex = merra_final_withmex[merra_final_withmex['year'].isin([2000, 2005, 2010])]

# Calculate mean temperature by district and year
merra_final_withmex['t_yearmean'] = merra_final_withmex.groupby(['dist', 'year'])['daily_temp'].transform('mean')

# Resetting index to make it sequential
merra_final_withmex.reset_index(drop=True, inplace=True)

# Merge weather data with GDP data
merged_data = pd.merge_asof(merra_final_withmex.sort_values('year'), gdp_temp.sort_values('year'), by='dist', on='year')

# Keep only matched rows
merged_data.dropna(subset=['GDP1predicted'], inplace=True)

# Resetting index to make it sequential
merged_data.reset_index(drop=True, inplace=True)

# Adjusting district codes
merged_data['dist'].replace({
    5910010: 591001,
    591008: 591007,
    591007: 591005,
    591006: 591004,
    591004: 591003,
    591003: 591002,
    591002: 591001
}, inplace=True)

# Merge with additional data from 'z_withmex' and 'cruts_5_rain_aerpp_gdp' datasets
# Code for merging additional datasets is not provided and would depend on their structure

# Perform regression analysis
# The regression analysis part involves multiple steps and is not directly translatable to Python code without more context,
# such as the structure of the dataset and the specific regression models being used.

# Save the final dataset
merged_data.to_stata("final_dataset.dta", write_index=False)


In [None]:
# Load GDP data from WDI
gdp_wdi = pd.read_excel("gdp_wdi.xls", sheet_name="Sheet1")

# Convert GDP columns to numeric
for i in range(1, 7):
    gdp_wdi[f'GDP{i}'] = pd.to_numeric(gdp_wdi[f'GDP{i}'], errors='coerce')

# Sort by country and year
gdp_wdi.sort_values(by=['country', 'year'], inplace=True)

# Save the processed GDP data
gdp_wdi.to_stata("gdp_long_dependent.dta", write_index=False)

# Load data from z_withmex
z_withmex = pd.read_stata("z_withmex.dta")

# Calculate mean values of z scores by country and year
z_country_mean = z_withmex.groupby(['country', 'year']).agg({
    'z_99': 'mean',
    'z_95': 'mean',
    'z_90': 'mean'
}).reset_index()

# Save the aggregated z scores by country and year
z_country_mean.to_stata("z_withmex_cntry.dta", write_index=False)

# Load data from cruts_5_rain_aerpp_gdp_long
cruts_5_rain_aerpp_gdp_long = pd.read_stata("cruts_5_rain_aerpp_gdp_long.dta")

# Calculate mean values of 5-year rainfall by country and year
rain_5yr_country_mean = cruts_5_rain_aerpp_gdp_long.groupby(['country', 'year'])['rain_5yr'].mean().reset_index()

# Save the aggregated 5-year rainfall by country and year
rain_5yr_country_mean.to_stata("cruts_5_rain_aerpp_gdp_cntry.dta", write_index=False)


In [None]:
# Load GDP data
gdp_long_dependent = pd.read_stata("gdp_long_dependent.dta")

# Load z scores data aggregated by country and year
z_withmex_cntry = pd.read_stata("z_withmex_cntry.dta")

# Merge GDP data with z scores data
gdp_merged = pd.merge(gdp_long_dependent, z_withmex_cntry, on=['country', 'year'], how='inner')

# Load 5-year rainfall data aggregated by country and year
cruts_5_rain_aerpp_gdp_cntry = pd.read_stata("cruts_5_rain_aerpp_gdp_cntry.dta")

# Merge GDP data with 5-year rainfall data
gdp_merged = pd.merge(gdp_merged, cruts_5_rain_aerpp_gdp_cntry, on=['country', 'year'], how='inner')

# Convert GDP columns to billions
gdp_merged['GDP1'] /= 10**9
gdp_merged['GDP2'] /= 10**9
gdp_merged['GDP3'] /= 10**9
gdp_merged['GDP6'] /= 10**9

# Replace negative or zero values in z scores with 0
gdp_merged['z_99_cntry'].clip(lower=0, inplace=True)
gdp_merged['z_95_cntry'].clip(lower=0, inplace=True)
gdp_merged['z_90_cntry'].clip(lower=0, inplace=True)

# Calculate squared values of 5-year rainfall
gdp_merged['rain_5yr_cntry2'] = gdp_merged['rain_5yr_cntry'] ** 2

# Generate dummy variables for countries
gdp_merged = pd.get_dummies(gdp_merged, columns=['country'], prefix='cntry')

# Generate interaction terms of country and year
for i in range(1, 5):
    gdp_merged[f'cntry_year{i}'] = gdp_merged[f'cntry_{i}'] * gdp_merged['year']

# Calculate log GDP
gdp_merged['loggdp6'] = np.log(gdp_merged['GDP6'])

# Perform regression
from linearmodels.panel import PanelOLS
import statsmodels.api as sm

varlist = ['loggdp6']
zlist = ['z_90_cntry']

for var in varlist:
    for z in zlist:
        exog_vars = [z, 'rain_5yr_cntry', 'rain_5yr_cntry2', 'country', 'year']
        mod = PanelOLS.from_formula(f"{var} ~ {z} + rain_5yr_cntry + rain_5yr_cntry2 + country + year", data=gdp_merged)
        res = mod.fit(cov_type='clustered', cluster_entity=True)
        res.summary().to_excel("gdp_WDI_final_log.xls")


In [None]:

# Load the dataset
aerpp_ready_withmex = pd.read_stata("aerpp_ready_withmex.dta")

# Filter out data for country 484
aerpp_ready_nomex = aerpp_ready_withmex[aerpp_ready_withmex['cntry'] != 484]

# Define the variables
variables = ['z_99', 'z_95', 'z_90', 'rolling_dist_99', 'rolling_dist_95', 'rolling_dist_90']

# Create histograms and density plots
for var in variables:
    plt.figure(figsize=(10, 5))
    plt.hist(aerpp_ready_withmex[var], bins=20, alpha=0.5, label='With Mexico', density=True)
    plt.hist(aerpp_ready_nomex[var], bins=20, alpha=0.5, label='Without Mexico', density=True)
    plt.title(f'Histogram and Density Plot for {var}')
    plt.xlabel(var)
    plt.ylabel('Density')
    plt.legend()
    plt.savefig(f'hist_density_{var}.png')
    plt.show()


In [None]:
# Load the dataset
data = pd.read_stata("your_dataset.dta")

# Define the variables
variables = ['z_99', 'z_95', 'z_90', 'rolling_dist_99', 'rolling_dist_95', 'rolling_dist_90']

# Polynomial regression function
def poly_regression(x, y, degree):
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(x.reshape(-1, 1))
    poly.fit(X_poly, y)
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y)
    return lin_reg.predict(X_poly)

# Create polynomial regression plots
for var in variables:
    plt.figure(figsize=(10, 5))
    plt.scatter(data[var], data['migrate'], color='blue', label='Actual Data')
    plt.plot(np.sort(data[var]), poly_regression(data[var], data['migrate'], 3)[np.argsort(data[var])], color='red', label='Polynomial Regression (Degree 3)')
    plt.title(f'Intensity vs. Migrate for {var}')
    plt.xlabel(var)
    plt.ylabel('Migrate')
    plt.legend()
    plt.savefig(f'lpoly_{var}.png')
    plt.show()


In [None]:

# Load the dataset
data = pd.read_stata("aerpp_ready_withmex.dta")

# Filter the data to remove rows where z_90 <= 0
data_filtered = data[data['z_90'] > 0]

# Set the style
sns.set(style="whitegrid")

# Create the density plot for z_90
plt.figure(figsize=(10, 6))
sns.kdeplot(data_filtered['z_90'], shade=True, color="b", label="z_90 Density Plot")
plt.axvline(x=0.80, linestyle='--', color='r', label='PCI 80%')
plt.axvline(x=1.64, linestyle='--', color='g', label='PCI 90%')
plt.xlabel("z_90")
plt.ylabel("Density")
plt.title("Density Plot for Standardized Intensity (z_90)")
plt.legend()
plt.savefig("density_plot_z90.png")
plt.show()


In [None]:

# Load the data into pandas DataFrame
data = pd.read_csv("your_data.csv")

# Define the core variables and micro control variables
core_vars = ["int99_0_wav_coh_1525", "int99_0_wav_coh_2635", "int99_0_wav_coh_3645", "int99_0_wav_coh_4655",
             "int99_0_wav", "int99_0_coh_1525", "int99_0_coh_2635", "int99_0_coh_3645", "int99_0_coh_4655",
             "wav_coh_1525", "wav_coh_2635", "wav_coh_3645", "wav_coh_4655", "int99_0", "wave", "aff_cohort"]

micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

# Iterate over each combination of core variables and micro control variables
for core_var in core_vars:
    for micro_control_var in micro_control_vars:
        # Filter the data if necessary (exclude country 388)
        filtered_data = data[data['cntry'] != 388]
        
        # Define the independent variables
        X = filtered_data[[core_var] + micro_control_vars]
        
        # Add constant to the independent variables
        X = sm.add_constant(X)
        
        # Define the dependent variable
        y = filtered_data['migrate']
        
        # Fit the OLS regression model
        model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})
        
        # Print the regression summary
        print(model.summary())
        
        # Save the regression results
        with open("regression_results.txt", "a") as f:
            f.write(str(model.summary()))


In [None]:


# Load your data into a pandas DataFrame
data = pd.read_csv("your_data.csv")

# Define the core variables and micro control variables
core_vars = ["int99_0_wav_coh_1525", "int99_0_wav_coh_2635", "int99_0_wav_coh_3645", "int99_0_wav_coh_4655",
             "int99_0_wav", "int99_0_coh_1525", "int99_0_coh_2635", "int99_0_coh_3645", "int99_0_coh_4655",
             "wav_coh_1525", "wav_coh_2635", "wav_coh_3645", "wav_coh_4655", "int99_0", "wave", "aff_cohort"]

micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

# Iterate over each combination of core variables and micro control variables
for core_var in core_vars:
    for micro_control_var in micro_control_vars:
        # Filter the data if necessary (exclude country 388 and select only males)
        filtered_data = data[(data['cntry'] != 388) & (data['sex'] == 1)]
        
        # Define the independent variables
        X = filtered_data[[core_var] + micro_control_vars]
        
        # Add a constant to the independent variables
        X = sm.add_constant(X)
        
        # Define the dependent variable
        y = filtered_data['migrate']
        
        # Fit the OLS regression model
        model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})
        
        # Print the regression summary
        print(model.summary())
        
        # Save the regression results
        with open("1st_set_male.xls", "a") as f:
            f.write(str(model.summary()))


In [None]:

# Load your data into a pandas DataFrame
data = pd.read_csv("your_data.csv")

# Define the core variables and micro control variables
core_vars = ["int99_0_wav_coh_1525", "int99_0_wav_coh_2635", "int99_0_wav_coh_3645", "int99_0_wav_coh_4655",
             "int99_0_wav", "int99_0_coh_1525", "int99_0_coh_2635", "int99_0_coh_3645", "int99_0_coh_4655",
             "wav_coh_1525", "wav_coh_2635", "wav_coh_3645", "wav_coh_4655", "int99_0", "wave", "aff_cohort"]

micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

# Iterate over each combination of core variables and micro control variables
for core_var in core_vars:
    for micro_control_var in micro_control_vars:
        # Filter the data if necessary (exclude country 388 and select only females)
        filtered_data = data[(data['cntry'] != 388) & (data['sex'] == 0)]
        
        # Define the independent variables
        X = filtered_data[[core_var] + micro_control_vars]
        
        # Add a constant to the independent variables
        X = sm.add_constant(X)
        
        # Define the dependent variable
        y = filtered_data['migrate']
        
        # Fit the OLS regression model
        model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})
        
        # Print the regression summary
        print(model.summary())
        
        # Save the regression results
        with open("1st_set_female.xls", "a") as f:
            f.write(str(model.summary()))


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_csv("your_data.csv")

# Iterate over each combination of core variables and micro control variables
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_vars = ["int{}_{}_wav_coh_1525".format(i, j),
                     "int{}_{}_wav_coh_2635".format(i, j),
                     "int{}_{}_wav_coh_3645".format(i, j),
                     "int{}_{}_wav_coh_4655".format(i, j),
                     "int{}_{}_wav".format(i, j),
                     "int{}_{}_coh_1525".format(i, j),
                     "int{}_{}_coh_2635".format(i, j),
                     "int{}_{}_coh_3645".format(i, j),
                     "int{}_{}_coh_4655".format(i, j),
                     "wav_coh_1525",
                     "wav_coh_2635",
                     "wav_coh_3645",
                     "wav_coh_4655",
                     "int{}_{}".format(i, j),
                     "wave",
                     "aff_cohort_*"]
        
        micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for core_var in core_vars:
            for micro_control_var in micro_control_vars:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['primary_edu'] == 1)]

                # Define the independent variables
                X = filtered_data[[core_var] + [micro_control_var] + ['pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_pooled_skilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:


# Load your data into a pandas DataFrame
data = pd.read_csv("your_data.csv")

# Iterate over each combination of core variables and micro control variables
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_vars = ["int{}_{}_wav_coh_1525".format(i, j),
                     "int{}_{}_wav_coh_2635".format(i, j),
                     "int{}_{}_wav_coh_3645".format(i, j),
                     "int{}_{}_wav_coh_4655".format(i, j),
                     "int{}_{}_wav".format(i, j),
                     "int{}_{}_coh_1525".format(i, j),
                     "int{}_{}_coh_2635".format(i, j),
                     "int{}_{}_coh_3645".format(i, j),
                     "int{}_{}_coh_4655".format(i, j),
                     "wav_coh_1525",
                     "wav_coh_2635",
                     "wav_coh_3645",
                     "wav_coh_4655",
                     "int{}_{}".format(i, j),
                     "wave",
                     "aff_cohort_*"]
        
        micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for core_var in core_vars:
            for micro_control_var in micro_control_vars:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['primary_edu'] == 1)]

                # Define the independent variables
                X = filtered_data[[core_var] + [micro_control_var] + ['pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_pooled_skilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_csv("your_data.csv")

# Iterate over each combination of core variables and micro control variables
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_vars = ["int{}_{}_wav_coh_1525".format(i, j),
                     "int{}_{}_wav_coh_2635".format(i, j),
                     "int{}_{}_wav_coh_3645".format(i, j),
                     "int{}_{}_wav_coh_4655".format(i, j),
                     "int{}_{}_wav".format(i, j),
                     "int{}_{}_coh_1525".format(i, j),
                     "int{}_{}_coh_2635".format(i, j),
                     "int{}_{}_coh_3645".format(i, j),
                     "int{}_{}_coh_4655".format(i, j),
                     "wav_coh_1525",
                     "wav_coh_2635",
                     "wav_coh_3645",
                     "wav_coh_4655",
                     "int{}_{}".format(i, j),
                     "wave",
                     "aff_cohort_*"]
        
        micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for core_var in core_vars:
            for micro_control_var in micro_control_vars:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['sex'] == 1) & (data['primary_edu'] == 1)]

                # Define the independent variables
                X = filtered_data[[core_var] + [micro_control_var] + ['pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_male_skilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_csv("your_data.csv")

# Iterate over each combination of core variables and micro control variables
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_vars = ["int{}_{}_wav_coh_1525".format(i, j),
                     "int{}_{}_wav_coh_2635".format(i, j),
                     "int{}_{}_wav_coh_3645".format(i, j),
                     "int{}_{}_wav_coh_4655".format(i, j),
                     "int{}_{}_wav".format(i, j),
                     "int{}_{}_coh_1525".format(i, j),
                     "int{}_{}_coh_2635".format(i, j),
                     "int{}_{}_coh_3645".format(i, j),
                     "int{}_{}_coh_4655".format(i, j),
                     "wav_coh_1525",
                     "wav_coh_2635",
                     "wav_coh_3645",
                     "wav_coh_4655",
                     "int{}_{}".format(i, j),
                     "wave",
                     "aff_cohort_*"]
        
        micro_control_vars = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for core_var in core_vars:
            for micro_control_var in micro_control_vars:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['sex'] == 0) & (data['primary_edu'] == 1)]

                # Define the independent variables
                X = filtered_data[[core_var] + [micro_control_var] + ['pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_female_skilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data.dta")

# Iterate over each combination of i and j
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_var = ["int{}_{}_wav_coh_1525".format(i, j),
                    "int{}_{}_wav_coh_2635".format(i, j),
                    "int{}_{}_wav_coh_3645".format(i, j),
                    "int{}_{}_wav_coh_4655".format(i, j),
                    "int{}_{}_wav".format(i, j),
                    "int{}_{}_coh_1525".format(i, j),
                    "int{}_{}_coh_2635".format(i, j),
                    "int{}_{}_coh_3645".format(i, j),
                    "int{}_{}_coh_4655".format(i, j),
                    "wav_coh_1525",
                    "wav_coh_2635",
                    "wav_coh_3645",
                    "wav_coh_4655",
                    "int{}_{}".format(i, j),
                    "wave",
                    "aff_cohort_*"]
        
        micro_control_var = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for var in core_var:
            for micro_var in micro_control_var:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['primary_edu'] == 0)]

                # Define the independent variables
                X = filtered_data[[var, micro_var, 'pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_pooled_unskilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data.dta")

# Iterate over each combination of i and j
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_var = ["int{}_{}_wav_coh_1525".format(i, j),
                    "int{}_{}_wav_coh_2635".format(i, j),
                    "int{}_{}_wav_coh_3645".format(i, j),
                    "int{}_{}_wav_coh_4655".format(i, j),
                    "int{}_{}_wav".format(i, j),
                    "int{}_{}_coh_1525".format(i, j),
                    "int{}_{}_coh_2635".format(i, j),
                    "int{}_{}_coh_3645".format(i, j),
                    "int{}_{}_coh_4655".format(i, j),
                    "wav_coh_1525",
                    "wav_coh_2635",
                    "wav_coh_3645",
                    "wav_coh_4655",
                    "int{}_{}".format(i, j),
                    "wave",
                    "aff_cohort_*"]
        
        micro_control_var = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for var in core_var:
            for micro_var in micro_control_var:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['sex'] == 1) & (data['primary_edu'] == 0)]

                # Define the independent variables
                X = filtered_data[[var, micro_var, 'pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_male_unskilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data.dta")

# Iterate over each combination of i and j
for i in ["99", "95", "90"]:
    for j in ["0"]:
        core_var = ["int{}_{}_wav_coh_1525".format(i, j),
                    "int{}_{}_wav_coh_2635".format(i, j),
                    "int{}_{}_wav_coh_3645".format(i, j),
                    "int{}_{}_wav_coh_4655".format(i, j),
                    "int{}_{}_wav".format(i, j),
                    "int{}_{}_coh_1525".format(i, j),
                    "int{}_{}_coh_2635".format(i, j),
                    "int{}_{}_coh_3645".format(i, j),
                    "int{}_{}_coh_4655".format(i, j),
                    "wav_coh_1525",
                    "wav_coh_2635",
                    "wav_coh_3645",
                    "wav_coh_4655",
                    "int{}_{}".format(i, j),
                    "wave",
                    "aff_cohort_*"]
        
        micro_control_var = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

        for var in core_var:
            for micro_var in micro_control_var:
                # Filter the data
                filtered_data = data[(data['cntry'] != 388) & (data['sex'] == 0) & (data['primary_edu'] == 0)]

                # Define the independent variables
                X = filtered_data[[var, micro_var, 'pre_dist']]
                X = sm.add_constant(X)

                # Define the dependent variable
                y = filtered_data['migrate']

                # Fit the OLS regression model
                model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': filtered_data['c2']})

                # Print the regression summary
                print(model.summary())

                # Save the regression results
                with open("1st_set_female_unskilled.xls", "a") as f:
                    f.write(str(model.summary()))


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("aerpp_ready_withmex.dta")

# Generate new variables int99_0, int95_0, int90_0, int99_1, int95_1, int90_1
data['int99_0'] = data['z_99']
data.loc[data['z_99'] <= 0, 'int99_0'] = 0
data['int95_0'] = data['z_95']
data.loc[data['z_95'] <= 0, 'int95_0'] = 0
data['int90_0'] = data['z_90']
data.loc[data['z_90'] <= 0, 'int90_0'] = 0
data['int99_1'] = data['z_99']
data.loc[data['z_99'] <= 1, 'int99_1'] = 0
data['int95_1'] = data['z_95']
data.loc[data['z_95'] <= 1, 'int95_1'] = 0
data['int90_1'] = data['z_90']
data.loc[data['z_90'] <= 1, 'int90_1'] = 0

# Iterate over each combination of i and j
for i in ["99", "95", "90"]:
    for j in ["0", "1"]:
        # Generate three-way variables
        data[f'int{i}_{j}_wav_coh_1525'] = data[f'int{i}_{j}'] * data['wave'] * data['aff_cohort_15_25']
        data[f'int{i}_{j}_wav_coh_2635'] = data[f'int{i}_{j}'] * data['wave'] * data['aff_cohort_26_35']
        data[f'int{i}_{j}_wav_coh_3645'] = data[f'int{i}_{j}'] * data['wave'] * data['aff_cohort_36_45']
        data[f'int{i}_{j}_wav_coh_4655'] = data[f'int{i}_{j}'] * data['wave'] * data['aff_cohort_46_55']
        
        # Generate two-way variables
        data[f'int{i}_{j}_wav'] = data[f'int{i}_{j}'] * data['wave']
        data[f'int{i}_{j}_coh_1525'] = data[f'int{i}_{j}'] * data['aff_cohort_15_25']
        data[f'int{i}_{j}_coh_2635'] = data[f'int{i}_{j}'] * data['aff_cohort_26_35']
        data[f'int{i}_{j}_coh_3645'] = data[f'int{i}_{j}'] * data['aff_cohort_36_45']
        data[f'int{i}_{j}_coh_4655'] = data[f'int{i}_{j}'] * data['aff_cohort_46_55']

# Generate to_capital, to_noncapital, to_capital_cntry, and to_noncapital_cntry variables
data['to_capital'] = 0
data.loc[(data['migrate'] == 1) & (data['capital'] == 1), 'to_capital'] = 1

data['to_noncapital'] = 0
data.loc[(data['migrate'] == 1) & (data['capital'] == 0), 'to_noncapital'] = 1

data['to_capital_cntry'] = 0
data.loc[(data['migrate'] == 1) & (data['capital_cntry'] == 1), 'to_capital_cntry'] = 1

data['to_noncapital_cntry'] = 0
data.loc[(data['migrate'] == 1) & (data['capital_cntry'] == 0), 'to_noncapital_cntry'] = 1

# Save the modified DataFrame to a new Stata file
data.to_stata("modified_data.dta")


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data_file.dta")

# Create an empty DataFrame to store the results
results = pd.DataFrame(index=["Mean Migration Rate", "Mean to Capital Country", "Mean to Capital", "Mean Migration Rate (Female)", 
                               "Mean to Capital Country (Female)", "Mean to Capital (Female)"],
                       columns=["Female", "Male"])

# Calculate mean values for different groups
female_data = data[data['sex'] == 0]  # Filter female data
male_data = data[data['sex'] == 1]    # Filter male data

# Mean migration rates and temperature values for females
results.loc["Mean Migration Rate", "Female"] = female_data['int90_0'].mean()
results.loc["Mean to Capital Country", "Female"] = female_data.loc[female_data['to_capital_cntry'].notnull(), 'int90_0'].mean()
results.loc["Mean to Capital", "Female"] = female_data.loc[female_data['to_capital'].notnull(), 'int90_0'].mean()

# Mean migration rates and temperature values for males
results.loc["Mean Migration Rate", "Male"] = male_data['int90_0'].mean()
results.loc["Mean to Capital Country", "Male"] = male_data.loc[male_data['to_capital_cntry'].notnull(), 'int90_0'].mean()
results.loc["Mean to Capital", "Male"] = male_data.loc[male_data['to_capital'].notnull(), 'int90_0'].mean()

# Print the results
print(results)


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data_file.dta")

# Create an empty DataFrame to store the results
results = pd.DataFrame(index=["Mean Migration Rate (Primary Education)", "Mean to Capital Country (Primary Education)", "Mean to Capital (Primary Education)",
                               "Mean Migration Rate (Non-Primary Education)", "Mean to Capital Country (Non-Primary Education)", "Mean to Capital (Non-Primary Education)"],
                       columns=["Female", "Male"])

# Calculate mean values for different groups
female_data = data[data['sex'] == 0]  # Filter female data
male_data = data[data['sex'] == 1]    # Filter male data

# Mean migration rates and temperature values for females with primary education
results.loc["Mean Migration Rate (Primary Education)", "Female"] = female_data.loc[(female_data['primary_edu'] == 0) & (female_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital Country (Primary Education)", "Female"] = female_data.loc[(female_data['primary_edu'] == 0) & (female_data['to_capital_cntry'].notnull()) & (female_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital (Primary Education)", "Female"] = female_data.loc[(female_data['primary_edu'] == 0) & (female_data['to_capital'].notnull()) & (female_data['cntry'] != 388), 'int90_0'].mean()

# Mean migration rates and temperature values for males with primary education
results.loc["Mean Migration Rate (Primary Education)", "Male"] = male_data.loc[(male_data['primary_edu'] == 0) & (male_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital Country (Primary Education)", "Male"] = male_data.loc[(male_data['primary_edu'] == 0) & (male_data['to_capital_cntry'].notnull()) & (male_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital (Primary Education)", "Male"] = male_data.loc[(male_data['primary_edu'] == 0) & (male_data['to_capital'].notnull()) & (male_data['cntry'] != 388), 'int90_0'].mean()

# Mean migration rates and temperature values for females with non-primary education
results.loc["Mean Migration Rate (Non-Primary Education)", "Female"] = female_data.loc[(female_data['primary_edu'] != 0) & (female_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital Country (Non-Primary Education)", "Female"] = female_data.loc[(female_data['primary_edu'] != 0) & (female_data['to_capital_cntry'].notnull()) & (female_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital (Non-Primary Education)", "Female"] = female_data.loc[(female_data['primary_edu'] != 0) & (female_data['to_capital'].notnull()) & (female_data['cntry'] != 388), 'int90_0'].mean()

# Mean migration rates and temperature values for males with non-primary education
results.loc["Mean Migration Rate (Non-Primary Education)", "Male"] = male_data.loc[(male_data['primary_edu'] != 0) & (male_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital Country (Non-Primary Education)", "Male"] = male_data.loc[(male_data['primary_edu'] != 0) & (male_data['to_capital_cntry'].notnull()) & (male_data['cntry'] != 388), 'int90_0'].mean()
results.loc["Mean to Capital (Non-Primary Education)", "Male"] = male_data.loc[(male_data['primary_edu'] != 0) & (male_data['to_capital'].notnull()) & (male_data['cntry'] != 388), 'int90_0'].mean()

# Print the results
print(results)


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data_file.dta")

# Create an empty DataFrame to store the results
results_primary_edu = pd.DataFrame(index=["Mean Migration Rate (Primary Education)", "Mean to Capital Country (Primary Education)", "Mean to Capital (Primary Education)"],
                                   columns=["Female", "Male"])

# Calculate mean values for different groups with primary education
female_data_primary_edu = data[(data['sex'] == 0) & (data['primary_edu'] == 1) & (data['cntry'] != 388)]  # Filter female data with primary education
male_data_primary_edu = data[(data['sex'] == 1) & (data['primary_edu'] == 1) & (data['cntry'] != 388)]    # Filter male data with primary education

# Mean migration rates and temperature values for females with primary education
results_primary_edu.loc["Mean Migration Rate (Primary Education)", "Female"] = female_data_primary_edu.loc[~female_data_primary_edu['int90_0'].isnull(), 'int90_0'].mean()
results_primary_edu.loc["Mean to Capital Country (Primary Education)", "Female"] = female_data_primary_edu.loc[~female_data_primary_edu['to_capital_cntry'].isnull(), 'int90_0'].mean()
results_primary_edu.loc["Mean to Capital (Primary Education)", "Female"] = female_data_primary_edu.loc[~female_data_primary_edu['to_capital'].isnull(), 'int90_0'].mean()

# Mean migration rates and temperature values for males with primary education
results_primary_edu.loc["Mean Migration Rate (Primary Education)", "Male"] = male_data_primary_edu.loc[~male_data_primary_edu['int90_0'].isnull(), 'int90_0'].mean()
results_primary_edu.loc["Mean to Capital Country (Primary Education)", "Male"] = male_data_primary_edu.loc[~male_data_primary_edu['to_capital_cntry'].isnull(), 'int90_0'].mean()
results_primary_edu.loc["Mean to Capital (Primary Education)", "Male"] = male_data_primary_edu.loc[~male_data_primary_edu['to_capital'].isnull(), 'int90_0'].mean()

# Print the results
print(results_primary_edu)


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data_file.dta")

# Filter the data for males only and excluding country 388
male_data = data[(data['sex'] == 1) & (data['cntry'] != 388)]

# Create an empty DataFrame to store the results
results_male = pd.DataFrame(columns=["Mean", "Standard Deviation"])

# Calculate summary statistics for each variable
variables = ["urban", "migrate", "wave", "aff_cohort_15_25", "aff_cohort_26_35", "aff_cohort_36_45",
             "aff_cohort_46_55", "aff_cohort_56_65", "cruts_5avg", "primary_edu", "to_capital", "to_capital_cntry",
             "wav_coh_1525", "wav_coh_2635", "wav_coh_3645", "wav_coh_4655", "int90_0", "int90_0_wav_coh_1525",
             "int90_0_wav_coh_2635", "int90_0_wav_coh_3645", "int90_0_wav_coh_4655", "int90_0_wav",
             "int90_0_coh_1525", "int90_0_coh_2635", "int90_0_coh_3645", "int90_0_coh_4655"]

for variable in variables:
    mean = male_data[variable].mean()
    std_dev = male_data[variable].std()
    results_male.loc[variable] = [mean, std_dev]

# Print the results
print(results_male)


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("your_data_file.dta")

# Filter the data for females only and excluding country 388
female_data = data[(data['sex'] == 0) & (data['cntry'] != 388)]

# Create an empty DataFrame to store the results
results_female = pd.DataFrame(columns=["Mean", "Standard Deviation"])

# Calculate summary statistics for each variable
variables = ["urban", "migrate", "wave", "aff_cohort_15_25", "aff_cohort_26_35", "aff_cohort_36_45",
             "aff_cohort_46_55", "aff_cohort_56_65", "cruts_5avg", "primary_edu", "to_capital", "to_capital_cntry",
             "wav_coh_1525", "wav_coh_2635", "wav_coh_3645", "wav_coh_4655", "int90_0", "int90_0_wav_coh_1525",
             "int90_0_wav_coh_2635", "int90_0_wav_coh_3645", "int90_0_wav_coh_4655", "int90_0_wav",
             "int90_0_coh_1525", "int90_0_coh_2635", "int90_0_coh_3645", "int90_0_coh_4655"]

for variable in variables:
    mean = female_data[variable].mean()
    std_dev = female_data[variable].std()
    results_female.loc[variable] = [mean, std_dev]

# Print the results
print(results_female)


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a pandas DataFrame
data = pd.read_stata("aerpp_ready_withmex.dta")

# Define the core and micro control variables
corevar = ["wav_coh_1525", "wav_coh_2635", "wav_coh_3645", "wav_coh_4655", "wave"]
microcvar = ["sex", "primary_edu", "cruts_5avg", "cruts_5avg_sq"]

# Filter data for males
male_data = data[(data['sex'] == 1) & (data['cntry'] != 388)]

# Run regression for males
X_male = sm.add_constant(male_data[corevar + microcvar])
y_male = male_data['migrate']
model_male = sm.WLS(y_male, X_male, weights=male_data['perwt']).fit(cov_type='cluster', cov_kwds={'groups': male_data['c2']})

# Print regression results for males
print(model_male.summary())

# Save regression results for males to Excel
model_male_results = model_male.get_robustcov_results()
model_male_results.summary().tables[1].to_excel("doublediff_final.xls", startrow=0, startcol=0)

# Filter data for females
female_data = data[(data['sex'] == 0) & (data['cntry'] != 388)]

# Run regression for females
X_female = sm.add_constant(female_data[corevar + microcvar])
y_female = female_data['migrate']
model_female = sm.WLS(y_female, X_female, weights=female_data['perwt']).fit(cov_type='cluster', cov_kwds={'groups': female_data['c2']})

# Print regression results for females
print(model_female.summary())

# Save regression results for females to Excel
model_female_results = model_female.get_robustcov_results()
model_female_results.summary().tables[1].to_excel("doublediff_final.xls", startrow=model_male_results.summary().tables[1].shape[0]+2, startcol=0)


In [None]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_stata("aerpp_ready_withmex.dta")

# Generate variables for affdist, aff_cohort_general, and coh_affdist
data['affdist'] = (data['int90_0'] > 0).astype(int)
data['aff_cohort_general'] = ((data['aff_cohort_56_65'] == 1) | (data['aff_cohort_46_55'] == 1) | (data['aff_cohort_36_45'] == 1)).astype(int)
data['coh_affdist'] = data['affdist'] * data['aff_cohort_general']

# Filter data for males and females separately
male_data = data[(data['sex'] == 1) & (data['cntry'] != 388)]
female_data = data[(data['sex'] == 0) & (data['cntry'] != 388)]

# Define a function to perform balancing tests
def perform_balancing_tests(data, gender):
    results = {}

    # Perform balancing tests
    results['migrate_mean'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['migrate'].mean()
    results['migrate_count'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['migrate'].count()
    results['sex_mean'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['sex'].mean()
    results['sex_count'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['sex'].count()
    results['primary_edu_mean'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['primary_edu'].mean()
    results['primary_edu_count'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['primary_edu'].count()

    return results

# Perform balancing tests for males
male_balancing_results = perform_balancing_tests(male_data, 'Male')

# Perform balancing tests for females
female_balancing_results = perform_balancing_tests(female_data, 'Female')

# Print or save the results as needed
print("Male Balancing Results:")
print(male_balancing_results)

print("\nFemale Balancing Results:")
print(female_balancing_results)


In [None]:
# Define a function to perform additional balancing tests for cruts_5avg
def perform_cruts_5avg_balancing_tests(data, gender):
    results = {}

    # Perform balancing tests for cruts_5avg
    results['cruts_5avg_mean'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['cruts_5avg'].mean()
    results['cruts_5avg_count'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['cruts_5avg'].count()

    return results

# Perform additional balancing tests for males for cruts_5avg
male_cruts_5avg_balancing_results = perform_cruts_5avg_balancing_tests(male_data, 'Male')

# Perform additional balancing tests for females for cruts_5avg
female_cruts_5avg_balancing_results = perform_cruts_5avg_balancing_tests(female_data, 'Female')

# Print or save the results as needed
print("Male Balancing Results for cruts_5avg:")
print(male_cruts_5avg_balancing_results)

print("\nFemale Balancing Results for cruts_5avg:")
print(female_cruts_5avg_balancing_results)


In [None]:
# Define a function to perform balancing tests for cruts_5avg by gender
def perform_cruts_5avg_balancing_tests_by_gender(data, gender):
    results = {}

    # Perform balancing tests for cruts_5avg for the specified gender
    results[f'{gender}_cruts_5avg_mean'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['cruts_5avg'].mean()
    results[f'{gender}_cruts_5avg_count'] = data.groupby(['wave', 'affdist', 'aff_cohort_general'])['cruts_5avg'].count()

    return results

# Perform balancing tests for cruts_5avg for females
female_cruts_5avg_balancing_results = perform_cruts_5avg_balancing_tests_by_gender(female_data, 'female')

# Perform balancing tests for cruts_5avg for males
male_cruts_5avg_balancing_results = perform_cruts_5avg_balancing_tests_by_gender(male_data, 'male')

# Print or save the results as needed
print("Female Balancing Results for cruts_5avg:")
print(female_cruts_5avg_balancing_results)

print("\nMale Balancing Results for cruts_5avg:")
print(male_cruts_5avg_balancing_results)
