In [None]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load and clean data
def load_data(file_name, dropna_subset=None):
    """Read a CSV file and drop the rows that contain missing values.

    Parameters
    ----------
    file_name : str or path-like
        Path of the CSV file. Relative paths are resolved against the
        current working directory by ``pandas.read_csv``.
    dropna_subset : list of str, optional
        Columns to inspect for missing values. ``None`` (the default)
        inspects every column, so a row is dropped if *any* field is NaN.
        Pass the columns actually used downstream to avoid discarding rows
        because of gaps in unrelated columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the complete rows; the original index
        labels are preserved.
    """
    data = pd.read_csv(file_name)
    # DataFrame.dropna returns a new frame rather than modifying in place,
    # so its result has to be the value that is handed back to the caller.
    return data.dropna(subset=dropna_subset)

In [None]:
# OLS Regression
def perform_ols(data):
    """Fit an OLS regression of ``ln_maddison_pcgdp2000`` on ``ln_export_area`` plus controls.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ln_maddison_pcgdp2000`` and every regressor listed in
        the body (thirteen named controls plus ``colony0`` .. ``colony7``).
        The frame is not modified.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(...).fit()``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``data``.
    """
    control_vars = [
        'ln_export_area', 'longitude', 'rain_min', 'humid_max', 'low_temp',
        'ln_coastline_area', 'island_dum', 'islam', 'legor_fr', 'region_n',
        'ln_avg_gold_pop', 'ln_avg_oil_pop', 'ln_avg_all_diamonds_pop',
    ]
    colony_dummies = [f'colony{i}' for i in range(8)]
    # NOTE(review): if colony0..colony7 form an exhaustive set of indicator
    # variables, including all eight together with a constant is perfectly
    # collinear -- confirm against the data and drop a reference category if so.
    independent_vars = control_vars + colony_dummies

    y = data['ln_maddison_pcgdp2000']
    X = sm.add_constant(data[independent_vars])  # prepend the intercept column

    # missing='drop' excludes observations with NaN in y or X; the statsmodels
    # default ('none') performs no NaN check and would yield an all-NaN fit.
    model = sm.OLS(y, X, missing='drop').fit()
    return model

In [None]:
# Instrumental Variable Regression - First Stage
def perform_iv_first_stage(data):
    """Fit the first-stage regression of ``ln_export_area`` on the four distance instruments.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ln_export_area`` and the four ``*_distance_minimum``
        instrument columns. The frame passed in is left untouched.

    Returns
    -------
    tuple
        ``(first_stage, data_with_prediction)`` where ``first_stage`` is the
        fitted results object and ``data_with_prediction`` is a *new*
        DataFrame equal to ``data`` plus a ``predicted_ln_export_area``
        column holding the first-stage fitted values.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    instruments = ['atlantic_distance_minimum', 'indian_distance_minimum',
                   'saharan_distance_minimum', 'red_sea_distance_minimum']
    y = data['ln_export_area']
    X = sm.add_constant(data[instruments])  # prepend the intercept column

    # Exclude observations with NaN in y or X instead of producing NaN estimates.
    first_stage = sm.OLS(y, X, missing='drop').fit()

    # Build a new frame rather than writing into the caller's object, so that
    # re-running this step is idempotent and never modifies a view of another frame.
    data = data.assign(predicted_ln_export_area=first_stage.predict(X))
    return first_stage, data

In [None]:
# Instrumental Variable Regression - Second Stage
def perform_iv_second_stage(data):
    """Regress ``ln_maddison_pcgdp2000`` on the first-stage fitted values.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ln_maddison_pcgdp2000`` and the
        ``predicted_ln_export_area`` column that ``perform_iv_first_stage``
        adds to the frame it returns.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(...).fit()``.

    Raises
    ------
    KeyError
        If ``predicted_ln_export_area`` or ``ln_maddison_pcgdp2000`` is
        missing from ``data``.
    """
    # Fail with an actionable message when the first stage has not been run
    # on this frame, instead of an opaque column-lookup error.
    if 'predicted_ln_export_area' not in data.columns:
        raise KeyError(
            "'predicted_ln_export_area' not found; run perform_iv_first_stage "
            "and pass the DataFrame it returns"
        )

    y = data['ln_maddison_pcgdp2000']
    X = sm.add_constant(data[['predicted_ln_export_area']])  # prepend the intercept column

    # NOTE(review): this is a manual two-step estimate. The point estimate is
    # the 2SLS coefficient, but the standard errors printed by this OLS fit
    # treat the fitted regressor as observed data and are therefore not valid
    # 2SLS standard errors -- use a dedicated IV estimator if inference matters.
    second_stage = sm.OLS(y, X, missing='drop').fit()
    return second_stage

In [None]:
# Plotting scatter plots
def plot_scatter(data, x_var, y_var, title, file_name):
    """Draw a seaborn ``regplot`` of ``y_var`` against ``x_var`` and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both plotted columns.
    x_var, y_var : str
        Column names for the horizontal and vertical axes.
    title : str
        Text placed above the axes.
    file_name : str or path-like
        Destination of the saved image.
    """
    # Explicit figure/axes handles instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(data=data, x=x_var, y=y_var, ax=ax)
    ax.set_title(title)
    fig.savefig(file_name)

In [None]:
# Saving regression summary and summary statistics
def save_summary(model, file_name, data, independent_vars):
    """Write a model's text summary followed by descriptive statistics to a file.

    Parameters
    ----------
    model : object
        Anything whose ``summary()`` returns an object with ``as_text()``.
    file_name : str or path-like
        Output path; an existing file is overwritten.
    data : pandas.DataFrame
        Source of the descriptive statistics.
    independent_vars : list of str
        Columns of ``data`` passed to ``describe()``.
    """
    regression_text = model.summary().as_text()
    descriptive_text = data[independent_vars].describe().to_string()
    # Assemble the full report first so the file is written in a single call.
    report = f"{regression_text}\n\nSummary Statistics:\n{descriptive_text}"
    with open(file_name, 'w') as out:
        out.write(report)

In [None]:
def main():
    """Run the whole analysis: load data, fit the models, save reports and plots.

    Reads ``replicationData.csv`` from the working directory, prints each
    regression summary, and writes three text reports
    (``OLS_Regression_Summary.txt``, ``IV_Regression_First_Summary.txt``,
    ``IV_Regression_Second_Summary.txt``) and three scatter plots
    (``ScatterPlot1.jpg`` .. ``ScatterPlot3.jpg``).
    """
    # Load data
    data = load_data('replicationData.csv')

    # OLS Regression
    ols_model = perform_ols(data)
    print("OLS Regression Summary:")
    print(ols_model.summary())
    # Descriptive statistics cover the non-dummy regressors only: leave out
    # the intercept and the colony indicator columns (selected by name rather
    # than by a positional slice tied to the number of dummies).
    ols_stat_vars = [name for name in ols_model.model.exog_names
                     if name != 'const' and not name.startswith('colony')]
    save_summary(ols_model, 'OLS_Regression_Summary.txt', data, ols_stat_vars)

    # IV Regression - First Stage
    iv_first_stage, data = perform_iv_first_stage(data)
    print("\nIV Regression First Stage Summary:")
    print(iv_first_stage.summary())
    # Describe the regressors that are actually in the first-stage model
    # (the instruments), taken from the fitted model itself.
    instrument_vars = [name for name in iv_first_stage.model.exog_names
                       if name != 'const']
    save_summary(iv_first_stage, 'IV_Regression_First_Summary.txt', data, instrument_vars)

    # IV Regression - Second Stage
    iv_second_stage = perform_iv_second_stage(data)
    print("\nIV Regression Second Stage Summary:")
    print(iv_second_stage.summary())
    save_summary(iv_second_stage, 'IV_Regression_Second_Summary.txt', data, ['predicted_ln_export_area'])

    # Scatter Plots: (x column, y column, title, output file)
    scatter_specs = [
        ('ln_export_area', 'ln_maddison_pcgdp2000', 'ln_export_area vs ln_maddison_pcgdp2000', 'ScatterPlot1.jpg'),
        ('ln_pop_dens_1400', 'ln_export_area', 'ln_pop_dens_1400 vs ln_export_area', 'ScatterPlot2.jpg'),
        ('ln_export_area', 'ethnic_fractionalization', 'ln_export_area vs Ethnic Fractionalization', 'ScatterPlot3.jpg'),
    ]
    for x_var, y_var, title, file_name in scatter_specs:
        plot_scatter(data, x_var, y_var, title, file_name)

In [None]:
# Entry point: run the full pipeline when this cell/script is executed
# directly; the guard is false when the code is imported as a module.
if __name__ == "__main__":
    main()