<a href="https://colab.research.google.com/github/Daniel-Brett-Stark/PortfolioProjects/blob/main/LassoProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.impute import KNNImputer
from sklearn import linear_model
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from collections import deque
import statsmodels.api as sm

import warnings

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


#### Clean the Data

In [3]:
# import df
df = pd.read_csv('/content/gdrive/My Drive/Econ 484/MergeDataUnemployAllCountries.csv')

In [4]:
# first attempt to convert dtypes
df_conv = df.convert_dtypes()

In [5]:
# need to shift unemployment rate one year back for each country
# find codes for each country
country_codes = list(df_conv['countrycode'].unique())

with warnings.catch_warnings() as w:
    warnings.simplefilter('ignore')
    # create sub dataframes for each country and append to list
    sub_dfs = []
    for i in country_codes:
        sub_dfs.append(df_conv[df_conv['countrycode'] == i])

    # loop through sub-dfs and perform
    for d in sub_dfs:
        ur = deque(d['value55'])
        ur.popleft()
        ur.append(0)
        d['value55'] = ur  

    # concatenate sub-dfs
    agg_df = pd.concat(sub_dfs, axis=0)
    df_conv = agg_df.copy()

In [6]:
# isolate int df
df_int = df_conv.loc[:, [x for x in df_conv.columns if x not in ['countryname', '_merge']]]

# get list of df columns
df_col_lst = [c for c in df_int.columns]

# iterate through df column list
for i, col in enumerate(df_col_lst):
    # if column name not 'countrycode'
    if col != "countrycode":
        # if df column not numeric
        if not is_numeric_dtype(df_conv[col]): 
            try:
                # try to convert to float type
                df_int[col] = df_int[col].astype(np.float64)
            # except a value error
            except ValueError as e:
                # cast series to list
                values = list(df_int[col])
                # make a new list that is dtype neutral
                new_values = [0] * len(values) 
                # iterate through series values in list
                for i, v in enumerate(values):
                    try:
                        # try to convert each value to float
                        v = float(v)
                        new_values[i] = v
                    except ValueError:
                        # if not possible, fill with np.nan
                        v = np.nan
                        new_values[i] = v
                # replace original column with new series
                df_int[col] = new_values

# make a new dataframe 
df_nan = df_int.copy()

df_sparse = df_nan.loc[:, [x for x in df_nan.columns if x != "countrycode"]]

In [7]:
# impute np.nan
imputer = KNNImputer(n_neighbors=6, weights='distance',)
df_imp = imputer.fit_transform(df_sparse)
df_filled = pd.DataFrame(df_imp, columns=df_sparse.columns)
df_filled.head()

Unnamed: 0,year,multiple,success,suicide,nkill,nwound,value1,value2,value3,value4,...,value47,value48,value49,value50,value51,value52,value53,value54,value55,value56
0,2000.0,0.0,14.0,0.0,38.0,11.0,153.951,17.564277,52.028783,12.4,...,0.975717,58.400833,47.066355,652860.0,17.927334,0.870934,69.533311,22.65861,19.03097,3.392644
1,2001.0,3.0,14.0,1.0,174.0,59.0,150.192,22.775604,167.122933,81.00364,...,0.0,12.32462,64.714019,652860.0,14.412032,6.721075,25.988566,10.46224,19.087431,4.31413
2,2002.0,2.0,28.0,2.0,74.0,297.0,146.433,38.627892,43.015907,43.374123,...,0.784725,23.973229,28.529082,652860.0,13.524123,10.531112,22.532864,2.023935,40.055107,4.910849
3,2003.0,8.0,93.0,1.0,163.0,186.0,143.7648,37.418855,33.942762,14.3,...,0.54426,12.54889,38.909696,652860.0,9.829646,14.601887,15.533888,1.058172,40.024896,5.080775
4,2004.0,0.0,79.0,3.0,275.0,263.0,141.0966,29.721067,20.623569,37.576335,...,0.40582,17.568081,24.444444,652860.0,3.691927,12.868054,9.5,0.717118,41.175943,4.977038


In [8]:
# # create feature space
feature_df = df_filled.filter(axis=1, like="value")
with warnings.catch_warnings() as warn:
    warnings.simplefilter('ignore')
    feature_df.loc[:, "year"] = df_filled.loc[:, 'year']
    feature_df.loc[:, 'countrycode'] = df_nan.loc[:, 'countrycode']

feature_df.head()

Unnamed: 0,value1,value2,value3,value4,value5,value6,value7,value8,value9,value10,...,value49,value50,value51,value52,value53,value54,value55,value56,year,countrycode
0,153.951,17.564277,52.028783,12.4,0.037055,5.3,71.489959,2693.659733,1678.006343,24.26321,...,47.066355,652860.0,17.927334,0.870934,69.533311,22.65861,19.03097,3.392644,2000.0,AFG
1,150.192,22.775604,167.122933,81.00364,0.037488,31.793778,17.972248,1955.292465,1237.409227,19.158266,...,64.714019,652860.0,14.412032,6.721075,25.988566,10.46224,19.087431,4.31413,2001.0,AFG
2,146.433,38.627892,43.015907,43.374123,0.048671,16.021135,42.748197,1914.16084,1099.609148,27.122062,...,28.529082,652860.0,13.524123,10.531112,22.532864,2.023935,40.055107,4.910849,2002.0,AFG
3,143.7648,37.418855,33.942762,14.3,0.057008,10.3,18.25151,1915.753573,1099.80689,23.028887,...,38.909696,652860.0,9.829646,14.601887,15.533888,1.058172,40.024896,5.080775,2003.0,AFG
4,141.0966,29.721067,20.623569,37.576335,0.0457,20.678618,19.947047,1887.116961,1087.000756,17.452159,...,24.444444,652860.0,3.691927,12.868054,9.5,0.717118,41.175943,4.977038,2004.0,AFG


In [9]:
# create outcome df

outcome_var = "success"
outcome_ser = df_filled.loc[:, outcome_var]
outcome_ser

0       14.0
1       14.0
2       28.0
3       93.0
4       79.0
        ... 
1492     3.0
1493     1.0
1494     3.0
1495     1.0
1496     2.0
Name: success, Length: 1497, dtype: float64

In [10]:
# Uncomment to send data out to a csv

# output_file_path = "clean_econ_terror.csv"
# feature_df[outcome_var] = outcome_ser
# feature_df.to_csv(output_file_path)

#### Analyze the Data

In [11]:
# assign outcome, treatment, and features
y = outcome_ser
d = feature_df.loc[:,'value55']
X = feature_df.drop(columns=['value55', 'countrycode',], axis=1)

# Set test train split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)
X_train, X_test, y_train, y_test, indicies_train, indicies_test = train_test_split(X, y, np.arange(X.shape[0]), test_size=.33, random_state=42)

# also split the treatment variable
d_train = d[indicies_train]
d_test = d[indicies_test]

In [12]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    # fit lasso model on features and outcome
    lassoy = LassoCV(normalize=True).fit(X_train, y_train)

    # fit lasso model on features and treatment
    lassod = LassoCV(normalize=True).fit(X_train, d_train)

    # grab features with non-zero coefficients
    Xunion = X_train.iloc[:, (lassod.coef_!=0) + (lassoy.coef_!=0)]
    
    # concatenate treatment and features with non-zero coefficients
    rhs = pd.concat([d_train, Xunion],axis=1)

    # regress treatment and non-zero coefficients on outomce
    fullreg=linear_model.LinearRegression().fit(rhs, y_train)

    # print coefficient
    print("PDS regression effect of unemployment on terror: {:.3f}".format(fullreg.coef_[0]))

PDS regression effect of unemployment on terror: 0.321


In [13]:
# define models
modely = LassoCV(normalize=True)
modeld = LassoCV(normalize=True)

# modely = RandomForestRegressor()
# modeld = RandomForestRegressor()

# create our sample splitting "object"
kf = KFold(n_splits=3,shuffle=True, random_state=12)

# apply the splits to our Xs
kf.get_n_splits(X)

# initialize array to hold each fold's regression coefficient
coeffs=np.zeros(3)
pvals = np.zeros(3)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Now loop through each fold
    ii=0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        d_train, d_test = d.iloc[train_index], d.iloc[test_index]

        # Do DDML thing
        # Ridge y on training folds:
        modely.fit(X_train, y_train)

        # but get residuals in test set
        yresid=y_test-modely.predict(X_test)

        #Ridge d on training folds
        modeld.fit(X_train,d_train)

        #but get residuals in test set
        dresid=d_test-modeld.predict(X_test)

        # regress resids on resids       
        dresid = sm.add_constant(dresid)
        mod = sm.OLS(yresid, dresid).fit()

        # save coefficient in a vector
        coeffs[ii]=mod.params[1]
        pvals[ii]=mod.pvalues[1]
        ii+=1

# Take average
print("Double-Debiased Machine Learning effect of recessions on terrorism: {:.3f}".format(np.mean(coeffs)))
print("Lasso Coefficients: {}".format(coeffs))
print(f"P-values: {pvals}")
print(f"Avg P-values: {np.mean(pvals)}")

Double-Debiased Machine Learning effect of recessions on terrorism: 1.153
Lasso Coefficients: [ 2.27747044 -1.77872585  2.96056012]
P-values: [0.01376954 0.04729613 0.01167329]
Avg P-values: 0.024246318197873807
