In [433]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import statsmodels.api as sm

#from skbio.stats.composition import clr
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.decomposition import PCA
from factor_analyzer import FactorAnalyzer

def color_big_red(val):
    """Return the CSS colour rule used to style one table cell.

    Values whose magnitude is strictly greater than 0.4 are shown in red;
    everything else is shown in black.
    """
    if abs(val) > 0.4:
        shade = 'red'
    else:
        shade = 'black'
    return 'color: {}'.format(shade)

## Data 

In [434]:
# Load the cleaned 2020 CO terms workbook into `data`.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs on the
# original author's computer — consider a configurable data directory.
data = pd.read_excel(r'C:\Users\boai-\Desktop\UnumAI\Colorado analysis\2020_CO_terms_cleaned.xlsx')

KeyboardInterrupt: 

In [None]:
data.shape

In [None]:
data.head()

In [None]:
data[['Category', 'Term']].groupby('Category').count()

In [None]:
# Convenience subsets of `data`, one per (merged) topic category.
# Not used by the cells shown here, but handy for ad-hoc inspection.
def _rows_in(*categories):
    """Return the rows of `data` whose 'Category' is one of `categories`."""
    return data[data['Category'].isin(list(categories))]

conspiracy = _rows_in('Conspiracy', 'Mueller/Russia')
corpreg_econ = _rows_in('Corporate Regulation', 'Economy')
crime = _rows_in('Criminal Justice', 'Public Safety', 'Guns')
Health = _rows_in('Healthcare')
education = _rows_in('Education')
electability = _rows_in('Electability')
environment = _rows_in('Environmental')
ethics = _rows_in('Ethics/Morals')
immigration = _rows_in('Immigration')
impeachment = _rows_in('Impeachment')
law_gov = _rows_in(
    'Campaign Finance', 'Judiciary Branch', 'Generic Policy Queries',
    'Voter Turnout', 'Election Security', 'Congress', 'Cybersecurity',
)
media = _rows_in('Media')
religion = _rows_in('Religion')
rephealth = _rows_in('Reproductive Health')
soceq = _rows_in('Social Equality')
taxes = _rows_in('Taxes/Spending')
trade_ag = _rows_in('Trade', 'Agriculture')
war = _rows_in('War')

In [None]:
data.head()

# PCA
#### Run if PCA has not already been done

In [None]:
# Build the factor-analysis input: mean term frequency per category for every date
# column (missing frequencies count as 0), transposed so that each row is one date.
# 'Unnamed: 0' is the spreadsheet's saved row index, not a date. It is removed here
# (the regression table further down drops it as well) so that its per-category mean
# does not become a bogus extra observation.
df = (
    data.drop(columns=['Term'])
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .fillna(0)
        .groupby('Category')
        .mean()
        .T
)
# Categories left out of the factor analysis. Assigning the result (instead of
# inplace=True) keeps the cell safe to re-run.
df = df.drop(columns=['Space', 'Impeachment', 'State/Local'])


In [None]:
df.head()

In [None]:
# Turn every row into shares of its own total, so each row sums to 1.
row_totals = df.sum(axis=1)
for category in list(df.columns):
    df[category] = df[category] / row_totals

In [None]:
# Varimax-rotated factor analysis with 14 factors on the category shares.
fa = FactorAnalyzer(n_factors=14, rotation='varimax')
fa.fit(df)

# Loadings table, one row per category, with |loading| > 0.4 highlighted in red.
# Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
# deprecated, so use whichever method this pandas version provides.
loadings_styler = pd.DataFrame(fa.loadings_, index=list(df.columns)).style
_style_each_cell = getattr(loadings_styler, 'map', None) or loadings_styler.applymap
factors = _style_each_cell(color_big_red)
factors

In [None]:
fa.get_factor_variance()

## Map data to PCA Categories and include dependents

In [None]:
# PCA Category Mappings and Dependent variable files
pca_cats = pd.read_excel('2020 Dems Category Mapping.xlsx')
dependents = pd.read_excel('2020 Dependent.xlsx')

In [None]:
dependents = dependents.rename(columns = {'Unnamed: 0':'Date'})

In [None]:
dependents.head()

In [None]:
# Sum term frequencies per PCA component for every date column, then transpose so
# that each row is one date. numeric_only=True keeps text columns (e.g. 'Category')
# out of the sum; on pandas >= 2.0 they would otherwise be concatenated and turn
# into non-date rows after the transpose.
reg_df = (
    pd.merge(data, pca_cats, on='Category', how='left')
      .drop(["Term"], axis=1)
      .groupby('Component')
      .sum(numeric_only=True)
      .T
)
# 'Unnamed: 0' is the saved spreadsheet row index, not a date.
reg_df = reg_df.drop('Unnamed: 0', axis=0)
reg_df = reg_df.reset_index().rename(columns={'index': 'Date'})
# infer_datetime_format is deprecated (pandas 2.0) and was only a parsing-speed hint.
reg_df['Date'] = pd.to_datetime(reg_df['Date'])
# Keep only dates that also have dependent (poll) values.
reg_df = pd.merge(reg_df, dependents, on='Date', how='inner')
reg_df = reg_df.fillna(0)

In [None]:
# reg_df will be the clean dataframe used for modeling
reg_df.head()

In [None]:
reg_df.columns

# Line plots for each variable
#### Identify spikes and unusual behavior

In [None]:
# Un-rolled data: one line chart per predictor column.
reg_df_date = reg_df.drop(
    columns=['Date', 'Hickenlooper_raw', 'Hickenlooper_adj', 'Gardner_raw', 'Gardner_adj', 'Electability']
)
reg_cols = reg_df_date.columns
for column_name in reg_cols:
    series = reg_df_date[column_name]
    plt.plot(series)
    plt.title(column_name)
    plt.show()

# Histograms
#### Distribution of each variable

In [None]:
# One histogram per predictor column of the un-rolled data.
for column_name in reg_cols:
    values = reg_df_date[column_name]
    plt.hist(values)
    plt.title(column_name)
    plt.show()

# Rolling data
#### Use if you want to take a rolling average of the data

### A rolling average smooths day-to-day noise and gives a steadier, roughly weekly, trend

In [None]:
# 7-row rolling mean of every column, keyed by date. Rows whose window is still
# incomplete come out as NaN and are dropped before Date is restored as a column.
roll_7 = (
    reg_df.set_index(['Date'])
          .rolling(7)
          .mean()
          .dropna()
          .reset_index()
)
roll_7.head()

## Line plots for each variable after rolling data 
## Identify spikes and unusual behavior after rolling data 

In [None]:
roll_plot = roll_7.drop(['Date','Hickenlooper_raw','Hickenlooper_adj' ,'Gardner_raw','Gardner_adj','Electability'], axis = 1)
roll_plot_cols = roll_plot.columns


In [None]:
roll_plot_cols

# Rolling Train/Test and data exploration
#### Replace X variables dataframe with non rolled version if necessary

In [None]:
# Predictors: every column of the rolled table except the date, the poll columns and
# Electability. (To model un-rolled data, build this from reg_df instead and rename
# the variable throughout.)
non_predictors = ['Date', 'Hickenlooper_raw', 'Hickenlooper_adj', 'Gardner_raw', 'Gardner_adj', 'Electability']
x_7 = roll_7.drop(columns=non_predictors)

# Targets: the *_adj column of each candidate.
Hickenlooper_adj = roll_7['Hickenlooper_adj']
Gardner_adj = roll_7['Gardner_adj']

# Random 70/30 split; the shared seed gives both targets the same row partition.
(x_train_Hickenlooper_adj, x_test_Hickenlooper_adj,
 y_train_Hickenlooper_adj, y_test_Hickenlooper_adj) = train_test_split(
    x_7, Hickenlooper_adj, test_size=0.3, random_state=42)
(x_train_Gardner_adj, x_test_Gardner_adj,
 y_train_Gardner_adj, y_test_Gardner_adj) = train_test_split(
    x_7, Gardner_adj, test_size=0.3, random_state=42)

### Train Test random split

In [None]:
# Hickenlooper_adj: fit on the training rows, then report R^2 on both partitions.
reg_Hickenlooper_adj = LinearRegression()
reg_Hickenlooper_adj.fit(x_train_Hickenlooper_adj, y_train_Hickenlooper_adj)
pred_train_Hickenlooper_adj = reg_Hickenlooper_adj.predict(x_train_Hickenlooper_adj)
pred_test_Hickenlooper_adj = reg_Hickenlooper_adj.predict(x_test_Hickenlooper_adj)
r2_train = round(r2_score(y_train_Hickenlooper_adj, pred_train_Hickenlooper_adj), 3)
r2_test = round(r2_score(y_test_Hickenlooper_adj, pred_test_Hickenlooper_adj), 3)
print('Hickenlooper_adj:: Train:', r2_train, ' Test:', r2_test)

# Gardner_adj: same procedure.
reg_Gardner_adj = LinearRegression()
reg_Gardner_adj.fit(x_train_Gardner_adj, y_train_Gardner_adj)
pred_train_Gardner_adj = reg_Gardner_adj.predict(x_train_Gardner_adj)
pred_test_Gardner_adj = reg_Gardner_adj.predict(x_test_Gardner_adj)
r2_train = round(r2_score(y_train_Gardner_adj, pred_train_Gardner_adj), 3)
r2_test = round(r2_score(y_test_Gardner_adj, pred_test_Gardner_adj), 3)
print('Gardner_adj:: Train:', r2_train, ' Test:', r2_test)



### Train Test chronological split

In [None]:
# Chronological hold-out: the first 200 rows train, everything after tests.
split_at = 200

x_7_train = x_7.iloc[:split_at]
x_7_test = x_7.iloc[split_at:]

Hickenlooper_adj_train = roll_7['Hickenlooper_adj'].iloc[:split_at]
Hickenlooper_adj_test = roll_7['Hickenlooper_adj'].iloc[split_at:]

Gardner_adj_train = roll_7['Gardner_adj'].iloc[:split_at]
Gardner_adj_test = roll_7['Gardner_adj'].iloc[split_at:]

In [None]:
x_7_test.describe()

In [None]:
# Hickenlooper_adj: fit on the first block of rows, report R^2 on both blocks.
reg_Hickenlooper_adj = LinearRegression()
reg_Hickenlooper_adj.fit(x_7_train, Hickenlooper_adj_train)
pred_train_Hickenlooper_adj = reg_Hickenlooper_adj.predict(x_7_train)
pred_test_Hickenlooper_adj = reg_Hickenlooper_adj.predict(x_7_test)
r2_train = round(r2_score(Hickenlooper_adj_train, pred_train_Hickenlooper_adj), 3)
r2_test = round(r2_score(Hickenlooper_adj_test, pred_test_Hickenlooper_adj), 3)
print('Hickenlooper_adj:: Train:', r2_train, ' Test:', r2_test)

# Gardner_adj: same procedure.
reg_Gardner_adj = LinearRegression()
reg_Gardner_adj.fit(x_7_train, Gardner_adj_train)
pred_train_Gardner_adj = reg_Gardner_adj.predict(x_7_train)
pred_test_Gardner_adj = reg_Gardner_adj.predict(x_7_test)
r2_train = round(r2_score(Gardner_adj_train, pred_train_Gardner_adj), 3)
r2_test = round(r2_score(Gardner_adj_test, pred_test_Gardner_adj), 3)
print('Gardner_adj:: Train:', r2_train, ' Test:', r2_test)



In [None]:
# Test-set R^2 of the Gardner model. It has to be scored against the Gardner target;
# the Hickenlooper series is a different dependent variable.
reg_Gardner_adj.score(x_7_test, Gardner_adj_test)

## PCA after Rolling data 

In [None]:
x_7.describe()

In [None]:
# Turn every row of x_7 into shares of its own total, so each row sums to 1.
# NOTE(review): this rewrites x_7 itself, and the regression cells further down
# reuse x_7 — confirm they are meant to run on the normalised values.
row_totals = x_7.sum(axis=1)
for component in list(x_7.columns):
    x_7[component] = x_7[component] / row_totals

In [None]:
# Varimax-rotated factor analysis with 14 factors on the normalised rolled predictors.
fa_rd = FactorAnalyzer(n_factors=14, rotation='varimax')
fa_rd.fit(x_7)

# Loadings table, one row per predictor, with |loading| > 0.4 highlighted in red.
# Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
# deprecated, so use whichever method this pandas version provides.
loadings_styler = pd.DataFrame(fa_rd.loadings_, index=list(x_7.columns)).style
_style_each_cell = getattr(loadings_styler, 'map', None) or loadings_styler.applymap
factors = _style_each_cell(color_big_red)
factors

In [None]:
fa_rd.get_factor_variance()

# Regression
#### Recommend naming variables per candidate as in steyer_baseline instead of just baseline
#### Adjust first line for model without a constant(intercept)

### Repeat per candidate -- Gardner 

In [None]:
# Baseline model: Gardner_adj regressed on every predictor in x_7 plus an intercept.
Gardner_adj_baseline = sm.add_constant(x_7)
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_baseline)
res = mod.fit()
print(res.summary())

In [None]:
Gardner_adj_baseline = Gardner_adj_baseline.drop(['Trade_Agr'], axis = 1)

In [None]:
# Elimination step 1: refit with the design matrix left after removing 'Trade_Agr'.
Gardner_adj_final_1 = Gardner_adj_baseline
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_final_1)
res = mod.fit()
res.summary()

In [None]:
Gardner_adj_baseline = Gardner_adj_baseline.drop(['War'], axis = 1)

In [None]:
# Elimination step 2: refit with the design matrix left after also removing 'War'.
Gardner_adj_final_2 = Gardner_adj_baseline
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_final_2)
res = mod.fit()
res.summary()

In [None]:
Gardner_adj_baseline = Gardner_adj_baseline.drop(['Ethics'], axis = 1)

In [None]:
# Elimination step 3: refit with the design matrix left after also removing 'Ethics'.
Gardner_adj_final_3 = Gardner_adj_baseline
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_final_3)
res = mod.fit()
res.summary()

In [None]:
Gardner_adj_baseline = Gardner_adj_baseline.drop(['ReprHealth'], axis = 1)

In [None]:
# Elimination step 4: refit with the design matrix left after also removing 'ReprHealth'.
Gardner_adj_final_4 = Gardner_adj_baseline
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_final_4)
res = mod.fit()
res.summary()

In [None]:
Gardner_adj_baseline = Gardner_adj_baseline.drop(['Environment'], axis = 1)

In [None]:
# Elimination step 5: refit with the design matrix left after also removing 'Environment'.
Gardner_adj_final_5 = Gardner_adj_baseline
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_final_5)
res = mod.fit()
res.summary()

In [None]:
Gardner_adj_baseline = Gardner_adj_baseline.drop(['Religion'], axis = 1)

In [None]:
# Elimination step 6: refit with the design matrix left after also removing 'Religion'.
Gardner_adj_final_6 = Gardner_adj_baseline
mod = sm.OLS(endog=Gardner_adj, exog=Gardner_adj_final_6)
res = mod.fit()
res.summary()

In [None]:
# Correlation Matrix of significant variables
Gardner_adj_final_6.drop(['const'], axis = 1).corr()

In [None]:
# Heat map of the pairwise correlations among the retained predictors (intercept excluded).
corrmat = Gardner_adj_final_6.drop(columns=['const']).corr()

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrmat, cmap="coolwarm", linewidths=0.1, ax=ax)

In [None]:
# Variance inflation factor of every column of the final design matrix, smallest first.
design = Gardner_adj_final_6
vif_values = [
    variance_inflation_factor(design.values, position)
    for position in range(design.shape[1])
]
pd.Series(vif_values, index=design.columns).sort_values()

In [None]:
Gardner_adj_final_6.describe()

## time split testing data and training data R2

In [None]:
x_7.head()

In [None]:
Gardner_adj_final_6.describe()

In [None]:
# Hold-out split for the final (reduced) Gardner model.
# X is the set of predictors kept in Gardner_adj_final_6, without 'const' because
# LinearRegression fits its own intercept. y is the Gardner_adj target series.
# Passing the design matrix as y would make the model predict features from features.
(x_train_Gardner_adj_final_6, x_test_Gardner_adj_final_6,
 y_train_Gardner_adj_final_6, y_test_Gardner_adj_final_6) = train_test_split(
    Gardner_adj_final_6.drop(columns=['const']), Gardner_adj,
    test_size=0.3, random_state=42)

In [None]:
x_test_Gardner_adj_final_6.describe()

In [None]:
# Final-model check: fit on the training partition, report R^2 on both partitions.
reg_Gardner_adj_final_6 = LinearRegression()
reg_Gardner_adj_final_6.fit(x_train_Gardner_adj_final_6, y_train_Gardner_adj_final_6)
post_train_Gardner_adj_final_6 = reg_Gardner_adj_final_6.predict(x_train_Gardner_adj_final_6)
post_test_Gardner_adj_final_6 = reg_Gardner_adj_final_6.predict(x_test_Gardner_adj_final_6)
r2_train = round(r2_score(y_train_Gardner_adj_final_6, post_train_Gardner_adj_final_6), 3)
r2_test = round(r2_score(y_test_Gardner_adj_final_6, post_test_Gardner_adj_final_6), 3)
print('Gardner_adj_final_6:: Train:', r2_train, ' Test:', r2_test)



# ===================================

# ===================================

# Candidate Term Attribution Models
#### Models using only the terms that the candidates have used

In [None]:
#Read in attribution file
attribution = pd.read_excel(r'C:\Users\boai-\Desktop\UnumAI\Colorado analysis\2020 CO Attribution.xlsx')
attribution.head()

In [None]:
attribution['Term'] = attribution['Unnamed: 0']

In [None]:
#join with original data
att_data = data.merge(attribution, on = 'Term', how = 'inner')
att_data = att_data.fillna(0)
att_data.shape
#att_data.head()

In [None]:
att_data.head()

In [None]:
# Rows with a positive 'John Hickenlooper' attribution value, without the saved
# index columns from the merge and without the other candidates' columns.
has_hickenlooper_value = att_data['John Hickenlooper'] > 0
Hickenlooper_att = att_data[has_hickenlooper_value].drop(
    columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'Cory Gardner', 'Other Dems']
)

In [None]:
Hickenlooper_att.head()

In [None]:
H_group = Hickenlooper_att[['Category', 'Term']].groupby('Category').count()
H_group.head()

In [None]:
H_group.shape

In [None]:
# Rows with a positive 'Cory Gardner' attribution value, without the saved
# index columns from the merge and without the other candidates' columns.
has_gardner_value = att_data['Cory Gardner'] > 0
Gardner_att = att_data[has_gardner_value].drop(
    columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'John Hickenlooper', 'Other Dems']
)

In [None]:
Gardner_att.head()

In [None]:
G_group = Gardner_att[['Category', 'Term']].groupby('Category').count()
G_group.head()

In [None]:
G_group.shape

In [None]:
# Term counts per category for the two candidates, side by side.
# An outer join keeps categories that only one candidate has terms in (a left join
# silently drops Gardner-only categories); the missing side is counted as 0.
result = pd.merge(H_group, G_group, how='outer', on=['Category']).fillna(0)
# Column order follows the merge: Hickenlooper counts first, then Gardner counts.
result.columns = ['John Hickenlooper', 'Cory Gardner']
result.head()

In [None]:
#result = result.reset_index()
#result = result.rename(columns = { 'index': "Category"})
#result.head()

In [None]:
# Bar chart of the per-category term counts for both candidates.
# pandas and matplotlib are already imported in the first cell of the notebook,
# so they are not imported again here.
ax = result.plot.bar()
ax.set_xlabel('Category')
ax.set_ylabel('Number of terms')
ax.set_title('Terms per category, by candidate')
plt.tight_layout()
plt.show()


In [None]:
# Sum the Gardner-attributed term frequencies per PCA component for every date column,
# then transpose so that each row is one date. numeric_only=True keeps text columns
# (e.g. 'Category') out of the sum; on pandas >= 2.0 they would otherwise be
# concatenated and turn into non-date rows after the transpose.
reg_df_Gardner = (
    pd.merge(Gardner_att, pca_cats, on='Category', how='left')
      .drop(["Term"], axis=1)
      .groupby('Component')
      .sum(numeric_only=True)
      .T
)


In [None]:
reg_df_Gardner.head()

In [None]:
# Move the row labels into a 'Date' column.
reg_df_Gardner = reg_df_Gardner.reset_index().rename(columns={'index': 'Date'})
# Some row labels are not dates (e.g. the 'Cory Gardner' attribution column, which is
# still in Gardner_att and is carried through the groupby). Parse what can be parsed and
# drop the rest explicitly, instead of letting to_datetime raise on them.
# infer_datetime_format is deprecated (pandas 2.0) and was only a parsing-speed hint.
reg_df_Gardner['Date'] = pd.to_datetime(reg_df_Gardner['Date'], errors='coerce')
reg_df_Gardner = reg_df_Gardner.dropna(subset=['Date'])
# Keep only dates that also have dependent (poll) values.
reg_df_Gardner = pd.merge(reg_df_Gardner, dependents, on='Date', how='inner')
reg_df_Gardner = reg_df_Gardner.fillna(0)
reg_df_Gardner.head()

In [None]:
reg_df_Gardner = reg_df_Gardner.drop(['Date','Hickenlooper_raw','Hickenlooper_adj' ,'Gardner_raw','Gardner_adj','Electability'], axis = 1)


In [None]:
reg_df_Gardner.head()

# =======================================

# Gardner - attribution model

In [None]:
# Removing Electability, Date, and other Candidates
Gardner_att_data = reg_df_Gardner

In [None]:
# Gardner_att_baseline is only created in a later cell, so on a fresh kernel it does
# not exist yet; inspect the predictor table it will be built from instead.
Gardner_att_data.shape

In [None]:
Gardner_att.shape

In [None]:
Gardner_adj.shape

In [None]:
# Attribution baseline: Gardner_adj regressed on the Gardner-attributed component
# columns plus an intercept.
# NOTE(review): Gardner_adj was last assigned from the rolled table (roll_7), while
# Gardner_att_data comes from the un-rolled reg_df_Gardner — confirm both cover the
# same rows before relying on this fit.
Gardner_att_baseline = sm.add_constant(Gardner_att_data)
mod = sm.OLS(Gardner_adj, Gardner_att_baseline)
res = mod.fit()
print(res.summary())

In [None]:
# Final attribution model for Gardner: refit after removing the predictors judged
# insignificant in the baseline summary. The original cell was an unfilled template
# (a '***Drop Columns Here***' placeholder and names from another candidate's
# notebook), which is not valid Python.
Gardner_att_drop_cols = []  # TODO: list the component columns to remove
Gardner_att_final = sm.add_constant(Gardner_att_data.drop(Gardner_att_drop_cols, axis=1))
# Alias under the template name, which the correlation and VIF cells below still use.
steyer_att_final = Gardner_att_final
mod = sm.OLS(Gardner_adj, Gardner_att_final)
res = mod.fit()
res.summary()

In [None]:
steyer_att_final.corr()

In [None]:
pd.Series([variance_inflation_factor(steyer_att_final.values, i) 
               for i in range(steyer_att_final.shape[1])], 
              index=steyer_att_final.columns).sort_values()

## Regression model for term Impeachment

In [463]:
pca_cat = pd.read_excel('2020 Dems Category Mapping.xlsx')
dependents = pd.read_excel('2020 Dependent.xlsx')

In [464]:
data = pd.read_excel(r'C:\Users\boai-\Desktop\UnumAI\Colorado analysis\2020_CO_terms_Law_Govt_cleaned.xlsx')

In [465]:
dependents = dependents.rename(columns = {'Unnamed: 0':'Date'})

In [466]:
dependents.shape

(213, 5)

In [467]:
dependents

Unnamed: 0,Date,Hickenlooper_raw,Gardner_raw,Hickenlooper_adj,Gardner_adj
0,2019-02-15,0.431907,0.568093,0.474177,0.525823
1,2019-02-16,0.431490,0.568510,0.474032,0.525968
2,2019-02-17,0.430931,0.569069,0.473836,0.526164
3,2019-02-18,0.430737,0.569263,0.473768,0.526232
4,2019-02-19,0.432308,0.567692,0.474318,0.525682
...,...,...,...,...,...
208,2019-09-11,0.561576,0.438424,0.519562,0.480438
209,2019-09-12,0.561216,0.438784,0.519436,0.480564
210,2019-09-13,0.561136,0.438864,0.519408,0.480592
211,2019-09-14,0.561026,0.438974,0.519369,0.480631


In [468]:
data.head()

Unnamed: 0.1,Unnamed: 0,Term,Category,2019-02-15 00:00:00,2019-02-16 00:00:00,2019-02-17 00:00:00,2019-02-18 00:00:00,2019-02-19 00:00:00,2019-02-20 00:00:00,2019-02-21 00:00:00,...,2019-10-06 00:00:00,2019-10-07 00:00:00,2019-10-08 00:00:00,2019-10-09 00:00:00,2019-10-10 00:00:00,2019-10-11 00:00:00,2019-10-12 00:00:00,2019-10-13 00:00:00,2019-10-14 00:00:00,2019-10-15 00:00:00
0,0,115th congress,Law_Govt,8e-06,3.1e-05,1.1e-05,1e-05,2.3e-05,9e-06,,...,,9e-06,,8e-06,,9e-06,,,,1.6e-05
1,1,116th congress,Law_Govt,3.5e-05,,,2.7e-05,1.4e-05,2.4e-05,2.3e-05,...,4.6e-05,3.4e-05,2.4e-05,8e-06,1.6e-05,9e-06,,1.3e-05,2.7e-05,2.2e-05
2,2,2 house,Law_Govt,4.2e-05,1.3e-05,1.8e-05,4.5e-05,4.6e-05,3.9e-05,2.9e-05,...,3.6e-05,7.9e-05,7.1e-05,6.2e-05,5.5e-05,5.8e-05,5.8e-05,1.9e-05,6.3e-05,5.2e-05
3,3,3 branches of government,Law_Govt,6.2e-05,3.1e-05,5.5e-05,6.2e-05,6.2e-05,5.2e-05,5.5e-05,...,6.5e-05,7.7e-05,5.4e-05,7e-05,8.6e-05,6.1e-05,2.4e-05,4.7e-05,4.4e-05,7.6e-05
4,4,achieved,Law_Govt,1.1e-05,1.5e-05,2e-05,1.1e-05,,2.4e-05,1.1e-05,...,,1.4e-05,1.1e-05,2.4e-05,,1.3e-05,1.3e-05,,1.2e-05,


In [469]:
# Remove the saved spreadsheet row index and the category label; only 'Term' and the
# date columns are needed from here on. (pandas is already imported in the first
# cell, so the repeated import is dropped.)
data = data.drop(columns=['Unnamed: 0', 'Category'])


In [470]:
# Reshape to one row per date and one column per term.
# 'Term' is moved into the index before transposing so that the transposed values stay
# numeric; transposing with the term strings included turns every column into object dtype.
data = data.set_index('Term').T
# The former column labels are the dates; parse them so 'Date' has a real datetime
# dtype when it is later merged with the dependents table.
data.index = pd.to_datetime(data.index)
data = data.reset_index().rename(columns={'index': 'Date'})
data.head()

Term,Date,115th congress,116th congress,2 house,3 branches of government,achieved,achievements,advocate,advocates,advocating,...,sponsors,stacey abrams,subsidiary,us companies,use act,vote,vote of confidence,vote today,voted,voter
0,2019-02-15,7.59807e-06,3.49511e-05,4.24407e-05,6.24127e-05,1.09222e-05,2.73056e-05,0.000149791,4.99302e-05,1.24825e-05,...,5.46112e-06,1.31067e-05,7.82488e-06,1.04021e-05,1.17373e-05,0.000436889,,3.12995e-05,0.000227182,2.91259e-05
1,2019-02-16,3.07001e-05,,1.28864e-05,3.07001e-05,1.52603e-05,,0.000142337,4.74455e-05,1.58152e-05,...,1.14452e-05,2.71823e-05,1.44973e-05,9.31966e-06,1.28864e-05,0.000260951,,6.44322e-06,8.11846e-05,3.47934e-05
2,2019-02-17,1.12353e-05,,1.75856e-05,5.51549e-05,2.02235e-05,4.44916e-05,0.000242682,4.04469e-05,2.02235e-05,...,1.96617e-05,1.44453e-05,1.41564e-05,,1.41564e-05,0.000219088,8.08939e-06,,5.05587e-05,5.05587e-05
3,2019-02-18,9.66475e-06,2.73834e-05,4.54449e-05,6.16128e-05,1.06e-05,1.06e-05,0.000213591,6.57203e-05,3.28601e-05,...,1.23226e-05,5.86788e-06,1.17358e-05,1.91684e-05,2.05376e-05,0.000205376,5.86788e-06,5.86788e-06,1.82556e-05,2.28195e-05
4,2019-02-19,2.29852e-05,1.42289e-05,4.56011e-05,6.18222e-05,,3.3632e-05,0.000211655,3.73509e-05,1.24503e-05,...,1.06717e-05,4.1501e-06,5.10782e-06,9.05476e-06,1.66004e-05,0.000209165,,5.10782e-06,6.8149e-05,5.24223e-05


In [471]:
# Keep only the first 213 rows; later rows are discarded.
data = data.iloc[:213]


In [472]:
all_df_list = [data, dependents]


In [473]:
data.shape

(213, 694)

In [474]:
dependents.shape

(213, 5)

In [488]:
reg_df = pd.merge(data,dependents,on='Date',how='left')

In [489]:
reg_df = reg_df.fillna(0)

In [490]:
reg_df.head()

Unnamed: 0,Date,115th congress,116th congress,2 house,3 branches of government,achieved,achievements,advocate,advocates,advocating,...,use act,vote,vote of confidence,vote today,voted,voter,Hickenlooper_raw,Gardner_raw,Hickenlooper_adj,Gardner_adj
0,2019-02-15,8e-06,3.5e-05,4.2e-05,6.2e-05,1.1e-05,2.7e-05,0.00015,5e-05,1.2e-05,...,1.2e-05,0.000437,0.0,3.1e-05,0.000227,2.9e-05,0.431907,0.568093,0.474177,0.525823
1,2019-02-16,3.1e-05,0.0,1.3e-05,3.1e-05,1.5e-05,0.0,0.000142,4.7e-05,1.6e-05,...,1.3e-05,0.000261,0.0,6e-06,8.1e-05,3.5e-05,0.43149,0.56851,0.474032,0.525968
2,2019-02-17,1.1e-05,0.0,1.8e-05,5.5e-05,2e-05,4.4e-05,0.000243,4e-05,2e-05,...,1.4e-05,0.000219,8e-06,0.0,5.1e-05,5.1e-05,0.430931,0.569069,0.473836,0.526164
3,2019-02-18,1e-05,2.7e-05,4.5e-05,6.2e-05,1.1e-05,1.1e-05,0.000214,6.6e-05,3.3e-05,...,2.1e-05,0.000205,6e-06,6e-06,1.8e-05,2.3e-05,0.430737,0.569263,0.473768,0.526232
4,2019-02-19,2.3e-05,1.4e-05,4.6e-05,6.2e-05,0.0,3.4e-05,0.000212,3.7e-05,1.2e-05,...,1.7e-05,0.000209,0.0,5e-06,6.8e-05,5.2e-05,0.432308,0.567692,0.474318,0.525682


In [491]:
# Predictors: every term column of reg_df, i.e. everything except the date and the
# poll columns. (The name x_7 is kept from the rolled-data section.)
non_predictors = ['Date', 'Hickenlooper_raw', 'Hickenlooper_adj', 'Gardner_raw', 'Gardner_adj']
x_7 = reg_df.drop(columns=non_predictors)

# Targets: the *_adj column of each candidate.
Hickenlooper_adj = reg_df['Hickenlooper_adj']
Gardner_adj = reg_df['Gardner_adj']

# Random 70/30 split; the shared seed gives both targets the same row partition.
(x_train_Hickenlooper_adj, x_test_Hickenlooper_adj,
 y_train_Hickenlooper_adj, y_test_Hickenlooper_adj) = train_test_split(
    x_7, Hickenlooper_adj, test_size=0.3, random_state=42)
(x_train_Gardner_adj, x_test_Gardner_adj,
 y_train_Gardner_adj, y_test_Gardner_adj) = train_test_split(
    x_7, Gardner_adj, test_size=0.3, random_state=42)

In [492]:
# Hickenlooper_adj: fit on the training rows, then report R^2 on both partitions.
reg_Hickenlooper_adj = LinearRegression()
reg_Hickenlooper_adj.fit(x_train_Hickenlooper_adj, y_train_Hickenlooper_adj)
pred_train_Hickenlooper_adj = reg_Hickenlooper_adj.predict(x_train_Hickenlooper_adj)
pred_test_Hickenlooper_adj = reg_Hickenlooper_adj.predict(x_test_Hickenlooper_adj)
r2_train = round(r2_score(y_train_Hickenlooper_adj, pred_train_Hickenlooper_adj), 3)
r2_test = round(r2_score(y_test_Hickenlooper_adj, pred_test_Hickenlooper_adj), 3)
print('Hickenlooper_adj:: Train:', r2_train, ' Test:', r2_test)

# Gardner_adj: same procedure.
reg_Gardner_adj = LinearRegression()
reg_Gardner_adj.fit(x_train_Gardner_adj, y_train_Gardner_adj)
pred_train_Gardner_adj = reg_Gardner_adj.predict(x_train_Gardner_adj)
pred_test_Gardner_adj = reg_Gardner_adj.predict(x_test_Gardner_adj)
r2_train = round(r2_score(y_train_Gardner_adj, pred_train_Gardner_adj), 3)
r2_test = round(r2_score(y_test_Gardner_adj, pred_test_Gardner_adj), 3)
print('Gardner_adj:: Train:', r2_train, ' Test:', r2_test)



Hickenlooper_adj:: Train: 1.0  Test: 0.364
Gardner_adj:: Train: 1.0  Test: 0.364


In [495]:
# Chronological hold-out: the first N_TRAIN rows train, everything after tests.
# The targets are sliced from reg_df, the table x_7 was built from in this section,
# so predictors and targets cover exactly the same rows. roll_7 belongs to the earlier
# rolled-data analysis and must not be mixed with these predictors.
N_TRAIN = 200

Hickenlooper_adj_train = reg_df['Hickenlooper_adj'][:N_TRAIN]
Gardner_adj_train = reg_df['Gardner_adj'][:N_TRAIN]

Hickenlooper_adj_test = reg_df['Hickenlooper_adj'][N_TRAIN:]
Gardner_adj_test = reg_df['Gardner_adj'][N_TRAIN:]

x_7_train = x_7[:N_TRAIN]
x_7_test = x_7[N_TRAIN:]

In [496]:
x_7_test.describe()

Unnamed: 0,115th congress,116th congress,2 house,3 branches of government,achieved,achievements,advocate,advocates,advocating,affairs,...,sponsors,stacey abrams,subsidiary,us companies,use act,vote,vote of confidence,vote today,voted,voter
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,...,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,2e-06,1.6e-05,4.5e-05,0.000103,1e-05,1.8e-05,0.000195,4.6e-05,1.7e-05,0.0006,...,1.2e-05,7e-06,6e-06,8e-06,1.6e-05,0.000227,6e-06,8e-06,3.1e-05,2.9e-05
std,4e-06,1e-05,1.5e-05,6e-05,7e-06,1.6e-05,4.6e-05,1.5e-05,1.6e-05,9e-05,...,3e-06,7e-06,4e-06,5e-06,5e-06,7e-05,3e-06,5e-06,1.5e-05,1.4e-05
min,0.0,0.0,1.9e-05,3.7e-05,0.0,0.0,0.00014,2.3e-05,0.0,0.000449,...,6e-06,0.0,0.0,0.0,7e-06,0.000123,0.0,0.0,5e-06,1.4e-05
25%,0.0,1.5e-05,3.6e-05,8.3e-05,0.0,1.1e-05,0.000148,3.4e-05,1.4e-05,0.000569,...,9e-06,0.0,6e-06,5e-06,1.3e-05,0.000182,6e-06,6e-06,2.1e-05,1.9e-05
50%,0.0,1.7e-05,4.6e-05,9.2e-05,1.1e-05,1.6e-05,0.000182,4.4e-05,1.6e-05,0.000628,...,1.2e-05,6e-06,6e-06,9e-06,1.5e-05,0.000211,6e-06,6e-06,2.8e-05,2.6e-05
75%,0.0,1.8e-05,5.6e-05,0.000124,1.4e-05,1.9e-05,0.00024,5.6e-05,1.7e-05,0.000643,...,1.5e-05,7e-06,9e-06,1.1e-05,1.9e-05,0.000275,9e-06,1e-05,4.2e-05,3.7e-05
max,1e-05,3.9e-05,6.9e-05,0.000268,2.1e-05,6e-05,0.00025,6.6e-05,5.4e-05,0.000747,...,1.8e-05,1.9e-05,1.4e-05,1.8e-05,2.8e-05,0.000385,1.1e-05,1.6e-05,5.9e-05,6.2e-05


In [498]:
# Baseline model: Gardner_adj regressed on every term column plus an intercept.
Gardner_adj_baseline = sm.add_constant(x_7)

# With at least as many coefficients as observations the fit is saturated: zero residual
# degrees of freedom, R-squared of 1 and undefined standard errors. Say so explicitly
# rather than leaving only the divide-by-zero warnings in the output.
n_obs, n_params = Gardner_adj_baseline.shape
if n_obs <= n_params:
    print('WARNING: {} coefficients for {} observations - the OLS fit is saturated '
          'and its statistics are not meaningful.'.format(n_params, n_obs))

mod = sm.OLS(Gardner_adj, Gardner_adj_baseline)
res = mod.fit()
print(res.summary())


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



                            OLS Regression Results                            
Dep. Variable:            Gardner_adj   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                     0.000
Date:                Mon, 18 Nov 2019   Prob (F-statistic):                nan
Time:                        15:28:20   Log-Likelihood:                 7047.1
No. Observations:                 213   AIC:                        -1.367e+04
Df Residuals:                       0   BIC:                        -1.295e+04
Df Model:                         212                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------


divide by zero encountered in true_divide


invalid value encountered in double_scalars


divide by zero encountered in double_scalars


divide by zero encountered in double_scalars


invalid value encountered in multiply


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in less_equal

