In [569]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV

In [570]:
# Raw CSV sources hosted in the project's GitHub repository.
url1 = "https://raw.githubusercontent.com/FeWach/Election_Outcome_Prediction_Model/main/usa_indicators.csv"
url2 = "https://raw.githubusercontent.com/FeWach/Election_Outcome_Prediction_Model/main/usa_gallup.csv"
url3 = "https://raw.githubusercontent.com/FeWach/Election_Outcome_Prediction_Model/main/usa_elections_narrow.csv"

# Read the three tables in the same order as the URLs above.
indicators_usa, gallup_usa, narrow_usa = (
    pd.read_csv(source) for source in (url1, url2, url3)
)

In [571]:
# Derive the working tables from the raw downloads without modifying them:
# DataFrame.drop returns a new frame, so the raw *_usa frames stay untouched
# and no separate .copy() step is required.
#
# The former `elections = elections_usa.copy()` / `elections.drop(...)` steps
# were removed: no cell shown in this notebook loads `elections_usa`, so they
# raised NameError on a fresh kernel, and `elections` is not used by any of
# the cells shown here.
indicators = indicators_usa.drop(columns='country')
gallup = gallup_usa.drop(columns='country')
narrow = narrow_usa.drop(columns='country')

##

U.S. inflation data is available going back to 1803. However, we only train the model on years with sufficient data: `narrow` is the table restricted to the period since 1892, chosen to include as much reliable data as possible.

##

In [573]:
# Inner-join the election table with the indicator table on their shared
# 'year' column, then preview the first rows of the result.
merged_narrow = narrow.merge(indicators, how='inner', on='year')
merged_narrow.head()

Unnamed: 0,id,year,outgoing,majority_change,gallup_prediction,result,GDP_growth,unemployment_rate,inflation_rate,approval_rating_incumbent,war_period,major_events
0,1,1892,0,1,,1,,3.0,0.0,,0,0
1,2,1896,0,1,,1,,14.4,0.0,,0,0
2,3,1900,0,0,,0,,5.0,0.0,,1,0
3,4,1904,0,0,,0,,5.4,0.0,,0,0
4,5,1908,1,0,,0,,8.0,-3.6,,0,0


## Gallup Prediction Accuracy

Calculation of the performance of the Gallup predictions, which will be the benchmark for our model to beat.

In [574]:
# Select the two columns instead of popping them: `pop` removed the columns
# from `gallup`, so running this cell a second time raised KeyError.
true = gallup['result']             # actual outcome
pred = gallup['gallup_prediction']  # outcome predicted by Gallup

In [575]:
# Share of elections for which the Gallup prediction matched the actual result.
accuracy_score(y_true=true, y_pred=pred)

0.8181818181818182

## Training the model

In [576]:
# Target: the 'result' column.
y = merged_narrow['result']
# Features: every remaining column except the id, the year, the Gallup
# prediction and the target itself.
X = merged_narrow.drop(columns=['id', 'year', 'gallup_prediction', 'result'])

In [577]:
# Three-way split with fixed seeds:
#   1. hold out 20% of all rows as the test set;
#   2. split the remainder again, keeping 20% of it as the validation set that
#      is used for calibration further down.
# The intermediate split gets its own names (X_trainval / y_trainval). The
# previous version reassigned X_train / y_train from themselves in a separate
# cell, so re-running that cell shrank the training set a little more each
# time. The resulting X_train, X_val and X_test are unchanged.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=123
)

In [579]:
# Building blocks of the pipeline: a random forest without a depth limit
# (seeded for reproducibility), feature standardisation, and mean imputation
# of missing values.
rfor = RandomForestClassifier(random_state=42, max_depth=None)
scaler = StandardScaler()
imputer = SimpleImputer(strategy="mean")

In [580]:
# Chain the steps in order: impute -> scale -> classify.
pipe = make_pipeline(imputer, scaler, rfor)
# Ask the transformers to emit pandas DataFrames; set_output configures the
# pipeline in place and returns the same pipeline object.
pipe.set_output(transform='pandas')
pipe

In [581]:
# Fit the pipeline on the training split. `fit` returns the pipeline itself,
# so `model` and `pipe` refer to the same fitted object.
pipe.fit(X_train, y_train)
model = pipe

In [582]:
# Predict the held-out test split with the uncalibrated pipeline.
prediction = model.predict(X=X_test)

In [583]:
# Test-set accuracy of the uncalibrated pipeline.
accuracy_score(y_test, prediction)

0.5714285714285714

## Calibrating the model

In [584]:
# Fit the base pipeline on the training split.
model.fit(X_train, y_train)

# Wrap the already-fitted pipeline (cv='prefit') and fit the calibrator on the
# validation split. `fit` returns the calibrator itself, so `calib_model` and
# `calibrator` are the same object.
calibrator = CalibratedClassifierCV(model, cv='prefit')
calibrator.fit(X_val, y_val)
calib_model = calibrator

# Evaluate on the test split. Note that despite its name, `val_pred` holds the
# predictions for X_test, not for the validation split.
val_pred = calib_model.predict(X_test)

In [585]:
# Test-set accuracy of the calibrated model.
accuracy_score(y_test, val_pred)

0.8571428571428571

## Pickling the model

In [468]:
# Serialise the calibrated model to a file in the current working directory.
with open("calib_election_model.pckl", mode="wb") as model_file:
    pickle.dump(calib_model, model_file)

In [None]:
# Optional reload of a previously pickled model, left disabled.
# NOTE(review): this reads "election_model.pckl", whereas the cell above writes
# "calib_election_model.pckl" — confirm which file is intended before enabling.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary
# code.
# with open("election_model.pckl", "rb") as p:
#    model = pickle.load(p)

## End of pickling

## Applying the model to the original full dataset

In [594]:
# Download the full table and keep an untouched copy of it in
# `pred_test_table_raw`.
final_url = "https://raw.githubusercontent.com/FeWach/Election_Outcome_Prediction_Model/main/usa_final.csv"
pred_test_table_raw = pd.read_csv(final_url)

# Model input: the raw table without the 'country', 'year' and 'id' columns.
pred_test_table = pred_test_table_raw.drop(columns=['country', 'year', 'id'])

In [595]:
# Predict every row of the full table with the calibrated model.
prediction_final = calib_model.predict(X=pred_test_table)

In [596]:
# Convert the predictions to a plain list (one entry per row) and display it.
prediction_final_list = [label for label in prediction_final]
prediction_final_list

[1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1]

In [597]:
# Store the predictions as a new 'prediction' column on the full table.
# Note: this modifies `pred_test_table_raw` in place, so it no longer holds
# only the data read from the CSV.
pred_test_table_raw['prediction'] = prediction_final_list

In [598]:
# Display the full table, including the 'prediction' column.
pred_test_table_raw

Unnamed: 0,id,country,year,outgoing,majority_change,GDP_growth,unemployment_rate,inflation_rate,approval_rating_incumbent,war_period,major_events,prediction
0,1,USA,1892,0,1,3.31,3.0,0.0,50.75,0,0,1
1,2,USA,1896,0,1,3.31,14.4,0.0,50.75,0,0,1
2,3,USA,1900,0,0,3.31,5.0,0.0,50.75,1,0,0
3,4,USA,1904,0,0,3.31,5.4,0.0,50.75,0,0,0
4,5,USA,1908,1,0,3.31,8.0,-3.6,50.75,0,0,0
5,6,USA,1912,1,1,3.31,4.6,3.6,50.75,0,0,1
6,7,USA,1916,0,0,3.31,5.1,7.9,50.75,1,0,0
7,8,USA,1920,1,1,3.31,5.2,15.6,50.75,0,0,1
8,9,USA,1924,0,0,3.75,5.0,0.0,50.75,0,0,0
9,10,USA,1928,0,0,1.1,4.2,-1.7,50.75,0,0,0


In [599]:
# Build the comparison tables from a derived frame instead of writing the
# predictions into the raw `narrow_usa` download. `assign` returns a new frame
# and `.copy()` makes the two sub-tables independent, so later edits to them
# cannot trigger pandas' SettingWithCopyWarning.
# NOTE(review): predictions are matched to rows by position — this assumes
# usa_final.csv and usa_elections_narrow.csv list the same elections in the
# same order; confirm.
comparison = narrow_usa.assign(prediction=prediction_final)
model_performance = comparison[["id", "prediction"]].copy()
pred_true_comparison = comparison[["id", "prediction", "result"]].copy()

In [600]:
# Display the predicted outcome per election id.
model_performance

Unnamed: 0,id,prediction
0,1,1
1,2,1
2,3,0
3,4,0
4,5,0
5,6,1
6,7,0
7,8,1
8,9,0
9,10,0


In [601]:
# Display predicted and actual outcome side by side per election id.
pred_true_comparison

Unnamed: 0,id,prediction,result
0,1,1,1
1,2,1,1
2,3,0,0
3,4,0,0
4,5,0,0
5,6,1,1
6,7,0,0
7,8,1,1
8,9,0,0
9,10,0,0


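## Accuracy of calibrated model

Note: this score is computed on the full dataset, which appears to include the rows the model was trained and calibrated on, so it is likely optimistic. The held-out test accuracy reported above is the better estimate of out-of-sample performance.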

In [602]:
# Select the columns instead of popping them: `pop` removed 'result' from the
# raw `narrow_usa` frame (and 'prediction' from `model_performance`), so the
# cells above could not be re-run without reloading the data.
true_model = narrow_usa['result']
pred_model = model_performance['prediction']

In [603]:
# Accuracy of the calibrated model over all rows of the full table.
accuracy_score(y_true=true_model, y_pred=pred_model)

0.9090909090909091