In [7]:
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score  # Include accuracy_score for evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV,ShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine, inspect
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold
import threading

In [8]:
pres_gdp = pd.read_csv("president_gdp-94.csv")
pres_terms = pd.read_csv("president_terms-94.csv",encoding="UTF-8")

In [9]:
pres_gdp.columns = pres_gdp.columns.str.lower()
pres_gdp.columns = pres_gdp.columns.str.replace(" ","_")

In [10]:
pres_terms.columns = pres_terms.columns.str.lower()
pres_terms.columns = pres_terms.columns.str.replace(" ","_")

In [11]:
# PostgreSQL connection parameters
# postgres_params = {
#     "database": "postgres", # Insert the name of database
#     "user": "postgres",     # Insert username if different
#     "password": "!QAZ1qaz",        # Insert your password
#     "host": "localhost",    # Replace with your PostgreSQL host if not local
#     "port": 5432,
#     "options": "-c search_path=dbo,public"}

# Establish a connection to PostgreSQL
# conn_str = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(**postgres_params)
# postgres_engine = create_engine(conn_str)

# Write DataFrame to PostgreSQL
# pres_gdp.to_sql('pres_gdp', con=postgres_engine, index=False, if_exists='replace')
# pres_terms.to_sql('pres_terms', con=postgres_engine, index=False, if_exists='replace')

In [12]:
# query="""
# select * from pres_terms;
# """
# pres_terms = pd.read_sql(sql= query, con= postgres_engine)

In [13]:
# query1="""
# select * from pres_gdp;
# """
# pres_gdp= pd.read_sql(sql= query1, con= postgres_engine)

In [14]:
cat_c = ['Year', 'Name', 'VP', 'BDay', 'Party', 'sign', 'BMonth', 'GDP Growth', 'Population Growth']
cat_c = [x.lower() for x in cat_c]
cat_c = [x.replace(" ","_") for x in cat_c]

In [15]:
num_c = ['Year', 'GDP Percent Growth', 'Population Percent Growth']
num_c = [x.lower() for x in num_c]
num_c = [x.replace(" ","_") for x in num_c]

In [24]:


# Assume pres_terms, pres_gdp, cat_c, num_c are already defined

# Step 1: Expand the 'Year' column for the president's terms
expanded_rows = []
for i, row in pres_terms.iterrows():
    for year in range(row['start'], row['stop'] + 1):
        row_data = row.to_dict()
        row_data['year'] = year
        expanded_rows.append(row_data)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Step 2: Merge the two DataFrames on the 'Year' column
expanded_df = pd.merge(expanded_df, pres_gdp, on="year", how="left")

# Step 3: Separate categorical and numerical columns
categorical_df = expanded_df[cat_c].set_index('year')
numerical_df = expanded_df[num_c].set_index('year')

# Step 4: Label encode the categorical columns (without one-hot encoding)
label_encoders = {}
for column in categorical_df.columns:
    le = LabelEncoder()
    categorical_df[column] = le.fit_transform(categorical_df[column])
    label_encoders[column] = le
    joblib.dump(le, f"{column}_encoder.pkl")

# Step 5: Merge categorical and numerical DataFrames
processed_df = pd.merge(categorical_df, numerical_df, left_index=True, right_index=True)

# Define which columns are categorical and which are continuous
categorical_columns = ['sign', 'vp', 'name']
continuous_columns = ['gdp_growth', 'population_growth', 'gdp_percent_growth', 'population_percent_growth']

# Separate target columns into categorical and continuous
y_categorical = processed_df[categorical_columns]
y_continuous = processed_df[continuous_columns]

# Define feature columns (X)
X = processed_df[['bday', 'bmonth', 'party']]

# Step 6: Perform non-stratified splitting using ShuffleSplit
split = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for train_idx, test_idx in split.split(X):
    X_train_strat, X_test_strat = X.iloc[train_idx], X.iloc[test_idx]
    y_train_categorical_strat, y_test_categorical_strat = y_categorical.iloc[train_idx], y_categorical.iloc[test_idx]
    y_train_continuous_strat, y_test_continuous_strat = y_continuous.iloc[train_idx], y_continuous.iloc[test_idx]

# Step 7: Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_strat)
X_test_scaled = scaler.transform(X_test_strat)

# Step 8: Define hyperparameters for RandomForestClassifier and RandomForestRegressor
param_grid_classifier = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

param_grid_regressor = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4]
}

# Step 9: Initialize RandomForest models for categorical and continuous targets
multi_output_classifier = MultiOutputClassifier(RandomForestClassifier(random_state=42))
multi_output_regressor = MultiOutputRegressor(RandomForestRegressor(random_state=42))

# Step 10: Use ShuffleSplit for cross-validation
split_cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Step 11: Define training functions that return the trained models
def train_classifier():
    # Hyperparameter search for RandomForestClassifier
    grid_search_classifier = RandomizedSearchCV(
        multi_output_classifier,
        param_distributions=param_grid_classifier,
        n_iter=10,
        cv=split_cv,  # ShuffleSplit for cross-validation
        verbose=2,
        n_jobs=-1
    )
    grid_search_classifier.fit(X_train_scaled, y_train_categorical_strat)
    print("Classifier training completed.")
    # Save the classifier model
    joblib.dump(grid_search_classifier.best_estimator_, 'multi_output_classifier.pkl')
    return grid_search_classifier

def train_regressor():
    # Hyperparameter search for RandomForestRegressor
    grid_search_regressor = RandomizedSearchCV(
        multi_output_regressor,
        param_distributions=param_grid_regressor,
        n_iter=10,
        cv=split_cv,  # ShuffleSplit for cross-validation
        verbose=2,
        n_jobs=-1
    )
    grid_search_regressor.fit(X_train_scaled, y_train_continuous_strat)
    print("Regressor training completed.")
    # Save the regressor model
    joblib.dump(grid_search_regressor.best_estimator_, 'multi_output_regressor.pkl')
    return grid_search_regressor

# Call the training functions and store the trained models
grid_search_classifier = train_classifier()
grid_search_regressor = train_regressor()

# Step 15: Evaluate the models
y_pred_categorical = grid_search_classifier.predict(X_test_scaled)
y_pred_continuous = grid_search_regressor.predict(X_test_scaled)

# Step 16: Accuracy and Error Calculation
accuracy_scores = {}
mse_scores = {}

# Evaluate categorical targets (classification)
for i, target in enumerate(categorical_columns):
    y_true = y_test_categorical_strat[target]
    y_pred_target = y_pred_categorical[:, i]
    accuracy = accuracy_score(y_true, y_pred_target)
    accuracy_scores[target] = accuracy
    print(f"Accuracy for {target}: {accuracy:.4f}")
    print(f"Classification report for {target}:\n", classification_report(y_true, y_pred_target, zero_division=0))

# Evaluate continuous targets (regression)
for i, target in enumerate(continuous_columns):
    y_true = y_test_continuous_strat[target]
    y_pred_target = y_pred_continuous[:, i]
    mse = mean_squared_error(y_true, y_pred_target)
    mse_scores[target] = mse
    print(f"Mean Squared Error for {target}: {mse:.4f}")

# Optional: You can print overall results for better summary
print("\nOverall Accuracy Scores for Categorical Targets:")
for target, accuracy in accuracy_scores.items():
    print(f"{target}: {accuracy:.4f}")

print("\nOverall Mean Squared Errors for Continuous Targets:")
for target, mse in mse_scores.items():
    print(f"{target}: {mse:.4f}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Classifier training completed.
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Regressor training completed.
Accuracy for sign: 1.0000
Classification report for sign:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00        13
           4       1.00      1.00      1.00         7
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00         6
           7       1.00      1.00      1.00        16
           8       1.00      1.00      1.00         7
           9       1.00      1.00      1.00        13
          10       1.00      1.00      1.00        13
          11       1.00      1.00      1.00         2

    accuracy                           1.00       101
