In [4]:
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score  # Include accuracy_score for evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import joblib


# Step 1: Import CSV files
pres_gdp = pd.read_csv("president_gdp-94.csv")
pres_terms = pd.read_csv("president_terms-94.csv", encoding="UTF-8")

# Step 2: Expand the 'Year' column for the president's terms
expanded_rows = []
for i, row in pres_terms.iterrows():
    for year in range(row['Start'], row['Stop']):
        row_data = row.to_dict()  # Convert row to a dictionary
        row_data['Year'] = year   # Add the expanded year to the dictionary
        expanded_rows.append(row_data)

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Step 3: Merge the two DataFrames on the 'Year' column
expanded_df = pd.merge(expanded_df, pres_gdp, on="Year")

# Step 4: Separate categorical and numerical columns
categorical_df = expanded_df[['Year', 'Name', 'VP', 'BDay', 'Party', 'sign', 'BMonth', 'GDP Growth', 'Population Growth']].set_index('Year')
numerical_df = expanded_df[['Year', 'GDP Percent Growth', 'Population Percent Growth']].set_index('Year')

# Step 5: Label encode the categorical columns (without one-hot encoding)
label_encoders = {}
for column in categorical_df.columns:
    le = LabelEncoder()
    categorical_df[column] = le.fit_transform(categorical_df[column])
    label_encoders[column] = le
    # Save the label encoders as .pkl files
    joblib.dump(le, f"{column}_encoder.pkl")  # Save each encoder to a .pkl file


# Step 6: Merge categorical and numerical DataFrames
processed_df = pd.merge(categorical_df, numerical_df, left_index=True, right_index=True)


# Define which columns are categorical and which are continuous
categorical_columns = ['sign', 'VP', 'Name']
continuous_columns = ['GDP Growth', 'Population Growth', 'GDP Percent Growth', 'Population Percent Growth']

# Separate target columns into categorical and continuous
y_categorical = processed_df[categorical_columns]
y_continuous = processed_df[continuous_columns]

# Define feature columns (X)
X = processed_df[['BDay', 'BMonth', 'Party']]

# Split data into train and test sets
X_train, X_test, y_train_categorical, y_test_categorical = train_test_split(X, y_categorical, test_size=0.2, random_state=42)
_, _, y_train_continuous, y_test_continuous = train_test_split(X, y_continuous, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models for categorical and continuous targets
multi_output_classifier = MultiOutputClassifier(LogisticRegression())
multi_output_regressor = MultiOutputRegressor(RandomForestRegressor())

# Train the models
multi_output_classifier.fit(X_train_scaled, y_train_categorical)
multi_output_regressor.fit(X_train_scaled, y_train_continuous)

# Make predictions on test data
y_pred_categorical = multi_output_classifier.predict(X_test_scaled)
y_pred_continuous = multi_output_regressor.predict(X_test_scaled)

# Initialize a dictionary to store the accuracy scores for each target column
accuracy_scores = {}

# Loop through each categorical target column and calculate the accuracy
for i, target in enumerate(categorical_columns):
    print(f"Evaluating model for {target}...")
    
    # Get the true values and predicted values for the current target
    y_true = y_test_categorical[target]
    y_pred_target = y_pred_categorical[:, i]
    
    # Calculate accuracy score
    accuracy = accuracy_score(y_true, y_pred_target)
    accuracy_scores[target] = accuracy
    
    # Print classification report (for categorical outputs like 'sign', 'VP', and 'Name')
    print(classification_report(y_true, y_pred_target))

# Loop through each continuous target column and calculate the mean squared error
for i, target in enumerate(continuous_columns):
    print(f"Evaluating model for {target}...")
    
    # Get the true values and predicted values for the current target
    y_true = y_test_continuous[target]
    y_pred_target = y_pred_continuous[:, i]
    
    # Calculate mean squared error
    mse = mean_squared_error(y_true, y_pred_target)
    print(f"Mean Squared Error for {target}: {mse:.4f}")

# Print overall accuracy scores for categorical targets
print("Accuracy Scores for each categorical target:")
for target, score in accuracy_scores.items():
    print(f"{target}: {score:.4f}")

# Save the models
joblib.dump(multi_output_classifier, 'multi_output_classifier.pkl')
joblib.dump(multi_output_regressor, 'multi_output_regressor.pkl')

Evaluating model for sign...
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.33      0.20      0.25         5
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         4
           4       0.80      1.00      0.89         4
           5       1.00      1.00      1.00         4
           6       0.71      1.00      0.83         5
           7       0.41      1.00      0.58         7
           8       0.00      0.00      0.00         1
           9       0.44      0.50      0.47         8
          10       1.00      0.25      0.40         4
          11       0.00      0.00      0.00         1

    accuracy                           0.56        48
   macro avg       0.43      0.44      0.40        48
weighted avg       0.51      0.56      0.49        48

Evaluating model for VP...
              precision    recall  f1-score   support

           0       0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

['multi_output_regressor.pkl']