# In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import joblib

# Load the dataset.
df = pd.read_csv('dummy_df-94.csv')

# The three prediction targets: the binned GDP growth category plus the
# President and VP indicator columns. Defined once so the feature matrix and
# the target matrix can never disagree about which columns are targets.
# NOTE(review): the 'Name_*' / 'VP_*' columns look like one-hot indicators
# produced upstream -- confirm against the CSV.
target_columns = ['GDP Growth Category', 'Name_Abraham Lincoln', 'VP_John Adams']

# Bucket GDP growth into three equal-width bins over the observed range and
# store the integer category codes (0 = Low, 1 = Medium, 2 = High).
gdp_growth_bins = pd.cut(df['GDP Percent Growth'], bins=3, labels=['Low', 'Medium', 'High'])
df['GDP Growth Category'] = gdp_growth_bins.cat.codes

# A missing GDP growth value falls into no bin, and `.cat.codes` encodes that
# as -1. Left in place, the classifier would silently learn -1 as a fourth
# growth class, so rows without a valid category are excluded from training.
df = df[df['GDP Growth Category'] != -1]

# Feature matrix: everything except the targets and the raw GDP growth value
# the category was derived from (keeping it would leak the label).
# NOTE(review): 'Population Percent Growth' is also excluded, as in the
# original script; the reason is not documented -- confirm it is intentional.
X = df.drop(columns=target_columns + ['GDP Percent Growth', 'Population Percent Growth'])

# Target matrix: one column per output of the multi-output classifier.
y = df[target_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit one independent logistic regression per target column.
classifier = MultiOutputClassifier(LogisticRegression(max_iter=500))
classifier.fit(X_train_scaled, y_train)

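# Save the trained model (currently disabled).
# NOTE: `classifier` was fitted on standardized features, so `scaler` must be
# persisted alongside it for the saved model to be usable on raw inputs.
# model_path = 'GDP_President_VP_model.pkl'
# joblib.dump(classifier, model_path)

# Return the path to the saved model for download
# model_path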



'GDP_President_VP_model.pkl'