In [63]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Step 1: Load the dataset into a pandas dataframe
df = pd.read_csv('/Users/deadrienhill/Downloads/superheroes_nlp_dataset.csv')

# Step 2: Data cleaning and preprocessing
# Remove missing values
df.dropna(inplace=True)

# Convert the 'strength_score' column to float
df['strength_score'] = df['strength_score'].astype(float)

# Step 3: Split the dataset into training, validation, and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Step 4: Convert the text data into numerical vectors using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train = tfidf_vectorizer.fit_transform(train_df['history_text'])
y_train = train_df['strength_score'].values
X_val = tfidf_vectorizer.transform(val_df['history_text'])
y_val = val_df['strength_score'].values

# Step 5: Train and evaluate different machine learning models
# Logistic Regression
lr_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('lr', LogisticRegression())
])
lr_pipeline.fit(train_df['history_text'], train_df['strength_score'])
lr_preds = lr_pipeline.predict(val_df['history_text'])
lr_mse = mean_squared_error(val_df['strength_score'], lr_preds)
print("Logistic Regression MSE: ", lr_mse)

# Random Forest Regressor
rf_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('rf', RandomForestRegressor())
])
rf_pipeline.fit(train_df['history_text'], train_df['strength_score'])
rf_preds = rf_pipeline.predict(val_df['history_text'])
rf_mse = mean_squared_error(val_df['strength_score'], rf_preds)
print("Random Forest MSE: ", rf_mse)

# Neural Network
nn_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('nn', MLPRegressor())
])
nn_pipeline.fit(train_df['history_text'], train_df['strength_score'])
nn_preds = nn_pipeline.predict(val_df['history_text'])
nn_mse = mean_squared_error(val_df['strength_score'], nn_preds)
print("Neural Network MSE: ", nn_mse)

# Step 6: Fine-tune the best performing model using hyperparameter tuning
# Fine-tune the Random Forest Regressor using Grid Search
rf_params = {
    'rf__n_estimators': [10, 50, 100],
    'rf__max_depth': [None, 10]}


Logistic Regression MSE:  3612.5
Random Forest MSE:  1385.7031249999998
Neural Network MSE:  1651.7807083077566


