In [4]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.linear_model as LM
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
import sys
sys.path.append('..')
from utils.features import FeatureEngineering as FE
from utils.model_eval import ModelEvaluation

# Load the training set twice: `training_data` stays raw for the baseline
# models, while `featureEngineer_data` is handed to the feature-engineering helper.
TRAIN_CSV = '../data/train.csv'
training_data = pd.read_csv(TRAIN_CSV)
featureEngineer_data = pd.read_csv(TRAIN_CSV)

# Project helpers imported from ../utils (cross-validation report, feature engineering).
modelClass = ModelEvaluation()
FEClass = FE()

# The return value is discarded, so the helper presumably adds its columns to
# `featureEngineer_data` in place -- TODO confirm in utils/features.py.
FEClass.run_feature_engineering(
    featureEngineer_data,
    ratios=False,
    scaling='standard',
    decade=False,
)

# Candidate predictors: every column except the target 'Lead'.
features = featureEngineer_data.drop(columns='Lead')

## The model with given features

In [6]:
# Model Given Features
# Baseline: logistic regression on the raw columns exactly as provided.
# The .loc label slice is end-inclusive, so 'Age Co-Lead' itself is kept.
attributes = training_data.loc[:, :'Age Co-Lead']
lead = training_data['Lead']
modelGivenFeatures = LM.LogisticRegression(max_iter=5000)

# Cross Validation
# Encode the target explicitly (Male -> 1, Female -> 0). Series.map does not
# depend on the implicit object -> int downcast that Series.replace performed
# (deprecated in newer pandas); astype fails loudly on any unexpected label,
# because unmapped values become NaN.
lead_binary = lead.map({"Male": 1, "Female": 0}).astype('int64')
modelClass.cross_val(modelGivenFeatures, attributes, lead_binary, 5)  # cross_val function all the same in the group

----------- Cross-validation report -----------

Model: LogisticRegression(max_iter=5000)

Feature set: ['Number words female', 'Total words', 'Number of words lead', 'Difference in words lead and co-lead', 'Number of male actors', 'Year', 'Number of female actors', 'Number words male', 'Gross', 'Mean Age Male', 'Mean Age Female', 'Age Lead', 'Age Co-Lead']

Number of folds: 5

Performance:
- Accuracy: 0.871 (avg), 0.846 (min), 0.909 (max)
- Accuracy, 95.0 % confidence interval: 0.777-0.965
- Accuracy, female: 0.618 (avg), 0.529 (min), 0.700 (max)
- Accuracy, male: 0.953 (avg), 0.924 (min), 0.994 (max)
- Training accuracy: 0.879 (avg), 0.871 (min), 0.884 (max)
---------------------------------------------



## The model with the initially selected features

In [7]:
# Model Initial Selection Features
# Select the hand-picked columns by name rather than by position, so the
# selection cannot silently shift if the CSV column order changes. These are
# the columns previously addressed as positions 1, 2, 3, 4, 6, 11 and 12.
initial_selection = [
    'Total words',
    'Number of words lead',
    'Difference in words lead and co-lead',
    'Number of male actors',
    'Number of female actors',
    'Age Lead',
    'Age Co-Lead',
]
attributesIS = training_data.loc[:, initial_selection]
lead = training_data['Lead']
modelInitialSelection = LM.LogisticRegression(max_iter=5000)

# Cross Validation
# Encode the target explicitly (Male -> 1, Female -> 0). Series.map does not
# depend on the implicit object -> int downcast that Series.replace performed
# (deprecated in newer pandas); astype fails loudly on any unexpected label,
# because unmapped values become NaN.
lead_binary = lead.map({"Male": 1, "Female": 0}).astype('int64')
modelClass.cross_val(modelInitialSelection, attributesIS, lead_binary, 5)  # cross_val function all the same in the group

----------- Cross-validation report -----------

Model: LogisticRegression(max_iter=5000)

Feature set: ['Total words', 'Number of words lead', 'Difference in words lead and co-lead', 'Number of male actors', 'Number of female actors', 'Age Lead', 'Age Co-Lead']

Number of folds: 5

Performance:
- Accuracy: 0.807 (avg), 0.797 (min), 0.822 (max)
- Accuracy, 95.0 % confidence interval: 0.713-0.902
- Accuracy, female: 0.437 (avg), 0.353 (min), 0.529 (max)
- Accuracy, male: 0.927 (avg), 0.885 (min), 0.975 (max)
- Training accuracy: 0.804 (avg), 0.800 (min), 0.809 (max)
---------------------------------------------



## Find the optimal features using RFE

In [8]:
# Feature engineering
# Reference run: cross-validate on the full engineered feature set first.
modelBest = LM.LogisticRegression(max_iter=5000)
lead = training_data['Lead']
# Explicit target encoding (Male -> 1, Female -> 0); astype fails loudly if a
# label outside the mapping turned into NaN.
lead_binary = lead.map({"Male": 1, "Female": 0}).astype('int64')

print("Features Before: ", features.shape[1])
modelClass.cross_val(modelBest, features, lead_binary, 5)  # cross_val function all the same in the group

# RFE
modelRFE = LM.LogisticRegression(max_iter=5000)
featuresToSelect = 14  # Found through trial and error
rfe = RFE(estimator=modelRFE, n_features_to_select=featuresToSelect)
rfe = rfe.fit(features, lead_binary)

# support_ is the boolean mask of retained features (the ones ranked 1), so
# the column positions can be read off directly instead of scanning ranking_.
bestFeatures = [position for position, kept in enumerate(rfe.support_) if kept]
print("Features After : ", len(bestFeatures))
newFeatures = features.iloc[:, bestFeatures]

# NOTE(review): the selection above was fitted on every row, so the
# cross-validated score below is likely optimistic -- consider running the
# selection inside each fold if cross_val can accept a pipeline (TODO confirm).
modelClass.cross_val(modelRFE, newFeatures, lead_binary, 5)  # cross_val function all the same in the group

Features Before:  48
----------- Cross-validation report -----------

Model: LogisticRegression(max_iter=5000)

Feature set: ['Number words female', 'Total words', 'Number of words lead', 'Difference in words lead and co-lead', 'Number of male actors', 'Year', 'Number of female actors', 'Number words male', 'Gross', 'Mean Age Male', 'Mean Age Female', 'Age Lead', 'Age Co-Lead', 'Total actors', 'Female word share', 'Male word share', 'Lead word share', 'Female actor share', 'Male actor share', 'Decade', 'Difference Words Gender', 'Difference Actors', 'Difference Age Lead', 'Difference Mean Age', 'Yearly mean Number words female', 'Yearly mean Total words', 'Yearly mean Number of words lead', 'Yearly mean Difference in words lead and co-lead', 'Yearly mean Number of male actors', 'Yearly mean Number of female actors', 'Yearly mean Number words male', 'Yearly mean Gross', 'Yearly mean Mean Age Male', 'Yearly mean Mean Age Female', 'Yearly mean Age Lead', 'Yearly mean Age Co-Lead', 'Yearly

## Hyperparameter tuning with grid search

In [9]:
# Grid Search
# Fixed seed: the liblinear, sag and saga solvers shuffle the data, so without
# a random_state the selected hyperparameters can differ between runs.
GRID_SEED = 0

# "No regularisation" is spelled 'none' in scikit-learn < 1.2 and None from
# 1.2 onwards (the string form was removed in 1.4); pick the spelling that the
# installed version accepts. Non-numeric version parts are treated as 0.
sk_major, sk_minor = (
    int(part) if part.isdigit() else 0
    for part in sklearn.__version__.split('.')[:2]
)
no_penalty = None if (sk_major, sk_minor) >= (1, 2) else 'none'

modelGrid = LM.LogisticRegression(max_iter=5000, random_state=GRID_SEED)
C_values = np.logspace(-4, 4, 20)
# One sub-grid per penalty, because each penalty is only supported by some solvers.
grid_parameters = [
    {'penalty': ['l1'], 'C': C_values, 'solver': ['liblinear', 'saga']},
    {'penalty': ['l2'], 'C': C_values, 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {'penalty': [no_penalty], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
    {'penalty': ['elasticnet'], 'l1_ratio': np.linspace(0, 1, 10), 'C': C_values, 'solver': ['saga']},
]

grid_search = GridSearchCV(modelGrid, param_grid=grid_parameters, n_jobs=-1, scoring='accuracy', verbose=1)
# Explicit target encoding (Male -> 1, Female -> 0); astype fails loudly if a
# label outside the mapping turned into NaN.
lead_binary = lead.map({"Male": 1, "Female": 0}).astype('int64')
grid_search.fit(newFeatures, lead_binary)  # Fit with new features
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

# best_params_ only contains the searched keys, so the seed and iteration cap
# are passed again when rebuilding the winning model for the shared report.
modelBestGrid = LM.LogisticRegression(**grid_search.best_params_, max_iter=5000, random_state=GRID_SEED)
modelClass.cross_val(modelBestGrid, newFeatures, lead_binary, 5)  # cross_val function all the same in the group

Fitting 5 folds for each of 344 candidates, totalling 1720 fits
Best Parameters:  {'C': 0.23357214690901212, 'l1_ratio': 0.1111111111111111, 'penalty': 'elasticnet', 'solver': 'saga'}
Best Score:  0.8845178372352285
----------- Cross-validation report -----------

Model: LogisticRegression(C=0.23357214690901212, l1_ratio=0.1111111111111111,
                   max_iter=5000, penalty='elasticnet', solver='saga')

Feature set: ['Number of words lead', 'Difference in words lead and co-lead', 'Year', 'Number of female actors', 'Number words male', 'Female word share', 'Male word share', 'Lead word share', 'Female actor share', 'Male actor share', 'Decade', 'Difference Age Lead', 'Yearly mean diff Number of words lead', 'Yearly mean diff Difference in words lead and co-lead']

Number of folds: 5

Performance:
- Accuracy: 0.885 (avg), 0.865 (min), 0.899 (max)
- Accuracy, 95.0 % confidence interval: 0.790-0.979
- Accuracy, female: 0.666 (avg), 0.588 (min), 0.720 (max)
- Accuracy, male: 0.955 (