In [None]:
import json
import math
import os
import sys
from os.path import exists

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sb
import sklearn.metrics as sm
import sklearn.model_selection as ms
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import train_test_split

# Project helpers live in ../src, so it must be on the path before importing util.
sys.path.insert(0, "../src/")
import util as util
%autosave 5

In [None]:
# Seed NumPy's global random number generator so a fresh Restart & Run All
# draws the same random sequence each time.
# NOTE(review): the forests below are built without random_state, so they
# presumably rely on this global seed for repeatability — confirm.
np.random.seed(51)

In [None]:
# Load the raw survey and obtain the train/test split. A split saved by an
# earlier run is reused when present, so every run evaluates on the same
# held-out rows; otherwise a new 85/15 split is made and written to disk.
df = pd.read_csv("../Data/Raw/ACME-HappinessSurvey2020.csv")

# Single source of truth for the cached split. Previously the existence check
# and the writes used ../Data/ while the reads used ../Data/Processed/, so the
# files saved by one run were not the files the next run tried to load.
processedDir = "../Data/Processed"
trainPath = f"{processedDir}/happinessTrain.csv"
testPath = f"{processedDir}/happinessTest.csv"

# Require both halves of the split; a lone file means an incomplete cache.
if exists(trainPath) and exists(testPath):
    happinessTrain = pd.read_csv(trainPath)
    happinessTest = pd.read_csv(testPath)
    yTrain = happinessTrain[["Y"]].values.ravel()
    XTrain = happinessTrain.drop("Y", axis=1)
    yTest = happinessTest[["Y"]].values.ravel()
    XTest = happinessTest.drop("Y", axis=1)
else:
    y = df[["Y"]]
    X = df.drop("Y", axis=1)
    XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.15, random_state=51)

    # Persist features and target together so the cached branch can rebuild X and y.
    happinessTrain = XTrain.copy()
    happinessTrain['Y'] = yTrain
    happinessTest = XTest.copy()
    happinessTest['Y'] = yTest

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(processedDir, exist_ok=True)
    happinessTrain.to_csv(trainPath, index=False)
    happinessTest.to_csv(testPath, index=False)

    # Flatten the single-column target frames to 1-D arrays, matching the cached branch.
    yTrain = yTrain.values.ravel()
    yTest = yTest.values.ravel()

# Attempt 1: Broad hyperparameter search

In [None]:
# Coarse grid for the first search: depth and feature count are each tried at
# the low end (2), at the number of feature columns, and halfway between.
midpoint = int(round((XTrain.shape[1] + 2) / 2))
searchParams = dict(
    n_estimators=[100, 200, 300],
    max_depth=[2, midpoint, XTrain.shape[1]],
    max_features=[2, midpoint, XTrain.shape[1]],
)


In [None]:
# Attempt 1: cross-validated grid search over the broad grid in searchParams.
model = rf()
modelGridSearch = ms.GridSearchCV(model, param_grid=searchParams)
modelGridSearch.fit(XTrain, yTrain)
modelParams1 = modelGridSearch.best_params_
print(modelParams1)

# GridSearchCV refits the winning configuration on all of XTrain by default
# (refit=True), so best_estimator_ is already the trained model. Building and
# fitting a second forest by hand repeated that work and produced a different
# model than the one the search had selected.
model1 = modelGridSearch.best_estimator_

# Accuracy on the data the model was fit on; optimistic by construction.
yPredict = model1.predict(XTrain)
print(f"Training Score {sm.accuracy_score(yTrain,yPredict)}")

## Validating model 1 with the test set.

In [None]:
# Score model 1 on the held-out rows it never saw during the grid search.
yPredictTest = model1.predict(XTest)
print(f"Testing Score {sm.accuracy_score(y_true=yTest, y_pred=yPredictTest)}")

# Attempt 2: Refined search

In [None]:
# Show the midpoint value used for max_depth / max_features in attempt 1's grid,
# as a reference point for the narrower ranges tried next.
print(midpoint)

In [None]:
# Attempt 2: finer steps for n_estimators and only shallow depth/feature values.
searchParams2 = {
    "n_estimators": [50, 100, 150, 200],
    "max_depth": [2, 3, 4],
    "max_features": [2, 3, 4]
}

modelGridSearch2 = ms.GridSearchCV(rf(), param_grid=searchParams2)
modelGridSearch2.fit(XTrain, yTrain)
modelParams2 = modelGridSearch2.best_params_
print(modelParams2)

# With the default refit=True the search has already retrained the best
# configuration on all of XTrain, so reuse that estimator rather than fitting
# another forest with the same parameters.
model2 = modelGridSearch2.best_estimator_

# Accuracy on the training rows themselves; compare against the test score below.
yPredict = model2.predict(XTrain)
print(f"Training Score {sm.accuracy_score(yTrain,yPredict)}")

## Validating model 2 with the test set.

In [None]:
# Score model 2 on the held-out test rows.
yPredictTest = model2.predict(XTest)
print(f"Testing Score {sm.accuracy_score(y_true=yTest, y_pred=yPredictTest)}")

# Attempt 3

In [None]:
# Attempt 3: denser grid — n_estimators from 25 to 225 in steps of 25, and
# depth / feature counts from 2 through 6.
searchParams3 = dict(
    n_estimators=list(range(25, 226, 25)),
    max_depth=list(range(2, 7)),
    max_features=list(range(2, 7)),
)

# NOTE(review): fitWithGridSearch lives in ../src/util.py and is not visible
# here; presumably it runs the grid search and returns the fitted model — confirm.
model3 = util.fitWithGridSearch(searchParams3, XTrain, yTrain)

## Validating model 3 with the test set.

In [None]:
# Score model 3 on the held-out test rows. The label previously read
# "Training Score" even though the accuracy is computed on yTest / XTest.
yPredictTest = model3.predict(XTest)
print(f"Testing Score {sm.accuracy_score(yTest,yPredictTest)}")

In [None]:
# Hand model 3 and its feature column names to the project's display helper.
# NOTE(review): displayFeatureImportances is defined in ../src/util.py; its
# output format is not visible from this notebook.
util.displayFeatureImportances(
    XTest.columns.tolist(),
    model3,
    "Model V3",
)