In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os.path import exists
import json
import scipy.stats as stats
import math
from sklearn.model_selection import train_test_split
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.linear_model as lm
from sklearn.tree import DecisionTreeClassifier as tree
from sklearn.neighbors import KNeighborsClassifier as knn
from xgboost import XGBClassifier as xgb
from sklearn.ensemble import VotingClassifier
import sklearn.model_selection as ms
import sklearn.metrics as sm
%autosave 5

In [None]:
def fitModelWithGridSearch(searchParams,XTrain,yTrain,modelType):
    """Grid-search one model family, then fit a fresh model with the winning parameters.

    Parameters
    ----------
    searchParams : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    XTrain, yTrain
        Training features and labels, used for both the search and the final fit.
    modelType : str
        One of ``'tree'``, ``'forest'``, ``'knn'`` or ``'xgboost'``.

    Returns
    -------
    The final estimator, configured with ``best_params_`` and fitted on XTrain/yTrain.

    Raises
    ------
    ValueError
        If ``modelType`` is not one of the supported names.
    """
    if modelType == 'tree':
        gridmodel = tree(random_state=51)
        finalmodel = tree(random_state=51)
    elif modelType == 'forest':
        gridmodel = rf(random_state=51)
        finalmodel = rf(random_state=51)
    elif modelType == 'knn':
        gridmodel = knn()
        finalmodel = knn()
    elif modelType == 'xgboost':
        gridmodel = xgb(random_state=51)
        finalmodel = xgb(random_state=51)
    else:
        # Fail loudly instead of falling through with no estimator selected.
        raise ValueError(
            f"Unsupported modelType {modelType!r}; "
            "expected 'tree', 'forest', 'knn' or 'xgboost'"
        )

    modelGridSearch = ms.GridSearchCV(gridmodel, param_grid=searchParams)
    modelGridSearch.fit(XTrain,yTrain)

    # Apply every tuned parameter to the separate final estimator; its
    # constructor arguments above (e.g. random_state) are left as they are
    # unless the grid itself tuned them.
    finalmodel.set_params(**modelGridSearch.best_params_)
    finalmodel.fit(XTrain,yTrain)
    return finalmodel

In [None]:
def fitTreeWithGridSearch(searchParams,XTrain,yTrain):
    """Grid-search a decision tree, then fit a new tree with the best parameters.

    Parameters
    ----------
    searchParams : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    XTrain, yTrain
        Training features and labels, used for both the search and the final fit.

    Returns
    -------
    A decision tree built from ``best_params_`` and fitted on XTrain/yTrain.
    """
    modelGridSearch = ms.GridSearchCV(tree(), param_grid=searchParams)
    modelGridSearch.fit(XTrain,yTrain)
    # Forward every tuned parameter rather than a fixed pair of keys, so grids
    # that omit max_depth/max_features (or tune something else) also work.
    model = tree(**modelGridSearch.best_params_)
    model.fit(XTrain,yTrain)
    return model

In [None]:
def fitForestWithGridSearch(searchParams,XTrain,yTrain):
    """Grid-search a random forest, then fit a new forest with the best parameters.

    Parameters
    ----------
    searchParams : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    XTrain, yTrain
        Training features and labels, used for both the search and the final fit.

    Returns
    -------
    A random forest built from ``best_params_`` and fitted on XTrain/yTrain.
    """
    modelGridSearch = ms.GridSearchCV(rf(random_state=51), param_grid=searchParams)
    modelGridSearch.fit(XTrain,yTrain)
    # Forward every tuned parameter rather than a fixed trio of keys, so grids
    # that omit one of them (or tune something else) also work.
    # NOTE: the search estimator is seeded with random_state=51, but the final
    # forest only receives the tuned parameters and is therefore unseeded
    # unless random_state is itself part of the grid.
    model = rf(**modelGridSearch.best_params_)
    model.fit(XTrain,yTrain)
    return model

In [None]:
def makeData():
    """Split the raw survey CSV into train/test parts, save both, and return the split.

    Reads ``../Data/Raw/ACME-HappinessSurvey2020.csv``, holds out 15% of the rows
    as the test set (``random_state=51``) and writes each part, features plus the
    ``Y`` column, to ``../Data/happinessTrain.csv`` and ``../Data/happinessTest.csv``.

    Returns
    -------
    tuple
        ``(XTrain, XTest, yTrain, yTest)``; the X parts are DataFrames without
        ``Y`` and the y parts are flattened 1-D arrays.
    """
    survey = pd.read_csv("../Data/Raw/ACME-HappinessSurvey2020.csv")
    features = survey.drop("Y",axis=1)
    target = survey[["Y"]]
    XTrain,XTest,yTrain,yTest = train_test_split(features, target, test_size=0.15,random_state=51)

    # Persist each part as features + label so it can be reloaded as one table.
    partsToSave = (
        ('../Data/happinessTrain.csv', XTrain, yTrain),
        ('../Data/happinessTest.csv', XTest, yTest),
    )
    for csvPath, featurePart, labelPart in partsToSave:
        snapshot = featurePart.copy()
        snapshot['Y'] = labelPart
        snapshot.to_csv(csvPath,index=False)

    return XTrain,XTest,yTrain.values.ravel(),yTest.values.ravel()

In [None]:
def loadData():
    """Return the train/test split, reading cached CSVs when present, else building them.

    The ``happinessTrain.csv`` / ``happinessTest.csv`` pair is looked up first in
    ``../Data/Processed/`` and then in ``../Data/`` (where ``makeData`` writes
    it). A directory only counts as a hit when both files are there, so the
    files that are checked are always the files that are read.

    Returns
    -------
    tuple
        ``(XTrain, XTest, yTrain, yTest)``; the X parts are DataFrames without
        ``Y`` and the y parts are flattened 1-D arrays.
    """
    for cacheDir in ("../Data/Processed", "../Data"):
        trainPath = f"{cacheDir}/happinessTrain.csv"
        testPath = f"{cacheDir}/happinessTest.csv"
        if exists(trainPath) and exists(testPath):
            happinessTrain = pd.read_csv(trainPath)
            happinessTest = pd.read_csv(testPath)
            yTrain = happinessTrain[["Y"]].values.ravel()
            XTrain = happinessTrain.drop("Y",axis=1)
            yTest = happinessTest[["Y"]].values.ravel()
            XTest = happinessTest.drop("Y",axis=1)
            return XTrain,XTest,yTrain,yTest

    # No complete cached pair anywhere: split the raw survey instead
    # (makeData also writes the CSVs for next time).
    return makeData()

In [None]:
def main():
    """Seed NumPy's global RNG and make sure the train/test split can be loaded.

    The split returned by ``loadData`` is discarded here; nothing is returned.
    """
    np.random.seed(51)
    # Called for its effect only: loadData() reads the cached CSVs or, when
    # they are missing, falls back to makeData(), which writes them.
    loadData()

In [None]:
# Run the entry point: seeds NumPy's global RNG (51) and loads the train/test split.
main()

# Attempt 4: Voting Classifier with StandardScaler + Logistic Regression, best tree, and best forest

In [None]:
# main() keeps the split in local variables, so load it at notebook scope here;
# this cell and the ones after it read XTrain/XTest/yTrain/yTest as globals.
XTrain,XTest,yTrain,yTest = loadData()

# Hyperparameter grids; the Attempt 5 cell reuses them to build its
# "alternative" grids.
initialTreeParams = {
    "max_depth":[2,3,4],
    "max_features":[2,3,4]
}
searchParams4 = {
    "n_estimators": [100,200,300],
    "max_depth":[2,3,4],
    "max_features":[2,3,4]
}

# max_iter has to be an integer: scikit-learn's parameter validation rejects
# the float 1e9.
logModel = lm.LogisticRegression(max_iter=int(1e9))
logPipe = make_pipeline(StandardScaler(), logModel)

# Best single tree according to the grid search.
treeGridSearch = ms.GridSearchCV(tree(), param_grid=initialTreeParams)
treeGridSearch.fit(XTrain,yTrain)
treeParams4 = treeGridSearch.best_params_
print(f"treeParams: {treeParams4}")
tree4 = tree(max_depth = treeParams4["max_depth"],
             max_features = treeParams4["max_features"])

# Best forest according to the grid search.
forestGridSearch4 = ms.GridSearchCV(rf(), param_grid=searchParams4)
forestGridSearch4.fit(XTrain,yTrain)
forestParams4 = forestGridSearch4.best_params_
print(f"forestParams: {forestParams4}")
forest4 = rf(n_estimators = forestParams4["n_estimators"],
             max_depth = forestParams4["max_depth"],
             max_features = forestParams4["max_features"])

# tree4/forest4 are passed unfitted; fitting the ensemble fits its members.
ensembleVote = VotingClassifier(
    estimators = [
        ("logModel",logPipe),
        ("tree",tree4),
        ("forest",forest4)
    ]
)

ensembleVote.fit(XTrain,yTrain)
ensembleDecision = ensembleVote.predict(XTrain)
print(f"Training Score {sm.accuracy_score(yTrain,ensembleDecision)}")

# Attempt 5: Adding alternative tree and alternative forest to classifier

In [None]:
# Alternative grids: drop the value Attempt 4 selected from every
# hyperparameter list, so the new searches have to pick different settings.
treeParams5 = {
    "max_depth": [d for d in initialTreeParams["max_depth"] if d != treeParams4["max_depth"]],
    "max_features":[f for f in initialTreeParams["max_features"] if f != treeParams4["max_features"]]
}

searchParams5 = {
    "n_estimators": [n  for n in searchParams4["n_estimators"] if n != forestParams4["n_estimators"]],
    "max_depth":[d for d in searchParams4["max_depth"] if d != forestParams4["max_depth"]],
    "max_features":[f for f in searchParams4["max_features"] if f != forestParams4["max_features"]]
}

# The helpers are defined in this notebook; no `util` module is imported, so
# call them directly.
tree5 = fitTreeWithGridSearch(treeParams5,XTrain,yTrain)

forest5 = fitForestWithGridSearch(searchParams5,XTrain,yTrain)

# Replaces the Attempt 4 ensemble held in `ensembleVote`.
ensembleVote = VotingClassifier(
    estimators = [
        ("logModel",logPipe),
        ("tree4",tree4),
        ("forest4",forest4),
        ("tree5",tree5),
        ("forest5",forest5)
    ]
)

ensembleVote.fit(XTrain,yTrain)
ensembleDecision = ensembleVote.predict(XTrain)
print(f"Training Score {sm.accuracy_score(yTrain,ensembleDecision)}")

## Validating model 5 with the test set.

In [None]:
# Accuracy of the current ensembleVote on the held-out test split.
yPredictTest = ensembleVote.predict(XTest)
testAccuracy = sm.accuracy_score(yTest,yPredictTest)
print(f"Testing Score {testAccuracy}")