In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os.path import exists
from os import remove
import json
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
import sklearn.metrics as sm
from sklearn.model_selection import cross_validate
import sklearn.linear_model as lm
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import GradientBoostingClassifier as grad
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
import scipy.stats as stats
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sys
sys.path.insert(0, "../util/")
import util as util

%autosave 5

# Adding dummies and train-test-split

In [None]:
# Probe the processed-data cache: one flag per artifact that the load-or-build cell needs.
datingTrainExists, datingTestExists, datingFullExists = (
    exists(cachedCsv)
    for cachedCsv in (
        '../data/processedData/datingTrain.csv',
        '../data/processedData/datingTest.csv',
        '../data/processedData/datingFull.csv',
    )
)

relatedDummiesDictionaryExists = exists('../data/processedData/relatedDummiesDictionary.json')

In [None]:
# Use the processed copy of the column dictionary when it exists, otherwise the original one.
columnDataDictionaryPath = '../data/processedData/columnDataDictionary.json'
if not exists(columnDataDictionaryPath):
    columnDataDictionaryPath = '../data/columnDataDictionary.json'

with open(columnDataDictionaryPath) as dictionaryFile:
    columnDataDictionary = json.load(dictionaryFile)

# Unpack the individual column groups used by the rest of the notebook.
(columnList,
 nonBinaryCategoricalList,
 stringToFloatList,
 pointDistributionList,
 partnerList) = (
    columnDataDictionary[groupName]
    for groupName in (
        'columnList',
        'nonBinaryCategoricalList',
        'stringToFloatList',
        'pointDistributionList',
        'partnerList',
    )
)

In [None]:
if (datingTrainExists and datingTestExists and datingFullExists and relatedDummiesDictionaryExists):
    datingTrain = pd.read_csv('../data/processedData/datingTrain.csv')
    datingTest = pd.read_csv('../data/processedData/datingTest.csv')
    datingFull = pd.read_csv('../data/processedData/datingFull.csv')
    
    with open('../data/processedData/relatedDummiesDictionary.json') as d:
        relatedDummiesDictionary = json.load(d)
    for df in [datingTrain,datingTest,datingFull]:
        df['zipcode'] = df['zipcode'].apply(str)
        if 'zipcode_o' in list(df.columns):
            df['zipcode_o'] = df['zipcode_o'].apply(str)
        for col in nonBinaryCategoricalList:
            if col in list(df.columns):
                df[col] = df[col].apply(str)
        
else:
    !rm -r ../data/processedData
    !mkdir ../data/processedData
    
    datingData = pd.read_csv('../data/encoded-SpeedDatingData-WithLocations.csv')   
    
    blindDateData = datingData[columnList]
    
    for col in stringToFloatList:
        blindDateData[col] = blindDateData[col].str.replace(',', '').astype(float)
    
    blindDateData['zipcode'] = blindDateData['zipcode'].apply(str)
    blindDateData['zipcode'] = blindDateData['zipcode'].str.replace(',', '')
    
    for col in nonBinaryCategoricalList:
        blindDateData[col] = blindDateData[col].apply(str)
    
    blindDateCategoricalData = blindDateData.select_dtypes(include=['O'])
    for col in blindDateCategoricalData.columns:
        blindDateData[col]=blindDateData[col].fillna('nan')
    relatedDummiesDictionary = {}
    for col in blindDateCategoricalData.columns:
        dummyData = pd.get_dummies(blindDateData[col],prefix=col,drop_first=True)
        if len(dummyData.columns) <= 25:
            for dummyCol in dummyData.columns:
                relatedDummiesDictionary[str(dummyCol)] = list(dummyData.columns)
                if col in partnerList:
                    partnerList.append(str(dummyCol))
                    partnerDummies = [partnerdummy+"_o" for partnerdummy in list(dummyData.columns)]
                    relatedDummiesDictionary[str(dummyCol)+"_o"] = partnerDummies
            blindDateData = pd.concat([blindDateData,dummyData],axis=1)
    with open('../data/processedData/relatedDummiesDictionary.json', 'w') as fp:
        json.dump(relatedDummiesDictionary, fp)
        
    partnerList = list(set(partnerList))
    columnDataDictionary = {"columnList": columnList,
                        "nonBinaryCategoricalList": nonBinaryCategoricalList,
                        "stringToFloatList": stringToFloatList,
                        "pointDistributionList": pointDistributionList,
                        "partnerList": partnerList}

    with open('../data/processedData/columnDataDictionary.json', 'w') as fp:
            json.dump(columnDataDictionary, fp)
    
    datingFull = blindDateData.copy()
    match = datingFull['match']
    X = datingFull.drop(['match'], axis=1)
    
    datingTrain, datingTest, matchTrain, matchTest = train_test_split(X, match, test_size=0.2)
    
    datingTrain['match'] = matchTrain
    datingTest['match'] = matchTest
    
    datingTrain.to_csv('../data/processedData/datingTrain.csv',index=False)
    datingTest.to_csv('../data/processedData/datingTest.csv',index=False)
    datingFull.to_csv('../data/processedData/datingFull.csv',index=False)
    
    dummyColumns = list(relatedDummiesDictionary.keys())

In [None]:
# Pull the target out of both splits, then keep only the feature columns.
match, matchTest = datingTrain["match"], datingTest["match"]
datingTrain = datingTrain.drop(columns=["match"])
datingTest = datingTest.drop(columns=["match"])

## Data Cleaning

### Join partner

In [None]:
# Join each training row to its partner's data, then drop the id columns.
# The presence of "iid"/"pid" marks a frame that has not been joined yet.
if ("iid" in datingTrain.columns) or ("pid" in datingTrain.columns):
    partner = datingFull.copy()
    datingTrain = util.joinToPartner(datingTrain,partner).drop(["iid","pid","iid_o","pid_o"],axis=1)
    # The target was split off earlier, so re-attach it (positionally) to the copy that is
    # cached; otherwise the next run loads a datingTrain.csv without a "match" column.
    # NOTE(review): assumes joinToPartner keeps row count and order — confirm in util.
    datingTrain.assign(match=match.values).to_csv('../data/processedData/datingTrain.csv',index=False)

### Get distance

In [None]:
# Add the partnerDistance feature once; its presence marks an already-processed frame.
if "partnerDistance" not in datingTrain.columns:
    datingTrain = util.returnDFWithpartnerDistance(datingTrain,"train",True)
    # Cache with the target re-attached (positionally) so the cached CSV still has "match"
    # when it is reloaded on the next run.
    # NOTE(review): assumes the helper keeps row count and order — confirm in util.
    datingTrain.assign(match=match.values).to_csv('../data/processedData/datingTrain.csv',index=False)

### Fix ambiguous scores

In [None]:
# Columns whose names contain "1_s" or "3_s" trigger the ambiguous-score fix below.
halfwayChangeColumns = [str(col) for col in datingTrain.columns if (("1_s" in str(col)) or ("3_s" in str(col)))]

if(len(halfwayChangeColumns) > 0):
    datingTrain = util.fixAmbiguousScores(datingTrain)
    util.halfwayQuestionSanityTest(datingTrain," post-fixAmbiguousScores and pre-saving")
    # Cache with the target re-attached (positionally) so the cached CSV still has "match"
    # when it is reloaded on the next run.
    # NOTE(review): assumes fixAmbiguousScores keeps row count and order — confirm in util.
    datingTrain.assign(match=match.values).to_csv('../data/processedData/datingTrain.csv',index=False)

### Replace NaNs

In [None]:
# Numeric training features only; these are the columns that get NaN replacement values.
datingTrainNumerical = datingTrain.select_dtypes(include=['uint8','int64','float64'])

# Build (or load from cache) one replacement value per numeric column:
#   - columns with at most 30 distinct values: the most frequent observed value
#     (ties are averaged and rounded),
#   - every other column: the column mean.
if (exists('../data/processedData/trainNanReplacementValuesDictionary.json')):
    with open('../data/processedData/trainNanReplacementValuesDictionary.json') as d:
        trainNanReplacementValuesDictionary = json.load(d)
else:
    trainNanReplacementValuesDictionary = {}
    for col in datingTrainNumerical:
        column = datingTrainNumerical[col]
        # Only finite, non-missing observations may vote for the mode.
        observed = column.replace([np.inf, -np.inf], np.nan).dropna()
        # nunique(dropna=False) counts NaN once; len(set(...)) would count every NaN as a
        # separate value and push sparse discrete columns into the mean branch.
        isDiscrete = column.nunique(dropna=False) <= 30
        if isDiscrete and not observed.empty:
            # Mode of the observations themselves (not of the de-duplicated values,
            # where every value would tie).
            replacement = int(round(float(observed.mode().mean())))
        else:
            replacement = float(column.mean())
        trainNanReplacementValuesDictionary[str(col)] = replacement
    with open('../data/processedData/trainNanReplacementValuesDictionary.json', 'w') as fp:
        json.dump(trainNanReplacementValuesDictionary, fp)

In [None]:
# Fill the missing values in the numeric training features; X is the matrix every model is fit on.
# NOTE(review): the replacement dictionary is not passed in, so the helper presumably reads
# trainNanReplacementValuesDictionary.json itself — confirm in util.
X = util.replaceNansWithTrainingDataValues(datingTrainNumerical)

# Training

In [None]:
# Size heuristics reused as hyper-parameter candidates by the models below.
nSamples, nFeatures = X.shape

sqrtn = int(np.sqrt(nSamples))
sqrtfeatures = int(np.sqrt(nFeatures))
log2features = int(np.log2(nFeatures))
# Integer midpoint between the two feature-count heuristics.
midpoint = (sqrtfeatures + log2features) // 2

## Logistic Regression

In [None]:
# Logistic regression on standardized features.
# max_iter must be an integer: scikit-learn's parameter validation rejects the float 1e9.
logModel = lm.LogisticRegression(max_iter=1_000_000_000)
logPipe = make_pipeline(StandardScaler(), logModel)
logPipe.fit(X,match)

## KNN

In [None]:
# Two neighbourhood sizes: a fixed k of 5 and k = sqrt(number of training rows).
knn5 = knn(n_neighbors=5).fit(X, match)

knnsqrtn = knn(n_neighbors=sqrtn)
knnsqrtn.fit(X, match)

## Boosting

In [None]:
# Gradient boosting at two learning rates: 0.1 ("deci") and 10 ("deka").
gradientdeci = grad(learning_rate=0.1).fit(X, match)

gradientdeka = grad(learning_rate=10)
gradientdeka.fit(X, match)

## Random Forest Classifier

In [None]:
# Random-forest hyper-parameters: reuse the cached grid-search result when present,
# otherwise search once for recall and once for precision and cache both results.
if exists("../data/processedData/forestParams.json"):
    with open('../data/processedData/forestParams.json') as d:
        forestParams = json.load(d)
else:
    print(f"sqrt(nFeatures) ~ {sqrtfeatures}")
    print(f"log2(nFeatures) ~ {log2features}")
    print(f"midpoint = {midpoint}")
    searchParams = {
        "criterion": ["gini","entropy","log_loss"],
        "n_estimators": [100,200,300],
        "max_depth":[sqrtfeatures,midpoint,log2features,None],
        "max_features":[sqrtfeatures,midpoint,log2features,None]
    }

    preciseForest0 = rf()
    recallForest0 = rf()

    try:
        recallForestGrid = ms.GridSearchCV(recallForest0, param_grid=searchParams, scoring='recall',n_jobs=5)
        recallForestGrid.fit(X,match)
        recallForestParams = recallForestGrid.best_params_
        print("recall params:")
        print(recallForestParams)

        # Remove the recall winner's values from the grid so the precision search
        # cannot select the same value for any parameter.
        for key in searchParams.keys():
            searchParams[key] = [val for val in searchParams[key] if val != recallForestParams[key]]

        preciseForestGrid = ms.GridSearchCV(preciseForest0, param_grid=searchParams, scoring='precision',n_jobs=5)
        preciseForestGrid.fit(X,match)
        preciseForestParams = preciseForestGrid.best_params_
        print("precision params:")
        print(preciseForestParams)
    except Exception:
        # Show the diagnostic, then re-raise: without search results nothing below can
        # run, and swallowing the error would only surface later as an unrelated NameError.
        util.displayValueExceptionColumn(X)
        raise

    forestParams = {
        "preciseForestParams": preciseForestParams,
        "recallForestParams": recallForestParams
    }
    with open("../data/processedData/forestParams.json", 'w') as fp:
        json.dump(forestParams, fp)

# Both paths end with forestParams populated, so the forests are always defined.
preciseForestParams = forestParams["preciseForestParams"]
recallForestParams = forestParams["recallForestParams"]

preciseForest = rf(n_estimators = preciseForestParams["n_estimators"],
                   criterion = preciseForestParams["criterion"],
                   max_depth = preciseForestParams["max_depth"],
                   max_features = preciseForestParams["max_features"])
recallForest = rf(n_estimators = recallForestParams["n_estimators"],
                  criterion = recallForestParams["criterion"],
                  max_depth = recallForestParams["max_depth"],
                  max_features = recallForestParams["max_features"])

preciseForest.fit(X,match)
recallForest.fit(X,match)

# Top 10 Feature Importances

In [None]:
# Feature names, in the column order of the training matrix.
XColumns = X.columns.tolist()

## Logistic Regression

In [None]:
# Rank features by the absolute value of their logistic-regression coefficient.
# The pipeline standardizes the inputs first, so the coefficients are on a common scale.
coefficients = list(logPipe.named_steps['logisticregression'].coef_.reshape(-1,))
absCoefficients = list(np.abs(np.array(coefficients)))

logImportances = pd.DataFrame({
    "feature": XColumns,
    "coefficients": coefficients,
    "absCoefficients": absCoefficients},columns = ["feature","coefficients","absCoefficients"])
logImportancesSorted = logImportances.sort_values(by="absCoefficients", ascending=False)
print('logistic regression top 10 feature importances')
# Cap at the number of available features so fewer than 10 columns cannot raise IndexError.
# Ranks are printed zero-based.
for i in range(min(10, len(logImportancesSorted))):
    featureRow = logImportancesSorted.iloc[i]
    feature = featureRow['feature']
    featureValue = featureRow['coefficients']
    print(f'Rank {i}: {feature}: score: {featureValue}')

## KNN: no feature importances (predictions come from nearest neighbors, not per-feature weights)

## Gradient Boosting & Random Forest Classifier

In [None]:
# Fitted tree-based models and the labels used when displaying their importances.
models = [gradientdeci,gradientdeka,preciseForest,recallForest]
modelNames = ["gradientdeci","gradientdeka","preciseForest","recallForest"]

# Pair each model with its label instead of indexing the two lists in parallel.
for fittedModel, label in zip(models, modelNames):
    util.displayFeatureImportances(XColumns, fittedModel, label)

# Individual Testing

In [None]:
# Apply to the test split the same preparation steps that were applied to the training split.
# Every cached copy is written with the target re-attached (positionally): matchTest was
# split off earlier, and a datingTest.csv without "match" cannot be reloaded on the next run.
# NOTE(review): assumes the util helpers keep row count and order — confirm in util.

# Join each test row to its partner's data, then drop the id columns.
if ("iid" in datingTest.columns) or ("pid" in datingTest.columns):
    partner = datingFull.copy()
    datingTest = util.joinToPartner(datingTest,partner).drop(["iid","pid","iid_o","pid_o"],axis=1)
    datingTest.assign(match=matchTest.values).to_csv('../data/processedData/datingTest.csv',index=False)

# Add the partnerDistance feature once.
if "partnerDistance" not in datingTest.columns:
    datingTest = util.returnDFWithpartnerDistance(datingTest,"test",True)
    datingTest.assign(match=matchTest.values).to_csv('../data/processedData/datingTest.csv',index=False)

# Columns whose names contain "1_s" or "3_s" trigger the ambiguous-score fix.
halfwayChangeColumns = [str(col) for col in datingTest.columns if (("1_s" in str(col)) or ("3_s" in str(col)))]

if(len(halfwayChangeColumns) > 0):
    datingTest = util.fixAmbiguousScores(datingTest)
    util.halfwayQuestionSanityTest(datingTest," post-fixAmbiguousScores and pre-saving")
    datingTest.assign(match=matchTest.values).to_csv('../data/processedData/datingTest.csv',index=False)

# Numeric features only, with missing values filled by the same helper used for training.
datingTestNumerical = datingTest.select_dtypes(include=['uint8','int64','float64'])
XTest = util.replaceNansWithTrainingDataValues(datingTestNumerical)

## Logistic Regression

In [None]:
# Predict on the held-out test features with the standardized logistic-regression
# pipeline and report its metric scores against the true test labels.
yPredict = logPipe.predict(XTest)
util.displayMetricScores(matchTest,yPredict,"logPipe")

## KNN

In [None]:
# Score each KNN model on its own test predictions. Previously the predictions were
# discarded and the logistic-regression predictions were scored under the KNN labels.
knn5Predict = knn5.predict(XTest)
util.displayMetricScores(matchTest,knn5Predict,"knn5")
knnsqrtnPredict = knnsqrtn.predict(XTest)
util.displayMetricScores(matchTest,knnsqrtnPredict,"knnsqrtn")

## Boosting

In [None]:
# Score each boosting model on its own test predictions (previously discarded in favour
# of a stale prediction vector). Both models use displayMetricScores, the helper used
# for every other model in this notebook.
gradientdeciPredict = gradientdeci.predict(XTest)
util.displayMetricScores(matchTest,gradientdeciPredict,"gradientdeci")
gradientdekaPredict = gradientdeka.predict(XTest)
util.displayMetricScores(matchTest,gradientdekaPredict,"gradientdeka")

## Random Forest Classifier

In [None]:
# Score each random forest on its own test predictions. Previously the predictions were
# discarded and a stale prediction vector was scored under the forest labels.
preciseForestPredict = preciseForest.predict(XTest)
util.displayMetricScores(matchTest,preciseForestPredict,"preciseForest")
recallForestPredict = recallForest.predict(XTest)
util.displayMetricScores(matchTest,recallForestPredict,"recallForest")

# Ensemble Testing

In [None]:
# Build fresh, unfitted instances of every individual model, configured exactly as in
# the training section, and combine them in a voting ensemble.
# max_iter must be an integer: scikit-learn's parameter validation rejects the float 1e9.
logModel = lm.LogisticRegression(max_iter=1_000_000_000)
logPipe = make_pipeline(StandardScaler(), logModel)
knn5 = knn(n_neighbors=5)
knnsqrtn = knn(n_neighbors=sqrtn)
gradientdeci = grad(learning_rate=0.1)
gradientdeka = grad(learning_rate=10)
preciseForest = rf(n_estimators = preciseForestParams["n_estimators"],
                   criterion = preciseForestParams["criterion"],
                   max_depth = preciseForestParams["max_depth"],
                   max_features = preciseForestParams["max_features"])
recallForest = rf(n_estimators = recallForestParams["n_estimators"],
                  criterion = recallForestParams["criterion"],
                  max_depth = recallForestParams["max_depth"],
                  max_features = recallForestParams["max_features"])

# No `voting` argument is passed, so scikit-learn's default voting mode applies.
ensembleVote = VotingClassifier(
    estimators = [
        ("logModel",logPipe),
        ("knn5",knn5),
        ("knnsqrtn",knnsqrtn),
        ("gradientdeci",gradientdeci),
        ("gradientdeka",gradientdeka),
        ("preciseForest",preciseForest),
        ("recallForest",recallForest)
    ]
)

ensembleVote.fit(X,match)
# Predict with the fitted ensemble object; `ensemble` was never defined.
ensembleDecision = ensembleVote.predict(XTest)
util.displayMetricScores(matchTest,ensembleDecision,"Ensemble")