In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os.path import exists
from os import remove
import json
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
import sklearn.metrics as sm
from sklearn.model_selection import cross_validate
import sklearn.linear_model as lm
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import GradientBoostingClassifier as grad
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn import metrics
import scipy.stats as stats
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sys
sys.path.insert(0, "../util/")
import util as util

%autosave 5

Autosaving every 5 seconds


# Adding dummies and train-test-split

In [2]:
# Cache flags: one boolean per processed artefact; later cells use them to
# decide between loading the cached files and rebuilding them.
datingTrainExists, datingTestExists, datingFullExists = map(exists, (
    '../data/processedData/datingTrain.csv',
    '../data/processedData/datingTest.csv',
    '../data/processedData/datingFull.csv',
))

relatedDummiesDictionaryExists = exists(
    '../data/processedData/relatedDummiesDictionary.json'
)

In [3]:
# Prefer the processed copy of the column dictionary (it is rewritten once the
# dummy columns have been generated); otherwise fall back to the original one.
columnDataPath = '../data/processedData/columnDataDictionary.json'
if not exists(columnDataPath):
    columnDataPath = '../data/columnDataDictionary.json'

with open(columnDataPath) as columnDataFile:
    columnDataDictionary = json.load(columnDataFile)

# Unpack the column groups used throughout the notebook.
(columnList,
 nonBinaryCategoricalList,
 stringToFloatList,
 pointDistributionList,
 partnerList) = (columnDataDictionary[key] for key in ('columnList',
                                                       'nonBinaryCategoricalList',
                                                       'stringToFloatList',
                                                       'pointDistributionList',
                                                       'partnerList'))

In [4]:
if (datingTrainExists and datingTestExists and datingFullExists and relatedDummiesDictionaryExists):
    datingTrain = pd.read_csv('../data/processedData/datingTrain.csv')
    datingTest = pd.read_csv('../data/processedData/datingTest.csv')
    datingFull = pd.read_csv('../data/processedData/datingFull.csv')
    
    with open('../data/processedData/relatedDummiesDictionary.json') as d:
        relatedDummiesDictionary = json.load(d)
    for df in [datingTrain,datingTest,datingFull]:
        df['zipcode'] = df['zipcode'].apply(str)
        if 'zipcode_o' in list(df.columns):
            df['zipcode_o'] = df['zipcode_o'].apply(str)
        for col in nonBinaryCategoricalList:
            if col in list(df.columns):
                df[col] = df[col].apply(str)
        
else:
    !rm -r ../data/processedData
    !mkdir ../data/processedData
    
    datingData = pd.read_csv('../data/encoded-SpeedDatingData-WithLocations.csv')   
    
    blindDateData = datingData[columnList]
    
    for col in stringToFloatList:
        blindDateData[col] = blindDateData[col].str.replace(',', '').astype(float)
    
    blindDateData['zipcode'] = blindDateData['zipcode'].apply(str)
    blindDateData['zipcode'] = blindDateData['zipcode'].str.replace(',', '')
    
    for col in nonBinaryCategoricalList:
        blindDateData[col] = blindDateData[col].apply(str)
    
    blindDateCategoricalData = blindDateData.select_dtypes(include=['O'])
    for col in blindDateCategoricalData.columns:
        blindDateData[col]=blindDateData[col].fillna('nan')
    relatedDummiesDictionary = {}
    for col in blindDateCategoricalData.columns:
        dummyData = pd.get_dummies(blindDateData[col],prefix=col,drop_first=True)
        if len(dummyData.columns) <= 25:
            for dummyCol in dummyData.columns:
                relatedDummiesDictionary[str(dummyCol)] = list(dummyData.columns)
                if col in partnerList:
                    partnerList.append(str(dummyCol))
                    partnerDummies = [partnerdummy+"_o" for partnerdummy in list(dummyData.columns)]
                    relatedDummiesDictionary[str(dummyCol)+"_o"] = partnerDummies
            blindDateData = pd.concat([blindDateData,dummyData],axis=1)
    with open('../data/processedData/relatedDummiesDictionary.json', 'w') as fp:
        json.dump(relatedDummiesDictionary, fp)
        
    partnerList = list(set(partnerList))
    columnDataDictionary = {"columnList": columnList,
                        "nonBinaryCategoricalList": nonBinaryCategoricalList,
                        "stringToFloatList": stringToFloatList,
                        "pointDistributionList": pointDistributionList,
                        "partnerList": partnerList}

    with open('../data/processedData/columnDataDictionary.json', 'w') as fp:
            json.dump(columnDataDictionary, fp)
    
    datingFull = blindDateData.copy()
    match = datingFull['match']
    X = datingFull.drop(['match'], axis=1)
    
    datingTrain, datingTest, matchTrain, matchTest = train_test_split(X, match, test_size=0.2)
    
    datingTrain['match'] = matchTrain
    datingTest['match'] = matchTest
    
    datingTrain.to_csv('../data/processedData/datingTrain.csv',index=False)
    datingTest.to_csv('../data/processedData/datingTest.csv',index=False)
    datingFull.to_csv('../data/processedData/datingFull.csv',index=False)
    
    dummyColumns = list(relatedDummiesDictionary.keys())

## Data Cleaning

### Join partner

In [5]:
# The id columns are dropped after joining, so their presence means the
# partner join has not been applied to the training frame yet.
partnerJoinPending = any(idCol in datingTrain.columns for idCol in ("iid", "pid"))

if partnerJoinPending:
    partner = datingFull.copy()
    joinedTrain = util.joinToPartner(datingTrain, partner)
    datingTrain = joinedTrain.drop(columns=["iid", "pid", "iid_o", "pid_o"])
    datingTrain.to_csv('../data/processedData/datingTrain.csv', index=False)

### Get distance

In [6]:
# Add the "partnerDistance" column only when it is missing, then persist the
# updated training frame so the step is skipped on the next run.
# NOTE(review): the meaning of the "train" and True arguments is defined in
# util.returnDFWithpartnerDistance — confirm there.
if "partnerDistance" not in datingTrain.columns:
    datingTrain = util.returnDFWithpartnerDistance(datingTrain,"train",True)
    datingTrain.to_csv('../data/processedData/datingTrain.csv',index=False)

### Fix ambiguous scores

In [7]:
# Columns whose names contain "1_s" or "3_s" are the ones still awaiting the
# ambiguous-score fix; an empty list means there is nothing left to do.
halfwayChangeColumns = [
    name for name in map(str, datingTrain.columns)
    if "1_s" in name or "3_s" in name
]

if halfwayChangeColumns:
    datingTrain = util.fixAmbiguousScores(datingTrain)
    util.halfwayQuestionSanityTest(datingTrain, " post-fixAmbiguousScores and pre-saving")
    datingTrain.to_csv('../data/processedData/datingTrain.csv', index=False)

### Replace NaNs

In [8]:
# Numeric feature columns only; the target is dropped so it is never imputed.
datingTrainNumerical = datingTrain.select_dtypes(include=['uint8','int64','float64']).drop(['match'],axis = 1)

# Per-column NaN replacement values, computed once from the training data and
# cached as JSON so later runs reuse exactly the same values.
if (exists('../data/processedData/trainNanReplacementValuesDictionary.json')):
    with open('../data/processedData/trainNanReplacementValuesDictionary.json') as d:
        trainNanReplacementValuesDictionary = json.load(d)
else:
    trainNanReplacementValuesDictionary = {}
    for col in datingTrainNumerical:
        # Only finite, observed values take part in the statistics.
        observed = datingTrainNumerical[col].replace([np.inf, -np.inf], np.nan).dropna()
        # nunique() ignores NaN; counting via set() treated every NaN as a
        # separate value and pushed sparse discrete columns over the limit.
        if observed.nunique() <= 30:
            # Discrete column: use the most frequent observed value. The mode
            # is taken over the column itself, not over its unique values
            # (where every value ties); ties are averaged and rounded.
            modes = observed.mode()
            if modes.empty:
                # No observed value at all: keep NaN visible rather than
                # inventing a replacement.
                replacement = float('nan')
            else:
                replacement = int(round(float(modes.mean())))
        else:
            # Continuous column: use the mean of the observed values.
            replacement = float(observed.mean())
        trainNanReplacementValuesDictionary[str(col)] = replacement
    with open('../data/processedData/trainNanReplacementValuesDictionary.json', 'w') as fp:
        json.dump(trainNanReplacementValuesDictionary, fp)

In [9]:
# Fill the NaNs in the numeric training features through the util helper.
# NOTE(review): presumably this applies the values cached in
# trainNanReplacementValuesDictionary.json — confirm in util.
datingTrainNumerical = util.replaceNansWithTrainingDataValues(datingTrainNumerical)

### Final Assignment

In [10]:
# X: imputed numeric feature frame; match: target column of the training frame.
X = datingTrainNumerical
match = datingTrain["match"]

# Training

In [11]:
# Size heuristics reused as hyper-parameters below (n_neighbors, max_depth).
# They are cast to int because np.floor returns a float and scikit-learn
# requires integer values for those parameters.
sqrtn = int(np.floor(np.sqrt(X.shape[0])))          # sqrt of the number of rows
sqrtfeatures = int(np.floor(np.sqrt(X.shape[1])))   # sqrt of the number of features
log2features = int(np.floor(np.log2(X.shape[1])))   # log2 of the number of features

## Logistic Regression

In [12]:
# Logistic regression on standardised features. max_iter must be an integer
# (1e9 is a float); the large cap effectively lets the solver run until it
# converges.
logModel = lm.LogisticRegression(max_iter=int(1e9))
logPipe = make_pipeline(StandardScaler(), logModel)
try:
    logPipe.fit(X,match)
except Exception:
    # Show the column diagnostics, then re-raise so the failure is visible
    # instead of leaving an unfitted pipeline behind. Catching Exception
    # (not BaseException) keeps kernel interrupts working.
    util.displayValueExceptionColumn(X)
    raise

## KNN

In [13]:
# Two k-nearest-neighbour models: a fixed k=5 and k = floor(sqrt(n_samples)).
knn5 = knn(n_neighbors=5)
# n_neighbors must be an integer; sqrtn comes from np.floor and may be a float.
knnsqrtn = knn(n_neighbors=int(sqrtn))
try:
    knn5.fit(X,match)
    knnsqrtn.fit(X,match)
except Exception:
    # Show the column diagnostics, then re-raise so the failure is visible
    # instead of leaving unfitted models behind.
    util.displayValueExceptionColumn(X)
    raise

## Boosting

In [14]:
# Two gradient-boosting models that differ only in learning rate
# (0.1 vs. 10).
# NOTE(review): a learning rate of 10 is unusually large — presumably a
# deliberate contrast case; confirm.
gradientdeci = grad(learning_rate=0.1)
gradientdeka = grad(learning_rate=10)

try:
    gradientdeci.fit(X,match)
    gradientdeka.fit(X,match)
except Exception:
    # Show the column diagnostics, then re-raise so the failure is visible
    # instead of leaving unfitted models behind. Catching Exception (not
    # BaseException) keeps kernel interrupts working.
    util.displayValueExceptionColumn(X)
    raise

## Random Forest Classifier

In [None]:
# Random forest hyper-parameters: reuse the cached grid-search result when it
# exists, otherwise run two grid searches (one scored on accuracy, one on
# recall) and cache the winning parameter sets as JSON.
if exists("../data/processedData/forestParams.json"):
    with open('../data/processedData/forestParams.json') as d:
        forestParams = json.load(d)
    accurateForestParams = forestParams["accurateForestParams"]
    recallForestParams = forestParams["recallForestParams"]
else:
    n_estimator_list = [int(n) for n in np.linspace(100,1000,11)]
    searchParams = {
        "criterion": ["gini","entropy","log_loss"],
        "n_estimators": n_estimator_list,
        # max_depth must be an int or None; the heuristics come from np.floor.
        "max_depth":[int(sqrtfeatures),int(log2features),None],
        "max_features":["sqrt","log2",None]
    }

    accurateForest0 = rf()
    recallForest0 = rf()

    accurateForestGrid = ms.GridSearchCV(accurateForest0, param_grid=searchParams, scoring='accuracy')
    recallForestGrid = ms.GridSearchCV(recallForest0, param_grid=searchParams, scoring='recall')
    
    try:
        accurateForestGrid.fit(X,match)
        recallForestGrid.fit(X,match)

        accurateForestParams = accurateForestGrid.best_params_
        recallForestParams = recallForestGrid.best_params_
        
        # If both searches picked identical parameters, rerun the recall
        # search without the accuracy winner's n_estimators so the two
        # forests end up different.
        redoRecall = all(accurateForestParams[param] == recallForestParams[param]
                         for param in accurateForestParams)
                
        if redoRecall:
            print("Redo Recall!")
            searchParams["n_estimators"] = [n for n in n_estimator_list if n != accurateForestParams["n_estimators"]]
            recallForestGrid = ms.GridSearchCV(recallForest0, param_grid=searchParams, scoring='recall')
            # The new search has to be fitted before best_params_ exists.
            recallForestGrid.fit(X,match)
            recallForestParams = recallForestGrid.best_params_
        
        print(accurateForestParams)
        print(recallForestParams)
        
        forestParams = {
            "accurateForestParams": accurateForestParams,
            "recallForestParams": recallForestParams
        }
        with open("../data/processedData/forestParams.json", 'w') as fp:
            json.dump(forestParams, fp)
    except Exception:
        # Show the column diagnostics, then re-raise so the failure is
        # visible; catching Exception keeps kernel interrupts working.
        util.displayValueExceptionColumn(X)
        raise


def _forestFromParams(params):
    """Build an unfitted random forest from a saved parameter dictionary."""
    maxDepth = params["max_depth"]
    if maxDepth is not None:
        # Caches written by earlier runs may hold the depth as a float.
        maxDepth = int(maxDepth)
    return rf(n_estimators = params["n_estimators"],
              criterion = params["criterion"],
              max_depth = maxDepth,
              max_features = params["max_features"])


# Fit the final forests only when a parameter file is available.
if exists("../data/processedData/forestParams.json"):
    accurateForest = _forestFromParams(accurateForestParams)
    recallForest = _forestFromParams(recallForestParams)

    try:
        accurateForest.fit(X,match)
        recallForest.fit(X,match)
    except Exception:
        util.displayValueExceptionColumn(X)
        raise

# Individual Testing

## Logistic Regression

## KNN

## Boosting

## Random Forest Classifier

# Ensemble Testing