In [1]:
import pandas as pd
import sklearn.metrics 

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection

from sklearn.svm import SVC, LinearSVC, NuSVC

import scipy.stats as stats

In [2]:
# Feature matrices stored in two encodings (mean / binary, per the file names),
# each with a train CSV and a test CSV. Loaded pairwise, train first.
df_train_mean_encoding, df_test_mean_encoding = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Feature_Encoding/data/train_mean_encoding.csv',
        '../Feature_Encoding/data/test_mean_encoding.csv',
    )
)
df_train_binary_encoding, df_test_binary_encoding = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Feature_Encoding/data/train_binary_encoding.csv',
        '../Feature_Encoding/data/test_binary_encoding.csv',
    )
)

# Cleaned training table; its `Target` column is read later as the label vector.
train = pd.read_csv('../Feature_Engineering/data/other-cleaned_train.csv')

In [3]:
def cross_val(model, x_train, y_train):
    """Print the mean 5-fold cross-validation score of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    x_train, y_train
        Features and labels the folds are drawn from.

    Returns
    -------
    None
        The mean score is printed, not returned.
    """
    fold_scores = model_selection.cross_val_score(
        model,
        x_train,
        y_train,
        cv=5,
    )
    mean_score = fold_scores.mean()
    print(mean_score)

In [4]:
def nusvc(x_train, y_train, x_validation, y_validation, random_state=42):
    """Grid-search a ``NuSVC`` over ``nu`` and return the best fitted model.

    The search tries ``nu`` in ``[0.4, 0.5]`` with 5-fold cross-validation on
    the training data, then prints the winning parameters and the best
    estimator's ``score`` on the validation data.

    Parameters
    ----------
    x_train, y_train
        Features and labels used to fit the grid search.
    x_validation, y_validation
        Held-out features and labels used only for the printed score.
    random_state : int or None, default 42
        Seed forwarded to ``NuSVC``. With ``probability=True`` the estimator
        shuffles data internally to build its probability estimates, so
        leaving this unset makes ``predict_proba`` differ between runs.
        Pass ``None`` to restore the unseeded behaviour.

    Returns
    -------
    NuSVC
        ``best_estimator_`` of the grid search (refit on all of ``x_train``).

    Notes
    -----
    The printed score comes from ``score`` (i.e. ``predict``). scikit-learn
    documents that ``predict_proba`` of SVM classifiers may be inconsistent
    with ``predict``, so accuracy derived from thresholded probabilities can
    differ from this number.
    """
    # Distinct local names: the original reused `nusvc`, shadowing this function.
    base_model = NuSVC(probability=True, random_state=random_state)
    search = GridSearchCV(base_model, param_grid={'nu': [0.4, 0.5]}, cv=5)
    search.fit(x_train, y_train)
    best_model = search.best_estimator_

    print(search.best_params_)
    print('nusvc score: {}'.format(best_model.score(x_validation, y_validation)))

    return best_model

In [5]:
def test_model(model, x_test, y_test):
    """Print accuracy and log-loss of ``model`` on ``(x_test, y_test)``.

    Both metrics are derived from column 1 of ``model.predict_proba(x_test)``
    (presumably the positive class of a binary target -- confirm against
    ``model.classes_``). Accuracy compares ``y_test`` with those
    probabilities rounded to the nearest integer.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``predict_proba``.
    x_test, y_test
        Features and true labels to evaluate on.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    class_probabilities = model.predict_proba(x_test)
    positive_proba = class_probabilities[:, 1]

    hard_labels = positive_proba.round()
    accuracy = accuracy_score(y_test, hard_labels)
    logloss = log_loss(y_test, positive_proba)

    report = "Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss)
    print(report)

In [6]:
# Labels come from the cleaned training table; this assumes its rows are
# aligned one-to-one with both encoded training frames -- TODO confirm.
y = train.Target

# Stratified 70/30 hold-out split for each encoding.
# A fixed random_state makes the split reproducible across kernel restarts.
# Because both calls share the same seed, the same stratify labels and the same
# number of rows, they select the same row positions, so the two encodings are
# evaluated on the same validation rows.
(
    x_train_mean_encoding,
    x_validation_mean_encoding,
    y_train_mean_encoding,
    y_validation_mean_encoding,
) = train_test_split(
    df_train_mean_encoding, y, test_size=0.3, stratify=y, random_state=42
)
(
    x_train_binary_encoding,
    x_validation_binary_encoding,
    y_train_binary_encoding,
    y_validation_binary_encoding,
) = train_test_split(
    df_train_binary_encoding, y, test_size=0.3, stratify=y, random_state=42
)

## Mean Encoding

In [7]:
# Fit the NuSVC grid search on the mean-encoded training split.
nusvc_mean = nusvc(
    x_train=x_train_mean_encoding,
    y_train=y_train_mean_encoding,
    x_validation=x_validation_mean_encoding,
    y_validation=y_validation_mean_encoding,
)

{'nu': 0.5}
nusvc score: 0.4136229022704837


In [8]:
# Probability-based metrics for the mean-encoding model on its validation split.
test_model(
    model=nusvc_mean,
    x_test=x_validation_mean_encoding,
    y_test=y_validation_mean_encoding,
)

Accuracy: 56.62%, Logloss: 0.69


In [9]:
# 5-fold cross-validation of the selected estimator on the mean-encoded training split.
cross_val(
    model=nusvc_mean,
    x_train=x_train_mean_encoding,
    y_train=y_train_mean_encoding,
)

0.5309675460587039


## Binary Encoding

In [10]:
# Fit the NuSVC grid search on the binary-encoded training split.
nusvc_binary = nusvc(
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
    x_validation=x_validation_binary_encoding,
    y_validation=y_validation_binary_encoding,
)

{'nu': 0.4}
nusvc score: 0.6309970384995064


In [11]:
# Probability-based metrics for the binary-encoding model on its validation split.
test_model(
    model=nusvc_binary,
    x_test=x_validation_binary_encoding,
    y_test=y_validation_binary_encoding,
)

Accuracy: 56.47%, Logloss: 0.68


In [12]:
# 5-fold cross-validation of the selected estimator on the binary-encoded training split.
cross_val(
    model=nusvc_binary,
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
)

0.49230240173343554


In [14]:
# Score the mean-encoded test set; column 1 of predict_proba is kept
# (presumably the positive class -- confirm against nusvc_mean.classes_).
y_pred = nusvc_mean.predict_proba(df_test_mean_encoding)[:, 1]

# One row per Opportunity_ID: rows sharing an ID are collapsed to their mean probability.
submission_nusvc_mean = (
    pd.DataFrame(
        data={
            'Opportunity_ID': df_test_mean_encoding['Opportunity_ID'],
            'Target': y_pred,
        }
    )
    .groupby("Opportunity_ID")
    .agg({"Target": "mean"})
    .reset_index()
)
submission_nusvc_mean.to_csv('submits/mean_nusvc.csv', index=False)

In [15]:
# Score the binary-encoded test set; column 1 of predict_proba is kept
# (presumably the positive class -- confirm against nusvc_binary.classes_).
y_pred = nusvc_binary.predict_proba(df_test_binary_encoding)[:, 1]

# One row per Opportunity_ID: rows sharing an ID are collapsed to their mean probability.
submission_nusvc_binary = (
    pd.DataFrame(
        data={
            'Opportunity_ID': df_test_binary_encoding['Opportunity_ID'],
            'Target': y_pred,
        }
    )
    .groupby("Opportunity_ID")
    .agg({"Target": "mean"})
    .reset_index()
)
# File name fixed from 'binary_rf.csv': this is the NuSVC submission, and the old
# name (presumably a random-forest submission's file) did not match the
# 'mean_nusvc.csv' naming used for the other encoding.
submission_nusvc_binary.to_csv('submits/binary_nusvc.csv', index=False)