In [169]:
import pandas as pd
import numpy as np
from functools import reduce
import math
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import random

### Train

In [154]:
df = pd.read_csv('../datasets/dataset_train.csv', index_col='Index')
df = df.dropna()

In [155]:
current_target = 'Hogwarts House'
y = df[current_target]
labelTransform = LabelEncoder()
labelTransform.fit(y)
y = labelTransform.transform(y)
labelTransform.inverse_transform(list(set(y)))

array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin'], dtype=object)

In [156]:
X = df[df.describe().columns[1:]]
X = X.fillna(X.mean())
X = (X - X.mean()) / X.std()

In [205]:
class SlyLogRegression():

    def __init__(self, lr=0.01, weight = []):
        self.weight = weight
        self.n_epochs = 1000
        self.lr = lr
    
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def _get_selection(self, X, y, is_sgd):
        x_train = X
        y_train = y
        m = len(x_train)
        if(is_sgd):
            r = random.randint(0,len(y) - 1)
            x_train = X[r]
            y_train = y_train[r]
            m = 1
        return x_train, y_train, m
    
    def fit(self, X, y, is_sgd=False):
        X = np.array(X)
        X = np.insert(X, 0, 1, axis=1)
        theta = []
        for class_marker in np.unique(y):
            y_copy = np.where(y == class_marker, 1, 0)
            w = np.ones(X.shape[1])
            theta.append(w)
            for i in range(len(y)):
                x_train, y_train, m = self._get_selection(X, y_copy, is_sgd)
                
                hypothesis = self._sigmoid(x_train.dot(theta[class_marker]))
                loss = hypothesis - y_train
                gradient = np.dot(x_train.transpose(), loss) / m
                
                theta[class_marker] = theta[class_marker] - self.lr * gradient
        self.weight = theta
        
    def predict(self, X):
        return np.array(self._predict(X)[0])

    def predict_proba(self, X):
        return np.array(self._predict(X)[1])

    
    def _predict(self, X):
        result =  []
        result_pre = []
        X = np.array(X)
        X = np.insert(X, 0, 1, axis=1)
        if(len(self.weight) == 0):
            print('Weight is empty')
            return
        for i in range(len(X)):
            pre = []
            for j in range(len(self.weight)):
                pre.append(self._sigmoid(X[i].dot(self.weight[j])))
            result.append(pre.index(max(pre)))
            result_pre.append(pre / sum(pre))
        return result, result_pre
    
    def score(self, X, y):
        return sum(self.predict(X) == y) / len(y)


In [206]:
model = SlyLogRegression(lr=0.01)
model.fit(X, y, False)
model.score(X, y)

0.9832134292565947

In [211]:
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
clf = LogisticRegression()
clf.fit(trainX, trainy)
clf.score(X, y)

0.9832134292565947

#### SlyLogRegression

In [223]:
roc_auc_score(y, model.predict_proba(X), multi_class='ovr')

0.9897478168091359

#### ROC AUC LogisticRegression

In [224]:
roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')

0.9911577170489928

### Predict

In [143]:
df = pd.read_csv('../datasets/dataset_test.csv', index_col='Index')
X = df[df.describe().columns[1:]]
X = X.fillna(X.mean())
X = (X - X.mean()) / X.std()

In [149]:
weight = pd.read_csv('../weight.csv', index_col="Index")
labelTransform = LabelEncoder()
labelTransform.fit(weight.columns)
weight.T

Index,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Gryffindor,-1.100667,0.671923,1.378658,-0.69578,0.621342,0.239029,-0.030084,0.895555,-0.299387,-0.422988,0.402949,0.251041,-0.554311,1.938086
Hufflepuff,-0.596891,0.7179,2.091792,0.828909,-0.091792,0.512309,-0.555747,-0.818885,1.142993,1.050035,0.428912,0.368041,-0.230791,0.717838
Ravenclaw,-0.238662,0.882371,0.310169,0.615931,1.689831,0.609028,1.612565,1.44928,0.689266,0.704814,0.881036,0.504771,1.145174,1.115164
Slytherin,-1.439121,0.583325,0.671351,-0.666822,1.328649,-1.007131,-0.221659,-0.257548,0.546792,0.602542,1.053743,0.220933,-0.623142,0.794925


In [148]:
model = SlyLogRegression(lr=0.01, weight=np.array(weight.T))
result = labelTransform.inverse_transform(model.predict(X))
df_result = pd.DataFrame(result, columns=[current_target])
df_result.to_csv('./houses.csv', index_label="Index")