In [25]:
import pandas as pd
import numpy as np
from functools import reduce
import math
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import random

### Train

In [70]:
df = pd.read_csv('../datasets/dataset_train.csv', index_col='Index')
df = df.dropna()

In [71]:
current_target = 'Hogwarts House'
y = df[current_target]
labelTransform = LabelEncoder()
labelTransform.fit(y)
y = labelTransform.transform(y)
labelTransform.inverse_transform(list(set(y)))

array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin'], dtype=object)

In [72]:
remove_feat = ['Arithmancy', 'Index', 'Astronomy', 'Potions', 'Care of Magical Creatures', 'Hogwarts House']
feat = list(set(df.describe().columns) - set(feat))

In [73]:
X = df[feat]
X = X.fillna(X.mean())
X = (X - X.mean()) / X.std()

In [74]:
class SlyLogRegression():

    def __init__(self, lr=0.01, weight = []):
        self.weight = weight
        self.n_epochs = 1000
        self.lr = lr
    
    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def _get_selection(self, X, y, is_sgd):
        x_train = X
        y_train = y
        m = len(x_train)
        if(is_sgd):
            r = random.randint(0,len(y) - 1)
            x_train = X[r]
            y_train = y_train[r]
            m = 1
        return x_train, y_train, m
    
    def fit(self, X, y, is_sgd=False):
        X = np.array(X)
        X = np.insert(X, 0, 1, axis=1)
        theta = []
        for class_marker in np.unique(y):
            y_copy = np.where(y == class_marker, 1, 0)
            w = np.ones(X.shape[1])
            theta.append(w)
            for i in range(len(y)):
                x_train, y_train, m = self._get_selection(X, y_copy, is_sgd)
                
                hypothesis = self._sigmoid(x_train.dot(theta[class_marker]))
                loss = hypothesis - y_train
                gradient = np.dot(x_train.transpose(), loss) / m
                
                theta[class_marker] = theta[class_marker] - self.lr * gradient
        self.weight = theta
        
    def predict(self, X):
        return np.array(self._predict(X)[0])

    def predict_proba(self, X):
        return np.array(self._predict(X)[1])

    
    def _predict(self, X):
        result =  []
        result_pre = []
        X = np.array(X)
        X = np.insert(X, 0, 1, axis=1)
        if(len(self.weight) == 0):
            print('Weight is empty')
            return
        for i in range(len(X)):
            pre = []
            for j in range(len(self.weight)):
                pre.append(self._sigmoid(X[i].dot(self.weight[j])))
            result.append(pre.index(max(pre)))
            result_pre.append(pre / sum(pre))
        return result, result_pre
    
    def score(self, X, y):
        return sum(self.predict(X) == y) / len(y)


In [75]:
model = SlyLogRegression(lr=0.1)
model.fit(X, y, False)
model.score(X, y)

0.9832134292565947

In [76]:
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
clf = LogisticRegression()
clf.fit(trainX, trainy)
clf.score(X, y)

0.9832134292565947

#### SlyLogRegression

In [77]:
roc_auc_score(y, model.predict_proba(X), multi_class='ovr')

0.9889935180812705

#### ROC AUC LogisticRegression

In [78]:
roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')

0.9904339439058943

### Predict

In [79]:
df = pd.read_csv('../datasets/dataset_test.csv', index_col='Index')
remove_feat = ['Arithmancy', 'Index', 'Astronomy', 'Potions', 'Care of Magical Creatures', 'Hogwarts House']
feat = list(set(df.describe().columns) - set(remove_feat))
X = df[feat]
X = X.fillna(X.mean())
X = (X - X.mean()) / X.std()
X.head()

Unnamed: 0_level_0,Flying,Charms,Divination,History of Magic,Ancient Runes,Muggle Studies,Transfiguration,Herbology,Defense Against the Dark Arts
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,-0.388581,-0.14903,1.187244,0.3236512,-1.021526,-0.295044,0.354603,0.327907,-1.267591
1,-0.521374,1.362307,0.7607621,0.8527038,1.062428,1.51152,0.383924,0.316885,0.8379975
2,1.857234,-1.124339,0.3127872,-1.972064,0.915546,-0.834167,-2.112116,-1.518761,-0.5259718
3,-0.362632,-0.095734,-1.149947e-16,1.043128e-16,-0.678083,-0.466043,0.2798,0.71752,2.191005e-17
4,-0.320294,-0.09361,0.3819255,0.01092963,-1.095313,-0.467167,0.458259,0.471832,-0.462887


In [48]:
weight = pd.read_csv('../weight.csv', index_col="Index")
labelTransform = LabelEncoder()
labelTransform.fit(weight.columns)
weight.T

Index,0,1,2,3,4,5,6,7,8,9
Gryffindor,-1.013916,0.174033,-0.225937,-0.631153,-0.346494,0.739509,0.465944,-0.342762,1.831229,-0.776924
Hufflepuff,-0.641345,-0.732853,1.043831,0.871927,0.934603,-0.856899,0.830943,-0.830595,0.79512,-0.557092
Ravenclaw,-0.126013,1.627466,0.763418,0.579263,0.838942,1.426903,0.609968,1.450913,0.994281,1.065516
Slytherin,-1.346227,1.092801,0.729816,-0.710325,0.772636,-0.189916,-0.902062,-0.377365,0.639195,-0.732261


In [49]:
model = SlyLogRegression(lr=0.01, weight=np.array(weight.T))
result = labelTransform.inverse_transform(model.predict(X))
df_result = pd.DataFrame(result, columns=[current_target])
df_result.to_csv('./houses.csv', index_label="Index")

ValueError: shapes (6,) and (10,) not aligned: 6 (dim 0) != 10 (dim 0)