In [20]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import pandas as pd
import numpy as np
import copy
from sklearn.utils import resample
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, LeaveOneGroupOut, LeaveOneOut
from sklearn.metrics import r2_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import log_loss

# Load both CSV files as raw string tables; row 0 of each table is the header row.
train_values = np.loadtxt("train_values.csv", delimiter=",", dtype=object)
train_labels = np.loadtxt("train_labels.csv", delimiter=",", dtype=object)

# d collects one column per slot: slots 0-13 come from train_values,
# slot 14 is the second column of train_labels.
d = np.empty(15, dtype=object)

for slot, column in enumerate(np.hsplit(train_values, 14)):
    d[slot] = column

# The labels are attached to the features purely by row position, so the first
# column of both files (header included) must match row for row. Fail loudly
# instead of silently pairing a label with the wrong row.
label_ids, d[14] = np.hsplit(train_labels, 2)
if not np.array_equal(d[0], label_ids):
    raise ValueError(
        "train_values.csv and train_labels.csv do not list the same IDs in the same order"
    )
d[0] = label_ids

atributes = {}

for n,obj in enumerate(d):
    # The header cell names the attribute; the remaining cells are the values.
    atributes[n] = obj[0][0]
    d[n] = np.squeeze(d[n])[1:]

# NOTE: the column labels in the DataFrame are numeric. For a given column N,
# its attribute name is atributes[N+1] (slot 0 of d is used as the index).
data = pd.DataFrame(data=d[1],index=d[0])
for n in range(2, 15):
    data[n-1] = d[n]

#### The 14 attributes (13 features plus the target, `heart_disease_present`) are described below:

- slope_of_peak_exercise_st_segment (type: int): the slope of the peak exercise ST segment, an electrocardiography read out indicating quality of blood flow to the heart
- thal (type: categorical): results of thallium stress test measuring blood flow to the heart, with possible values normal, fixed_defect, reversible_defect
- resting_blood_pressure (type: int): resting blood pressure
- chest_pain_type (type: int): chest pain type (4 values)
- num_major_vessels (type: int): number of major vessels (0-3) colored by fluoroscopy
- fasting_blood_sugar_gt_120_mg_per_dl (type: binary): fasting blood sugar > 120 mg/dl
- resting_ekg_results (type: int): resting electrocardiographic results (values 0,1,2)
- serum_cholesterol_mg_per_dl (type: int): serum cholesterol in mg/dl
- oldpeak_eq_st_depression (type: float): oldpeak = ST depression induced by exercise relative to rest, a measure of abnormality in electrocardiograms
- sex (type: binary): 0: female, 1: male
- age (type: int): age in years
- max_heart_rate_achieved (type: int): maximum heart rate achieved (beats per minute)
- exercise_induced_angina (type: binary): exercise-induced chest pain (0: False, 1: True)
- heart_disease_present (type: binary): 0: heart disease not present, 1: heart disease present 

NOTE: The index in data corresponds to patient ID

In [21]:
# Print the slot-number -> attribute-name lookup, then preview the first five rows.
print(atributes)
data.head(n=5)

{0: 'patient_id', 1: 'slope_of_peak_exercise_st_segment', 2: 'thal', 3: 'resting_blood_pressure', 4: 'chest_pain_type', 5: 'num_major_vessels', 6: 'fasting_blood_sugar_gt_120_mg_per_dl', 7: 'resting_ekg_results', 8: 'serum_cholesterol_mg_per_dl', 9: 'oldpeak_eq_st_depression', 10: 'sex', 11: 'age', 12: 'max_heart_rate_achieved', 13: 'exercise_induced_angina', 14: 'heart_disease_present'}


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0


In [22]:
# Class balance: how many rows carry each value of column 13 (heart_disease_present).
data.loc[:, 13].value_counts()

0    100
1     80
Name: 13, dtype: int64

In [102]:
# Every column is float32 except the categorical thal column (1) and the
# integer target column (13).
column_dtypes = {col: 'float32' for col in range(13)}
column_dtypes[1] = 'category'
column_dtypes[13] = 'int32'
data = data.astype(column_dtypes)
print(data.dtypes)

# Average for each attribute whether or not heart_disease_present.
# numeric_only=True leaves the categorical column out explicitly; recent pandas
# raises a TypeError instead of silently dropping it.
data.groupby(13).mean(numeric_only=True)

0      float32
1     category
2      float32
3      float32
4      float32
5      float32
6      float32
7      float32
8      float32
9      float32
10     float32
11     float32
12     float32
13       int32
dtype: object


Unnamed: 0_level_0,0,2,3,4,5,6,7,8,9,10,11,12
13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1.36,130.119995,2.81,0.33,0.16,0.92,245.460007,0.627,0.55,53.66,156.869995,0.13
1,1.7875,132.800003,3.5875,1.15,0.1625,1.2125,253.899994,1.48875,0.8625,56.25,140.25,0.55


In [103]:
# Per-attribute averages, grouped by the thallium stress test result (column 1).
data.groupby(by=1).mean()

Unnamed: 0_level_0,0,2,3,4,5,6,7,8,9,10,11,12,13
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2.0,141.375,3.125,0.625,0.375,1.25,227.25,1.3,1.0,57.875,136.0,0.25,0.5
1,1.377551,129.775513,2.897959,0.530612,0.153061,1.071429,250.255096,0.669388,0.510204,54.387756,154.938782,0.153061,0.204082
2,1.72973,132.25676,3.5,0.918919,0.148649,1.0,250.202698,1.42973,0.891892,55.040539,143.716217,0.540541,0.756757


In [104]:
# Replace every categorical column with its integer category codes so the
# whole frame is numeric.
category_col = data.select_dtypes(include=['category']).columns
for col_label in category_col:
    data[col_label] = data[col_label].cat.codes
print(data.head())

         0   1      2    3    4    5    6      7    8    9     10     11   12  \
0z64un  1.0   1  128.0  2.0  0.0  0.0  2.0  308.0  0.0  1.0  45.0  170.0  0.0   
ryoo3j  2.0   1  110.0  3.0  0.0  0.0  0.0  214.0  1.6  0.0  54.0  158.0  0.0   
yt1s1x  1.0   1  125.0  4.0  3.0  0.0  2.0  304.0  0.0  1.0  77.0  162.0  1.0   
l2xjde  1.0   2  152.0  4.0  0.0  0.0  0.0  223.0  0.0  1.0  40.0  181.0  0.0   
oyt4ek  3.0   2  178.0  1.0  0.0  0.0  2.0  270.0  4.2  1.0  59.0  145.0  0.0   

        13  
0z64un   0  
ryoo3j   0  
yt1s1x   1  
l2xjde   1  
oyt4ek   0  


In [168]:
# Lookup from integer code back to the thal category name.
thal = dict(enumerate(['fixed_defect', 'normal', 'reversible_defect']))

# Per-attribute averages for each value of the target column (13).
data.groupby(by=13).mean()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1.36,1.14,130.119995,2.81,0.33,0.16,0.92,245.460007,0.627,0.55,53.66,156.869995,0.13
1,1.7875,1.65,132.800003,3.5875,1.15,0.1625,1.2125,253.899994,1.48875,0.8625,56.25,140.25,0.55


In [173]:
# Features: every column except the target (13).
X = data.drop(columns=13)
# Target as a flat 1-D array. Taking the column directly avoids hard-coding
# the number of rows in a reshape call.
y = data[13].values
print(X.head())
X.shape

         0   1      2    3    4    5    6      7    8    9     10     11   12
0z64un  1.0   1  128.0  2.0  0.0  0.0  2.0  308.0  0.0  1.0  45.0  170.0  0.0
ryoo3j  2.0   1  110.0  3.0  0.0  0.0  0.0  214.0  1.6  0.0  54.0  158.0  0.0
yt1s1x  1.0   1  125.0  4.0  3.0  0.0  2.0  304.0  0.0  1.0  77.0  162.0  1.0
l2xjde  1.0   2  152.0  4.0  0.0  0.0  0.0  223.0  0.0  1.0  40.0  181.0  0.0
oyt4ek  3.0   2  178.0  1.0  0.0  0.0  2.0  270.0  4.2  1.0  59.0  145.0  0.0


(180, 13)

In [174]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
columns = X_train.columns

# Oversample the training portion only, so no synthetic sample is derived from
# the held-out rows. fit_resample replaces the deprecated fit_sample alias,
# which has been removed from current imbalanced-learn releases.
sm = SMOTE(random_state=0)
sm_data_X, sm_data_y = sm.fit_resample(X_train, y_train)
sm_data_X = pd.DataFrame(data=sm_data_X, columns=columns)
sm_data_y = pd.DataFrame(sm_data_y)
# NOTE(review): sm_data_X / sm_data_y are not consumed by the model cells
# visible in this part of the notebook - confirm whether the oversampled data
# is meant to be used for training.

# Fit the scaler on the training features only.
scl = StandardScaler()
# NOTE: this rebinds `scale`, shadowing the `scale` function imported from
# sklearn.preprocessing; the name is kept so existing references keep working.
scale = scl.fit(X_train)

  return self.partial_fit(X, y)


In [175]:
# WITHOUT data normalization
# The saga solver shuffles the data, so random_state is pinned to make the
# reported accuracy and log loss reproducible from run to run.
logreg = LogisticRegression(multi_class = 'multinomial', solver='saga', penalty='l1', max_iter=10000, C=1.0,
                            random_state=0)

# Fit on the training split only; the held-out split is used solely for evaluation.
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_prob = logreg.predict_proba(X_test)
print(f'Accuracy of logistic regression classifier on test set: {logreg.score(X_test, y_test)}')
lloss = log_loss(y_test, y_pred_prob)
print(f'log loss: {lloss}')

Accuracy of logistic regression classifier on test set: 0.8333333333333334
log loss: 0.3536860722404389


###### LogisticRegression(multi_class = 'multinomial', solver='saga', penalty='l1', max_iter=10000)
    Accuracy of logistic regression classifier on test set: 0.8333333333333334
    log loss: 0.3536771729189414
###### LogisticRegression(multi_class = 'multinomial', solver='saga', penalty='l2', max_iter=10000)
    Accuracy of logistic regression classifier on test set: 0.8055555555555556
    log loss: 0.36027304512739483
###### LogisticRegression(multi_class = 'multinomial', solver='sag', penalty='l2', max_iter=10000)
    Accuracy of logistic regression classifier on test set: 0.8333333333333334
    log loss: 0.3585651405396726
###### LogisticRegression(multi_class = 'multinomial', solver='lbfgs', penalty='l2', max_iter=10000)
    Accuracy of logistic regression classifier on test set: 0.8333333333333334
    log loss: 0.42470468568912534
###### LogisticRegression(multi_class = 'multinomial', solver='newton-cg', penalty='l2', max_iter=10000)
    Accuracy of logistic regression classifier on test set: 0.8333333333333334
    log loss: 0.42600636188623087
###### LogisticRegression(multi_class = 'ovr', solver='liblinear', penalty='l2', max_iter=10000)
    Accuracy of logistic regression classifier on test set: 0.8611111111111112
    log loss: 0.3770715033919158

In [176]:
# WITH data normalization
# The saga solver shuffles the data, so random_state is pinned to make the
# reported accuracy and log loss reproducible from run to run.
logreg = LogisticRegression(multi_class = 'multinomial', solver='saga', penalty='l1', max_iter=10000, C=0.2,
                            random_state=0)

# Apply the already-fitted scaler once per split instead of re-transforming
# the test set for every call.
X_train_scaled = scl.transform(X_train)
X_test_scaled = scl.transform(X_test)

# Fit on the scaled training split and evaluate on the scaled held-out split.
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)
y_pred_prob = logreg.predict_proba(X_test_scaled)
print(f'Accuracy of logistic regression classifier on test set: {logreg.score(X_test_scaled, y_test)}')
lloss = log_loss(y_test, y_pred_prob)
print(f'log loss: {lloss}')

Accuracy of logistic regression classifier on test set: 0.8333333333333334
log loss: 0.3488839397385522


  """
  
  import sys
  
