In [1]:
import pandas as pd 
import numpy as np 
from scipy.stats import chi2_contingency
from functions import logLoss
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest

## Loading data

In [2]:
# Read the three input tables in one pass: training features, training labels
# and test features. The first CSV column becomes the index of each frame.
X, y, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../train_values_normalized.csv',
                     '../train_labels.csv',
                     '../test_values_normalized.csv')
)

In [3]:
# Build the training design matrix: standardized 2nd-degree polynomial terms of
# the numeric columns, joined with every remaining column of X left untouched.

# columns treated as numeric; only these receive polynomial terms
numeric_cols = [
    'resting_blood_pressure',
    'num_major_vessels',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'age',
    'max_heart_rate_achieved',
]

# everything that is not in numeric_cols is handled as categorical data
categoric_data = X.drop(numeric_cols, axis=1)

# degree-2 expansion: bias column, the original terms, squares and pairwise products
polynomials = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
poly_features = polynomials.fit_transform(X[numeric_cols])

# standardize each expanded column to mean=0 and std=1;
# copy=False lets the scaler operate on the array in place
scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
poly_features = scaler.fit_transform(poly_features)

# name the polynomial columns, keep X's index, then append the categorical columns
poly_train_df = pd.DataFrame(
    data=poly_features,
    index=X.index,
    columns=polynomials.get_feature_names(numeric_cols),
).join(categoric_data)

In [4]:
# Build the test-set design matrix with the transformers fitted on the training data.
# Both the polynomial expansion and the scaler are only *applied* here (transform);
# nothing is re-fitted, so the test set cannot alter the fitted preprocessing state
# and a column mismatch between train and test raises instead of passing silently.
categoric_test = X_test.drop(numeric_cols, axis=1)

# expand with the training-fitted PolynomialFeatures, then scale with the
# training-fitted StandardScaler (training means / standard deviations)
poly_test = polynomials.transform(X_test[numeric_cols])
poly_test = scaler.transform(poly_test)

# name the polynomial columns, keep X_test's index, then append the categorical columns
poly_test_df = pd.DataFrame(data=poly_test,
                            index=X_test.index,
                            columns=polynomials.get_feature_names(numeric_cols))
poly_test_df = poly_test_df.join(categoric_test)

In [5]:
# Reload the original (non-normalized) training features and the labels.
X_raw = pd.read_csv('../train_values.csv', index_col=0)
y_raw = pd.read_csv('../train_labels.csv', index_col=0)

# Split 70:30 into a training part and a cross-validation part;
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): this rebinds X_test, which until here held the frame read from
# '../test_values_normalized.csv'. Re-running the test-set polynomial cell after
# this one would pick up the hold-out split instead -- consider distinct names.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y_raw, test_size=0.3, random_state=7)

## Feature selection: categorical features (using Pearson's chi-squared test)

In [6]:
# 1. sex vs heart disease
# Chi-squared test of independence on the training split
# (H0: the feature and the label are independent).
ct_table1 = pd.crosstab(index=y_train['heart_disease_present'],
                        columns=X_train['sex'],
                        margins=False)
chi2, p, dof, ex = chi2_contingency(ct_table1, correction=False)

# report the p-value and the decision at the 5% significance level
print('P is %f' % p)
print('H0 rejected, no indepence. Use feature in model.' if p <= 0.05
      else 'H0 not rejected, indepence. Do not use feature in model.')

print('  ')
print('Contingency table:')
ct_table1

P is 0.000588
H0 rejected, no indepence. Use feature in model.
  
Contingency table:


sex,0,1
heart_disease_present,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33,39
1,9,45


In [7]:
# 2. exercise induced angina vs heart disease
# Chi-squared test of independence on the training split
# (H0: the feature and the label are independent).
ct_table2 = pd.crosstab(index=y_train['heart_disease_present'],
                        columns=X_train['exercise_induced_angina'],
                        margins=False)
chi2, p, dof, ex = chi2_contingency(ct_table2, correction=False)

# report the p-value and the decision at the 5% significance level
print('P is %f' % p)
print('H0 rejected, no indepence. Use feature in model.' if p <= 0.05
      else 'H0 not rejected, indepence. Do not use feature in model.')

print('  ')
print('Contingency table:')
ct_table2

P is 0.000000
H0 rejected, no indepence. Use feature in model.
  
Contingency table:


exercise_induced_angina,0,1
heart_disease_present,Unnamed: 1_level_1,Unnamed: 2_level_1
0,63,9
1,21,33


In [8]:
# 3. slope of peak exercise st segment vs heart disease
# Chi-squared test of independence on the training split
# (H0: the feature and the label are independent).
ct_table3 = pd.crosstab(index=y_train['heart_disease_present'],
                        columns=X_train['slope_of_peak_exercise_st_segment'],
                        margins=False)
chi2, p, dof, ex = chi2_contingency(ct_table3, correction=False)

# report the p-value and the decision at the 5% significance level
print('P is %f' % p)
print('H0 rejected, no indepence. Use feature in model.' if p <= 0.05
      else 'H0 not rejected, indepence. Do not use feature in model.')

print('  ')
print('Contingency table:')
ct_table3

P is 0.000304
H0 rejected, no indepence. Use feature in model.
  
Contingency table:


slope_of_peak_exercise_st_segment,1,2,3
heart_disease_present,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,46,22,4
1,15,34,5


In [9]:
# 4. thal vs heart disease
# Chi-squared test of independence on the training split
# (H0: the feature and the label are independent).
ct_table4 = pd.crosstab(index=y_train['heart_disease_present'],
                        columns=X_train['thal'],
                        margins=False)
# NOTE(review): correction=True here, while tests 1-3 pass correction=False.
# Per the SciPy docs Yates' correction is only applied when dof == 1, so with
# the three thal levels in the recorded table it should make no difference.
chi2, p, dof, ex = chi2_contingency(ct_table4, correction=True)

# report the p-value and the decision at the 5% significance level
print('P is %f' % p)
print('H0 rejected, no indepence. Use feature in model.' if p <= 0.05
      else 'H0 not rejected, indepence. Do not use feature in model.')

print('  ')
print('Contingency table:')
ct_table4

P is 0.000000
H0 rejected, no indepence. Use feature in model.
  
Contingency table:


thal,fixed_defect,normal,reversible_defect
heart_disease_present,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,55,14
1,2,13,39


In [10]:
# 5. chest pain type vs heart disease
# Chi-squared test of independence on the training split
# (H0: the feature and the label are independent).
ct_table5 = pd.crosstab(index=y_train['heart_disease_present'],
                        columns=X_train['chest_pain_type'],
                        margins=False)
# NOTE(review): correction=True here, while tests 1-3 pass correction=False.
# Per the SciPy docs Yates' correction is only applied when dof == 1, so with
# the four chest-pain levels in the recorded table it should make no difference.
chi2, p, dof, ex = chi2_contingency(ct_table5, correction=True)

# report the p-value and the decision at the 5% significance level
print('P is %f' % p)
print('H0 rejected, no indepence. Use feature in model.' if p <= 0.05
      else 'H0 not rejected, indepence. Do not use feature in model.')

print('  ')
print('Contingency table:')
ct_table5

P is 0.000000
H0 rejected, no indepence. Use feature in model.
  
Contingency table:


chest_pain_type,1,2,3,4
heart_disease_present,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6,18,32,16
1,2,2,8,42


In [11]:
# 6. resting EKG results vs heart disease
# Chi-squared test of independence on the training split
# (H0: the feature and the label are independent).
ct_table6 = pd.crosstab(index=y_train['heart_disease_present'],
                        columns=X_train['resting_ekg_results'],
                        margins=False)
# NOTE(review): correction=True here, while tests 1-3 pass correction=False;
# per the SciPy docs it only takes effect when dof == 1. Also, in the recorded
# table one EKG level has a single observation, so the large-sample assumption
# behind the chi-squared test is questionable for this feature.
chi2, p, dof, ex = chi2_contingency(ct_table6, correction=True)

# report the p-value and the decision at the 5% significance level
print('P is %f' % p)
print('H0 rejected, no indepence. Use feature in model.' if p <= 0.05
      else 'H0 not rejected, indepence. Do not use feature in model.')

print('  ')
print('Contingency table:')
ct_table6

P is 0.298733
H0 not rejected, indepence. Do not use feature in model.
  
Contingency table:


resting_ekg_results,0,1,2
heart_disease_present,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,38,0,34
1,23,1,30


In [12]:
# Resting EKG did not reject H0 in the chi-squared test, so its columns
# (presumably the one-hot encoded levels -- verify against the normalized CSVs)
# are removed from both the training and the test polynomial frames.
EKG_cols = ['restingEkg_0', 'restingEkg_1', 'restingEkg_2']
poly_train_df = poly_train_df.drop(EKG_cols, axis=1)
poly_test_df = poly_test_df.drop(EKG_cols, axis=1)

Index([u'1', u'resting_blood_pressure', u'num_major_vessels',
       u'fasting_blood_sugar_gt_120_mg_per_dl', u'serum_cholesterol_mg_per_dl',
       u'oldpeak_eq_st_depression', u'age', u'max_heart_rate_achieved',
       u'resting_blood_pressure^2',
       u'resting_blood_pressure num_major_vessels',
       u'resting_blood_pressure fasting_blood_sugar_gt_120_mg_per_dl',
       u'resting_blood_pressure serum_cholesterol_mg_per_dl',
       u'resting_blood_pressure oldpeak_eq_st_depression',
       u'resting_blood_pressure age',
       u'resting_blood_pressure max_heart_rate_achieved',
       u'num_major_vessels^2',
       u'num_major_vessels fasting_blood_sugar_gt_120_mg_per_dl',
       u'num_major_vessels serum_cholesterol_mg_per_dl',
       u'num_major_vessels oldpeak_eq_st_depression', u'num_major_vessels age',
       u'num_major_vessels max_heart_rate_achieved',
       u'fasting_blood_sugar_gt_120_mg_per_dl^2',
       u'fasting_blood_sugar_gt_120_mg_per_dl serum_cholesterol_mg_per_dl

## Feature selection: numerical features (using ... )

In [13]:
# load training data
# Re-reads the CSVs as plain NumPy arrays, skipping the header row.
# NOTE(review): this rebinds X, y and X_test, which earlier cells bound to
# pandas objects -- cells above will behave differently if re-run after this one.
X = np.genfromtxt('../train_values_normalized.csv', delimiter=",", skip_header=1)
X = X[:,1:]  # drop the first column (the one the pandas loader used as index)
y = np.genfromtxt('../train_labels.csv',  delimiter=",", skip_header=1)
y = y[:,1:]  # slice (not y[:,1]) so y stays two-dimensional

# load test data
X_test = np.genfromtxt('../test_values_normalized.csv',  delimiter=",", skip_header=1)
X_test = X_test[:,1:]  # drop the first column, as for the training features