In [1]:
import pandas as pd
import numpy as np
from functions import helper
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

# Model Metrics
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.metrics import f1_score, make_scorer, adjusted_rand_score, precision_recall_curve, rand_score, accuracy_score
from scipy.stats import multivariate_normal, sem

In [2]:
df = pd.read_csv('data/IST_corrected_clean.csv',  index_col= [0])

In [3]:

df.index = pd.RangeIndex(len(df.index))

In [4]:
df = df[df['DDEAD'] != 'Y']

In [5]:
def target(row):
    if row.OCCODE == 'Recovered':
        return 0
    elif row.OCCODE in ['Not recovered', 'Dependent', 'Dead']:
        return 1
    else:
        return np.nan

In [6]:
# Create the new target column
df['NOREC'] = df.apply(lambda row: target(row), axis=1)

In [7]:
# drop missing values from 'RATRIAL'
df = df.dropna(subset = ['RATRIAL'])

In [8]:
# convert NCB14 and STRK14 into categoricals
df['NCB14'] = df.NCB14.astype('object')
df['STRK14'] = df.STRK14.astype('object')

# Select features
features = ['SEX', 'AGE', 'RSBP', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'NCB14', 'STRK14', 'RATRIAL']
X = df[features]

# Select target NOREC
y = df.NOREC

In [9]:
# Split the data in test and training set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
#convert all 'objects' into 'categories'
X_train = pd.concat([
        X_train.select_dtypes([], ['object']),
        X_train.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex(X_train.columns, axis=1)

In [11]:
#convert all 'objects' into 'categories' 
X_test = pd.concat([
        X_test.select_dtypes([], ['object']),
        X_test.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex(X_test.columns, axis=1)

In [12]:
sme = SMOTEENN(random_state= 42)

In [13]:
# encoder for categories
onehot = OneHotEncoder(drop="first")

# scalers for numerical features
mms = MinMaxScaler()
ss = StandardScaler()

# Prepare list of numerical and categorical columns
num_cols = make_column_selector(dtype_include=np.number)
cat_cols = make_column_selector(dtype_include='category')

In [14]:
# Encode categorical features (with more than two classes)
X_train_trans = X_train.copy()
X_test_trans = X_test.copy()
for cat in ['SEX', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7', 'RATRIAL', 'NCB14', 'STRK14']:
    X_train_trans[[cat]] = X_train_trans[cat].cat.codes
    X_test_trans[[cat]] = X_test_trans[cat].cat.codes

In [15]:
dtree_w = DecisionTreeClassifier(random_state = 42, class_weight= 'balanced')
rf_w = RandomForestClassifier(random_state = 42, class_weight= 'balanced')
#dtree_w_pipe = make_pipeline(sme, dtree_w)

In [16]:
dtree_w.fit(X_train_trans, y_train)
y_pred = dtree_w.predict(X_test_trans)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print(classification_report(y_test, y_pred))
print (roc_auc_score(y_test, y_pred))

Misclassified samples: 1471
              precision    recall  f1-score   support

           0       0.24      0.28      0.26       921
           1       0.82      0.79      0.81      3862

    accuracy                           0.69      4783
   macro avg       0.53      0.53      0.53      4783
weighted avg       0.71      0.69      0.70      4783

0.5342158710023498


In [17]:
logreg = LogisticRegression(max_iter=400, random_state= 42)

In [18]:
def get_stacking():
	# define the base models
	level0 = list()
	level0.append(('lr', logreg))
	level0.append(('rf', rf_w))
	level0.append(('dtree', dtree_w))
	level1 = LogisticRegression(random_state= 42)
	# define the stacking ensemble
	model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
	return model

In [20]:
def get_models():
	models = dict()
	models['logreg'] = logreg
	models['rf'] = rf_w
	models['dtree'] = dtree_w
	models['stacking'] = get_stacking()
	return models

In [22]:
models = get_models()

In [23]:
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
	scores = helper.evaluate_model(model, X_train_trans, y_train)
	results.append(scores)
	names.append(name)
	print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

Average Score: 0.992 with an Standard Error of 0.0005
>logreg 0.992 (0.004)
Average Score: 0.888 with an Standard Error of 0.0014
>rf 0.888 (0.010)
Average Score: 0.782 with an Standard Error of 0.0019
>dtree 0.782 (0.014)
Average Score: 0.989 with an Standard Error of 0.0006
>stacking 0.989 (0.004)


In [24]:
model.fit(X_train_trans, y_train)
y_pred = model.predict(y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: Expected 2D array, got 1D array instead:
array=[1 0 1 ... 1 1 1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.