In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder




In [None]:
# Seed both Python's and NumPy's global generators. scikit-learn draws from
# NumPy's global RandomState whenever `random_state` is left unset, so seeding
# `random` alone does not make the splits and models below repeatable.
random.seed(0)
np.random.seed(0)

In [None]:
# Read the training set; the tokens 'NA', 'NaN' and '?' are parsed as missing values.
health_care = pd.read_csv(
    '../healthcare/train_data.csv',
    na_values=['NA', 'NaN', '?'],
)

In [None]:
# Dropping NA values (currently disabled)
#health_care = health_care.dropna()


## Feature engineering

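We will add 2 features:
- **Visit Number**: how many times the patient's `patientid` has appeared in the data up to and including the current row (1 for the first visit).
- **New Patient**: 1 if the row is the patient's first visit (`Visit Number == 1`), 0 otherwise.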

In [None]:
# 'Visit Number' is the running count of rows seen so far for the same
# patientid (1 on the first row of a patient, 2 on the second, ...).
# groupby().cumcount() numbers the rows of each group from 0 in their original
# order, so adding 1 gives that count in a single pass instead of re-scanning
# all previous rows for every row.
# NOTE(review): assumes patientid has no missing values; groupby skips NaN
# keys by default - confirm against the data.
visits = health_care.groupby('patientid').cumcount() + 1

health_care['Visit Number'] = visits
# Flag the first visit of every patient as 1, all later visits as 0.
health_care['New Patient'] = (health_care['Visit Number'] == 1).astype('int32')

## One-hot encoded data

In [None]:
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    For every distinct value ``x`` of ``df[name]`` a 0/1 indicator column
    called ``"{name}-{x}"`` is appended, then the original column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is changed in place.
    name : str
        Name of the categorical column to expand.

    Returns
    -------
    None
    """
    # Pin the indicator dtype: pandas >= 2.0 returns bool columns by default,
    # and a frame mixing bool with numeric columns converts to an *object*
    # array through `.values`. uint8 is what older pandas produced.
    dummies = pd.get_dummies(df[name], dtype="uint8")
    for level in dummies.columns:
        df[f"{name}-{level}"] = dummies[level]
    # In place, so that the caller's frame is the one that changes.
    df.drop(columns=[name], inplace=True)

In [None]:
## Pre-processing of the categorical columns

le_string_columns = ['Severity of Illness', 'Age']
dummies_string_columns = [
    'Hospital_type_code',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Type of Admission',
]
encoder = LabelEncoder()

## Feature frame: the target and the columns not used as features are removed
Xe = health_care.drop(columns=['Stay', 'case_id', 'City_Code_Hospital', 'City_Code_Patient', 'Bed Grade'])
# Severity gets an explicit order; Age is label-encoded by the shared encoder
Xe['Severity of Illness'] = Xe['Severity of Illness'].map({'Minor': 0, 'Moderate': 1, 'Extreme': 2})
Xe['Age'] = encoder.fit_transform(Xe['Age'])

## Target: fitting the same encoder again on 'Stay' replaces what it learned from 'Age'
ye = encoder.fit_transform(health_care['Stay'])

# Expand the remaining categorical columns into indicator columns (Xe is modified in place)
for column in dummies_string_columns:
    encode_text_dummy(Xe, column)
    

In [None]:
# True if at least one cell of the encoded feature frame is missing
Xe.isna().to_numpy().any()

## Scaling the one-hot encoded data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Force a float matrix: the dtype of `Xe.values` depends on the column mix
# (integer or object are both possible), and standardised values must not be
# written into a non-float array.
X = Xe.to_numpy(dtype=float)
y = ye
# Fixed random_state so the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# StandardScaler standardises each column independently, so one fit on the
# whole training matrix replaces the per-column loop. The statistics come from
# the training rows only, and `scaler` stays usable for every feature.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Metrics to use for this dataset

In [None]:
def score(y_test, y_pred, cat_number):
    """Return an ordinal closeness score between true and predicted classes.

    The score is ``1 - sum(|y_test - y_pred|) / ((cat_number - 1) * n)`` with
    ``n = len(y_test)``. It is 1.0 when every prediction is exact and 0.0 when
    every prediction is ``cat_number - 1`` classes away from the truth.

    Parameters
    ----------
    y_test : array-like
        True class indices.
    y_pred : array-like
        Predicted class indices, same shape as ``y_test``.
    cat_number : int
        Number of categories; must be at least 2.

    Returns
    -------
    numpy.float64
        The score described above.

    Raises
    ------
    ValueError
        If the inputs are empty, have different shapes, or ``cat_number < 2``.
    """
    # Work in float so that lists are accepted and unsigned integer inputs
    # cannot wrap around when subtracted.
    y_true = np.asarray(y_test, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_hat.shape:
        # Without this check, (n,) against (n, 1) would broadcast silently.
        raise ValueError(
            f"y_test and y_pred must have the same shape, got {y_true.shape} and {y_hat.shape}"
        )
    if y_true.size == 0:
        raise ValueError("score is undefined for empty inputs")
    if cat_number < 2:
        raise ValueError("cat_number must be at least 2")
    total_error = np.abs(y_true - y_hat).sum()
    return 1 - total_error / ((cat_number - 1) * len(y_true))
    

# Classification models to try :

- Decision trees
- K nearest neighbor
- PCA (dimensionality reduction applied before the classifier)
- Random Forest


# Decision Trees

In [None]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# random_state pins the random choices made while growing the tree, so
# refitting on the same data gives the same model.
tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
tree.fit(X_train, y_train)


In [None]:
# Shape of the training label array.
y_train.shape

In [None]:
# Display the training labels (integer codes produced by the LabelEncoder fitted on 'Stay').
y_train

In [None]:
# Predict the encoded 'Stay' class of every held-out row.
y_pred = tree.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted class matches the true class exactly.
acc = accuracy_score(y_test, y_pred)

In [None]:
# Show the hold-out accuracy computed above.
acc

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix between the true and the predicted held-out classes.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Render the confusion matrix computed above as a heat map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
# Ordinal score on the hold-out set. 11 is passed as cat_number - presumably
# the number of distinct 'Stay' classes; TODO confirm against the data.
print(score(y_test, y_pred,11))

In [None]:
from sklearn.base import clone

# Fixed random_state so the shuffled folds are the same on every run.
kf = KFold(5, shuffle=True, random_state=0)

# Every fold trains an unfitted copy of `tree` and uses fold-local names, so
# the hold-out objects from the earlier cells (tree, y_test, y_pred) are not
# overwritten by the cross-validation.
for fold, (train_index, validate_index) in enumerate(kf.split(X, y), start=1):
    fold_tree = clone(tree).fit(X[train_index], y[train_index])
    y_fold_true = y[validate_index]
    y_fold_pred = fold_tree.predict(X[validate_index])
    print('Fold %d' % fold)
    print('Accuracy : %.4f' % accuracy_score(y_fold_true, y_fold_pred))
    print('Score : %.4f' % score(y_fold_true, y_fold_pred, 11))

### Adding PCA


In [None]:
from sklearn.decomposition import PCA
pca = PCA(5)

# Split before fitting anything, so the scaler and the PCA only ever learn
# from the training rows (no information from the test set leaks in).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# PCA is driven by variance, so the features are standardised *before* the
# projection; otherwise the columns with the largest raw magnitude dominate
# the components. Fresh arrays are assigned instead of writing in place.
X_train = pca.fit_transform(scaler.fit_transform(X_train))
X_test = pca.transform(scaler.transform(X_test))

# Projection of the whole data set with the train-fitted transforms, kept
# under its original name.
Xp = pca.transform(scaler.transform(X))



In [None]:
# Refit the decision tree on the current (PCA-projected) training data and
# predict the matching test rows.
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

In [None]:
# Exact-match accuracy of the tree trained on the PCA components.
print(accuracy_score(y_test, y_pred))

In [None]:
# Ordinal score of the same predictions. 11 is passed as cat_number -
# presumably the number of distinct 'Stay' classes; TODO confirm.
print(score(y_test, y_pred,11))

# SVM

In [None]:
from sklearn.svm import SVC
# NOTE(review): C=1E10 with a linear kernel approximates a hard margin. On a
# large or non-separable training set this fit may take extremely long -
# confirm that this value is intended.
model = SVC(kernel='linear', C=1E10)

# Fixed random_state so the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Standardise all columns in one call, using training-set statistics only.
# New arrays are assigned (rather than writing column by column into the
# split arrays) so the result is float whatever the dtype of X is, and
# `scaler` stays fitted on every feature.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model.fit(X_train, y_train)

In [None]:
# Evaluate the SVM on the held-out rows: exact-match accuracy and ordinal score.
y_pred = model.predict(X_test)
print('Accuracy : %.3f' % accuracy_score(y_test, y_pred))
print('Score : %.2f' % score(y_test, y_pred, 11))

# Confusion matrix of the SVM predictions, drawn as a heat map.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

# K-means

In [None]:
from sklearn.cluster import KMeans
# One cluster per expected class. random_state makes the centroid
# initialisation repeatable; n_init is set explicitly because its default
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=11, n_init=10, random_state=0)
kmeans.fit(X_train)

In [None]:
# K-means cluster ids are arbitrary indices, not 'Stay' classes, so comparing
# them with y_test directly is meaningless. Each cluster is labelled with the
# most frequent training class among its members, and the test clusters are
# then translated into those classes.
train_clusters = kmeans.predict(X_train)
# Fallback for a cluster that receives no training row: overall majority class.
fallback_class = np.bincount(y_train).argmax()
cluster_to_class = np.full(kmeans.n_clusters, fallback_class, dtype=y_train.dtype)
for cluster_id in np.unique(train_clusters):
    cluster_to_class[cluster_id] = np.bincount(y_train[train_clusters == cluster_id]).argmax()
y_pred = cluster_to_class[kmeans.predict(X_test)]

In [None]:
# Exact-match accuracy and ordinal score of the clustering-based predictions.
# NOTE(review): only meaningful if y_pred holds class labels rather than raw
# cluster ids - confirm.
print('Accuracy : %.3f' % accuracy_score(y_test, y_pred))
print('Score : %.2f' % score(y_test, y_pred, 11))

In [None]:
from sklearn.cluster import SpectralClustering
# Spectral clustering into 11 groups on a nearest-neighbours affinity graph.
# Note that the name `kmeans` is rebound here, replacing the fitted KMeans model.
kmeans = SpectralClustering(
    n_clusters=11,
    affinity='nearest_neighbors',
    assign_labels='kmeans',
)
kmeans.fit(X_train)