## Testing the new cluster generated by Hannah (10)

In [1]:
!pip install imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline



### Data Preparation

In [2]:
#All import go here
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

#Get and describe data
path = 'temp.csv'
df = pd.read_csv(path)


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pickle

########################################
##Temperoroy dropping of some variables
######################################

df1 = df.drop(['Sexual Orientation','D1','J1','Medical Issue','Nationality','Gender','Marital Status','Intrptr Reqd','L1','Immigration Status'],1)

le = LabelEncoder()
df1['Result'] = le.fit_transform(df1['Result'].values)
print(df1['Result'].value_counts())
#Separate training features from target
X = df1.drop(['Result'], axis=1)
y = df1['Result']

####### 
#Train dataset = X
#Test Dataset = y
########
  
# Convert the following numerical labels from interger to float
ss = StandardScaler()
X[['Session per week','TotalDisabilty','TotalMentalHealth','Time per Session']] = ss.fit_transform(X[['Session per week','TotalDisabilty','TotalMentalHealth','Time per Session']])
output = open('StandardScaler.pkl', 'wb')
pickle.dump(ss, output)
output.close()

# categorical features to be converted to One Hot Encoding
#'Intrptr Reqd','Full Assessment','EET status'
categ = [ 'Acc Type prev', 'B1', 'C1','Economic Status', 'Area', 'Scheme','EET status','Service Type','Religion',]
# One hot encoding the features with more than two categories
ohe = OneHotEncoder()
X_object = X[categ]
X = X.drop(categ,1)
ohe.fit(X_object)
codes = ohe.transform(X_object).toarray()
feature_names = ohe.get_feature_names(categ)
X = pd.concat([X,  pd.DataFrame(codes,columns=feature_names).astype(int)], axis=1)

output = open('OneHotEncoder.pkl', 'wb')
pickle.dump(ohe, output)
output.close()

1    1739
0     485
Name: Result, dtype: int64


In [4]:
for c in df1.columns:
    if(c in ['TotalDisabilty','TotalMentalHealth','Time per Session','Session per week']):
        continue
    print("{} column unique values {}".format(c, df1[c].unique()))
    print("\n")

Results diplayed here !!


### Train Test Split

In [5]:
#Now split the data into train and test Split 
seed = 999
X_Train, X_val, y_Train, y_val = train_test_split(X, y, stratify = y, test_size = 0.2, random_state=seed)

In [8]:
print("Train and Tets data are :")
print(X_Train.shape)
print(X_val.shape)
print("Teh ditribution of teh train data is :")
print(y_Train.value_counts())
print("Teh ditribution of teh test data is :")
print(y_val.value_counts())

Train and Tets data are :
(1779, 72)
(445, 72)
Teh ditribution of teh train data is :
1    1391
0     388
Name: Result, dtype: int64
Teh ditribution of teh test data is :
1    348
0     97
Name: Result, dtype: int64


In [9]:
# Borderline SMOTE with SVM
from imblearn.over_sampling import SVMSMOTE

oversample = SVMSMOTE()
X1, y1 = oversample.fit_resample(X_Train, y_Train)
y1.value_counts()

1    1391
0    1391
Name: Result, dtype: int64

### Fit the model on Training data

In [28]:
###################################################
### Finally training the model with best params ###
###################################################

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

best_model = RandomForestClassifier(criterion='log_loss',max_depth = 6, n_estimators=80)
best_model = best_model.fit(X1,y1)

# Do the Threshold tuning and check for the possible result we can get at any threshold
yhat = best_model.predict_proba(X_val)
probs = yhat[:,1]
# define thresholds
thresholds = np.arange(0, 1, 0.001)
# evaluate each threshold
scores = [f1_score(y_val, to_labels(probs, t), average='macro') for t in thresholds]

# get best threshold
ix = np.argmax(scores)

max_score_on_val = scores[ix]
#final_model_RF = model
y_pred = np.zeros((len(X_val),1), np.uint8)
for j in range(len(X_val)):
    y_pred[j] = to_labels(probs[j], thresholds[ix])
confusion = confusion_matrix(y_val, y_pred)
print("After Threshold Tuning, results are:\n")
print('F1:%.3f Threshold=%.4f' % (scores[ix],thresholds[ix]))
print("\n\n")
print("The confusion matrix for maxium score is :")
print(confusion)
print(classification_report(y_val, y_pred))

After Threshold Tuning, results are:

F1:0.593 Threshold=0.44



The confusion matrix for maxium score is :
[[ 31  66]
 [ 50 298]]
              precision    recall  f1-score   support

           0       0.38      0.32      0.35        97
           1       0.82      0.86      0.84       348

    accuracy                           0.74       445
   macro avg       0.60      0.59      0.59       445
weighted avg       0.72      0.74      0.73       445



In [29]:
## Saving the model to the pickle file 
import pickle

# Optimal Thresh is 0.44
output = open('model_final.pkl', 'wb')
pickle.dump(best_model, output)
output.close()

In [1]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig = plt.figure(figsize=(15, 10))
data = plot_tree(best_model.estimators_[5], 
          feature_names=X1.columns,
          class_names=['0','1'], 
          filled=True, impurity=True, 
          rounded=True)
#fig.savefig('2224_mod_ADAsyn_RF.png', dpi=800)

Tree here !


## Feature Importance

In [216]:
importances_unclustered = best_model.feature_importances_

In [6]:
print("Total features :")
print(len(best_model.feature_names_in_))
print('\n')
best_model.feature_names_in_

Results here 


In [218]:
############for the Data 2224

columns = ['TotalDisabilty', 'TotalMentalHealth', 'Time per Session',
       'Session per week','Acc Type prev','B1', 'C1','Economic Status',
       'Area', 'Scheme', 'EET status', 'Service Type', 'Religion']

############for the Data 2224

one_categ_columns = ['Medical Issue','D1','J1','L1','TotalDisabilty',
       'TotalMentalHealth', 'Time per Session','Session per week']

pointer = 0
importances = []
for cat in columns:
    if(cat in one_categ_columns):
        values = 1
    else:
        values = len(df1[cat].unique())
    importance = sum(importances_unclustered[pointer:(pointer+values)])
    importances.append(importance)
    pointer=pointer+values

In [219]:
print("Total features detected are {} with feature importance sum {}".format(pointer, sum(importances)))

Total features detected are 72 with feature importance sum 0.9999999999999999


In [2]:
from matplotlib import pyplot as plt
forest_importances = pd.Series(importances, index=columns)
forest_importances = forest_importances.sort_values()
fig, ax = plt.subplots()
forest_importances.plot.barh(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

plot_width, plot_height = (20,14)
plt.rcParams['figure.figsize'] = (plot_width,plot_height)
plt.rcParams['font.size']=22

Feature Importances
