### Importing The Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

### Loading the data from csv file to a Pandas DataFrame

In [None]:
# Load the raw dataset from its CSV export into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows location.
csv_path = "D:/New folder/Suzlon/ProstheticSystemsDataSet.csv"
df = pd.read_csv(csv_path)

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
df.info()

## Performing EDA

In [None]:
# Preview the first rows of the dataset.
df.head()

### Finding Null values

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Impute missing family sizes with the most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: the in-place form is chained
# assignment, which is deprecated and does not modify `df` once pandas
# copy-on-write is active.
family_mode = df['Family members'].mode()[0]
df['Family members'] = df['Family members'].fillna(family_mode)

# Re-check that no missing values remain.
df.isnull().sum()

In [None]:
# Show every row that repeats an earlier row exactly.
is_duplicate = df.duplicated()
df.loc[is_duplicate]

### Renaming the columns

In [None]:
# Map the verbose CSV headers to short, identifier-friendly column names.
column_renames = {
    'Age (in years)': 'Age',
    'Experience (in years)': 'Experience',
    'Income (in K/month)': 'Income',
    'ZIP Code': 'ZIP_Code',
    'Family members': 'Family_members',
    'Personal Loan': 'Personal_Loan',
    'Securities Account': 'Securities_Account',
    'CD Account': 'CD_Account',
}
df = df.rename(columns=column_renames)
df.head()

### Performing the Univariate Analysis

In [None]:
# Class balance of the target: one bar per Personal_Loan value.
plt.style.use('fivethirtyeight')
ax = sns.countplot(
    data=df,
    x='Personal_Loan',
    hue='Personal_Loan',
)

In [None]:
# Absolute number of rows per Personal_Loan class.
df['Personal_Loan'].value_counts()

In [None]:
# Relative frequency of each Education level, drawn as a bar chart.
education_share = df['Education'].value_counts(normalize=True)
education_share.plot(kind='bar', title='Education')

In [None]:
# Age distribution: density-normalised histogram with a KDE curve on top.
# sns.histplot replaces the deprecated sns.distplot; kde=True together with
# stat='density' keeps the same histogram-plus-density-curve layout.
sns.histplot(df['Age'], kde=True, stat='density', color='red')

In [None]:
# Experience distribution: density-normalised histogram with a KDE curve.
# sns.histplot replaces the deprecated sns.distplot; kde=True together with
# stat='density' keeps the same histogram-plus-density-curve layout.
sns.histplot(df['Experience'], kde=True, stat='density', color='red')

In [None]:
# Relative frequency of each family size, drawn as a bar chart.
family_share = df['Family_members'].value_counts(normalize=True)
family_share.plot(kind='bar', title='Family_members')

In [None]:
# 3x2 grid: left column shows each variable's distribution (histogram + KDE),
# right column shows the matching box plot for spotting outliers.
fig, axes = plt.subplots(3, 2, figsize=(20, 20))

# sns.histplot replaces the deprecated sns.distplot. The box plots pass the
# column through the explicit `x=` keyword: the original positional form
# (and `df['Income']` combined with `data=df`) is read as the `data` argument
# by current seaborn and raises "multiple values for argument 'data'".
sns.histplot(df['Income'], kde=True, stat='density', color='orange', ax=axes[0, 0])
sns.boxplot(x=df['Income'], ax=axes[0, 1])

sns.histplot(df['CCAvg'], kde=True, stat='density', color='blue', ax=axes[1, 0])
sns.boxplot(x=df['CCAvg'], color='red', ax=axes[1, 1])

sns.histplot(df['Mortgage'], kde=True, stat='density', color='green', ax=axes[2, 0])
sns.boxplot(x=df['Mortgage'], color='red', ax=axes[2, 1])

In [None]:
# Relative frequency of each CreditCard value, drawn as a bar chart.
creditcard_share = df['CreditCard'].value_counts(normalize=True)
creditcard_share.plot(kind='bar', title='CreditCard')


In [None]:
# Relative frequency of each Online value, drawn as a bar chart.
online_share = df['Online'].value_counts(normalize=True)
online_share.plot(kind='bar', title='Online')

In [None]:
# Relative frequency of each Securities_Account value, drawn as a bar chart.
securities_share = df['Securities_Account'].value_counts(normalize=True)
securities_share.plot(kind='bar', title='Securities_Account')

In [None]:
# Relative frequency of each CD_Account value, drawn as a bar chart.
cd_account_share = df['CD_Account'].value_counts(normalize=True)
cd_account_share.plot(kind='bar', title='CD_Account')

### Performing Bivariate Analysis

In [None]:
# Stacked bar chart: share of each Personal_Loan class within every
# Education level.
plt.rcParams['figure.figsize'] = [15, 8]
# plt.style.use returns None, so its result is no longer bound to `ax`;
# `ax` now refers to the Axes actually drawn on.
plt.style.use('fivethirtyeight')

table = pd.crosstab(df.Education, df.Personal_Loan)
# Divide every row by its total so the stacked segments sum to 1.
row_shares = table.div(table.sum(axis=1).astype(float), axis=0)

ax = row_shares.plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of Education vs Personal_Loan', fontsize=10)
ax.set_xlabel('Education')
ax.set_ylabel('Personal_Loan')
plt.show()

In [None]:
# Stacked bar chart: share of each Personal_Loan class within every
# Securities_Account value.
plt.rcParams['figure.figsize'] = [15, 8]
# plt.style.use returns None, so its result is no longer bound to `ax`;
# `ax` now refers to the Axes actually drawn on.
plt.style.use('fivethirtyeight')

table = pd.crosstab(df.Securities_Account, df.Personal_Loan)
# Divide every row by its total so the stacked segments sum to 1.
row_shares = table.div(table.sum(axis=1).astype(float), axis=0)

ax = row_shares.plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of Securities_Account vs Personal_Loan', fontsize=10)
ax.set_xlabel('Securities_Account')
ax.set_ylabel('Personal_Loan')
plt.show()

In [None]:
# Stacked bar chart: share of each Personal_Loan class within every
# CD_Account value.
plt.rcParams['figure.figsize'] = [15, 8]
# plt.style.use returns None, so its result is no longer bound to `ax`;
# `ax` now refers to the Axes actually drawn on.
plt.style.use('fivethirtyeight')

table = pd.crosstab(df.CD_Account, df.Personal_Loan)
# Divide every row by its total so the stacked segments sum to 1.
row_shares = table.div(table.sum(axis=1).astype(float), axis=0)

ax = row_shares.plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of CD_Account vs Personal_Loan', fontsize=10)
ax.set_xlabel('CD_Account')
ax.set_ylabel('Personal_Loan')
plt.show()


In [None]:
# Stacked bar chart: share of each Personal_Loan class within every
# Online value.
plt.rcParams['figure.figsize'] = [15, 8]
# plt.style.use returns None, so its result is no longer bound to `ax`;
# `ax` now refers to the Axes actually drawn on.
plt.style.use('fivethirtyeight')

table = pd.crosstab(df.Online, df.Personal_Loan)
# Divide every row by its total so the stacked segments sum to 1.
row_shares = table.div(table.sum(axis=1).astype(float), axis=0)

ax = row_shares.plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of Online vs Personal_Loan', fontsize=10)
ax.set_xlabel('Online')
ax.set_ylabel('Personal_Loan')
plt.show()

In [None]:
# Stacked bar chart: share of each Personal_Loan class within every
# CreditCard value.
plt.rcParams['figure.figsize'] = [15, 8]
# plt.style.use returns None, so its result is no longer bound to `ax`;
# `ax` now refers to the Axes actually drawn on.
plt.style.use('fivethirtyeight')

table = pd.crosstab(df.CreditCard, df.Personal_Loan)
# Divide every row by its total so the stacked segments sum to 1.
row_shares = table.div(table.sum(axis=1).astype(float), axis=0)

ax = row_shares.plot(kind='bar', stacked=True)
ax.set_title('Stacked Bar Chart of CreditCard vs Personal_Loan', fontsize=10)
ax.set_xlabel('CreditCard')
ax.set_ylabel('Personal_Loan')
plt.show()


## Outlier Detection using IsolationForest

In [None]:
# Fit an Isolation Forest on every column of the frame.
# contamination=0.01 sets the expected share of outliers to 1%.
clf = IsolationForest(
    contamination=0.01,
    random_state=50,
)
clf.fit(df)

### Predicting outliers

In [None]:
# Label every row with the fitted forest; the cells below treat -1 as outlier.
y_pred_outliers = clf.predict(df)

In [None]:

# Store the outlier label of every row in a new 'anomaly' column.
# Prediction uses (at most) the first 15 columns of the frame.
# NOTE(review): clf was fitted on the whole frame, so this presumably relies
# on df having no more than 15 columns at this point - confirm.
feature_block = df.iloc[:, :15]
df['anomaly'] = clf.predict(feature_block)

In [None]:
# Display the frame, now including the 'anomaly' column.
df

In [None]:
# Show only the rows flagged as outliers (anomaly == -1).
flagged = df['anomaly'] == -1
df.loc[flagged]

### Removing Outliers

In [None]:
# Remove the rows flagged as outliers (anomaly == -1).
# DataFrame.drop returns a new frame, so the result must be assigned back;
# the original call discarded it and left every outlier in `df`.
outlier_index = df.index[df['anomaly'] == -1]
df = df.drop(outlier_index)
df

### Splitting the DataSet

In [None]:
# Separate the target column from the predictors.
# Every other column that is still in df at this point stays in X.
y = df['Personal_Loan']
X = df.drop(columns=['Personal_Loan'])

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split

# Hold out 30% of the rows as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1210,
)

## Balancing The Data

In [None]:
# Balance the training split by randomly under-sampling the majority class.
# A fixed random_state makes the sampled subset reproducible between runs
# (the original sampler was unseeded).
rus = RandomUnderSampler(random_state=101)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

# check the balanced data
print(pd.Series(y_rus).value_counts())

# plot the new distribution
# The labels are passed through `x=` and the target Axes through `ax=`:
# current seaborn reads a bare positional argument as `data`, not `x`.
fig, ax = plt.subplots(figsize=(7, 4))
ax = sns.countplot(x=y_rus, ax=ax)
plt.tight_layout()

In [None]:
from yellowbrick.features import Rank1D
# Rank each feature on its own with the 'shapiro' algorithm and plot the
# resulting scores.
# NOTE(review): 'shapiro' is presumably the Shapiro-Wilk normality score -
# confirm against the yellowbrick documentation.
visualizer = Rank1D(algorithm='shapiro')
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.show()


## Plotting Correlation HeatMap

In [None]:
# Pairwise correlation of all columns, drawn on a fixed [-1, 1] colour scale.
plt.figure(figsize=(16, 6))
correlations = df.corr()
heatmap = sns.heatmap(
    correlations,
    vmin=-1,
    vmax=1,
    annot=False,
)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

### Removing unwanted columns

In [None]:
# Discard the columns that are not used as model features.
unwanted_columns = ['ID', 'ZIP_Code', 'Family_members', 'Online']
df = df.drop(columns=unwanted_columns)

In [None]:
# The helper 'anomaly' label must not become a model feature.
df = df.drop(columns=['anomaly'])


In [None]:
# Display the frame that is used for modelling.
df

# Model building
# 1. Decision Tree

In [None]:
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import preprocessing 
from sklearn import metrics

In [None]:
# Predictors: every remaining column except the target.
x = df.drop(columns=['Personal_Loan'])

In [None]:
# Target: the Personal_Loan column.
y = df.loc[:, 'Personal_Loan']

In [None]:
# Hold out 20% of the original (un-resampled) rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Shuffled 10-fold stratified splitter used for cross-validation.
sk = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=101,
)

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Rebalance the classes with SMOTE over-sampling followed by Tomek-link
# cleaning, then show the shapes before and after resampling.
# NOTE(review): the whole dataset is resampled here, before any
# train/validation split, so synthetic rows derived from validation rows can
# reach the training data - consider resampling the training split only.
smk = SMOTETomek(random_state=101)
X1_res, y1_res = smk.fit_resample(x, y)
(x.shape, y.shape, X1_res.shape, y1_res.shape)

In [None]:
# Split the resampled data 80/20 into training and validation parts.
X1_train, X1_val, y1_train, y1_val = train_test_split(
    X1_res,
    y1_res,
    test_size=0.2,
    random_state=100,
)

In [None]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Exhaustive hyper-parameter search for the decision tree on the resampled
# data.
dtmodel1 = DecisionTreeClassifier(random_state=101)
a = list(range(3, 11))  # candidate values 3..10, shared by all size parameters
parameter = [{
    'max_depth': a,
    'max_leaf_nodes': a,
    'criterion': ['entropy', 'gini'],
    'min_samples_split': a,
    'min_samples_leaf': a,
}]
# The grid has 8**4 * 2 = 8,192 candidates, each fitted once per CV fold.
# n_jobs=-1 runs the fits on all available cores; the selected parameters
# are the same as for a serial search.
gridparam = GridSearchCV(dtmodel1, parameter, n_jobs=-1)
gridparam.fit(X1_res, y1_res)

In [None]:
# Parameter combination with the best mean cross-validated score.
gridparam.best_params_

In [None]:
# Decision tree with fixed hyper-parameters: fit on all resampled data and
# report the mean score over the 10 stratified folds.
tree_params = dict(
    max_depth=6,
    criterion='gini',
    max_leaf_nodes=9,
    min_samples_split=3,
    min_samples_leaf=3,
)
model = DecisionTreeClassifier(random_state=101, **tree_params)
model.fit(X1_res, y1_res)
result = cross_val_score(model, X1_res, y1_res, cv=sk)
result.mean()

In [None]:
# Same tree configuration, trained on the training part only, to compare
# training accuracy against validation accuracy.
model_1 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    max_leaf_nodes=9,
    min_samples_leaf=3,
    min_samples_split=3,
    random_state=101,
)
model_1.fit(X1_train, y1_train)
result1 = model_1.score(X1_train, y1_train)
result2 = model_1.score(X1_val, y1_val)
(result1, result2)

In [None]:
# Per-class precision, recall and F1 of the tree on the validation part.
dt_val_predictions = model_1.predict(X1_val)
print(classification_report(y1_val, dt_val_predictions))

In [None]:
# ROC AUC of the tree, scored with the predicted probability in column 1
# of predict_proba.
dt_val_scores = model_1.predict_proba(X1_val)[:, 1]
roc_auc_score(y1_val, dt_val_scores)

In [None]:
# Confusion table of the tree: actual labels (rows) vs predictions (columns).
dt_val_labels = model_1.predict(X1_val)
pd.crosstab(y1_val, dt_val_labels)

## 2. Random Forest

In [None]:
# Search space and base estimator for the random-forest grid search.
a = range(2, 10)  # candidate values 2..9, shared by all size parameters
parameter = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': a,
    'max_leaf_nodes': a,
    'min_samples_split': a,
    # 'auto' is no longer accepted by RandomForestClassifier (removed in
    # scikit-learn 1.3) and was an alias of 'sqrt' for classifiers, so
    # dropping it loses no candidate while avoiding an invalid-parameter
    # error on current versions.
    'max_features': ['sqrt', 'log2'],
}]
# A fixed seed keeps the search reproducible, matching the other models here.
Rmodel = RandomForestClassifier(random_state=101)

In [None]:
# Exhaustive hyper-parameter search for the random forest on the resampled
# data. The grid holds thousands of candidates, each fitting a full forest
# per CV fold, so n_jobs=-1 spreads the fits over all available cores
# instead of running them serially.
gridRF = GridSearchCV(Rmodel, parameter, n_jobs=-1)
gridRF.fit(X1_res, y1_res)
gridRF.best_params_

In [None]:
# Random forest with fixed hyper-parameters: fit on all resampled data and
# report the mean score over the 10 stratified folds.
forest_params = dict(
    n_estimators=500,
    criterion='entropy',
    max_depth=8,
    max_leaf_nodes=9,
    min_samples_split=5,
)
model2 = RandomForestClassifier(random_state=101, **forest_params)
model2.fit(X1_res, y1_res)
result3 = cross_val_score(model2, X1_res, y1_res, cv=sk)
result3.mean()

In [None]:
# Same forest configuration, trained on the training part only, to compare
# training accuracy against validation accuracy.
model2_1 = RandomForestClassifier(
    criterion='entropy',
    max_depth=8,
    max_leaf_nodes=9,
    min_samples_split=5,
    n_estimators=500,
    random_state=101,
)
model2_1.fit(X1_train, y1_train)
result4 = model2_1.score(X1_train, y1_train)
result5 = model2_1.score(X1_val, y1_val)
(result4, result5)

In [None]:
# Per-class precision, recall and F1 of the forest on the validation part.
rf_val_predictions = model2_1.predict(X1_val)
print(classification_report(y1_val, rf_val_predictions))

In [None]:
# ROC AUC of the forest, scored with the predicted probability in column 1
# of predict_proba.
rf_val_scores = model2_1.predict_proba(X1_val)[:, 1]
roc_auc_score(y1_val, rf_val_scores)

In [None]:
# Confusion table of the forest: actual labels (rows) vs predictions (columns).
rf_val_labels = model2_1.predict(X1_val)
pd.crosstab(y1_val, rf_val_labels)