In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Reading and investigating the data

In [3]:
# Load the bank dataset. Every later cell reads from `data`, so this line must
# actually run for the notebook to survive Restart & Run All.
data = pd.read_csv('bankdata.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# dtype of each column; used below to separate numeric from categorical features.
data.dtypes

In [None]:
# Preview the first rows of the dataset.
data.head()

### Investigating the numerical variables

In [None]:
# Summary statistics (describe() defaults to the numeric columns of a mixed frame).
data.describe()

#### Investigating variable duration

In [None]:
# Frequency of each distinct `duration` value.
data['duration'].value_counts()

In [None]:
# `duration` is treated as a categorical feature from here on, so its values
# are stored as strings rather than numbers.
data['duration'] = data['duration'].map(str)

In [None]:
# Checking for multicollinearity among the numeric features.
# Restrict to numeric columns explicitly: `data` also holds string columns
# (`status`, and `duration` once it has been converted to strings), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 instead of
# silently skipping those columns.
plt.figure(figsize=(10, 8))
numeric_corr = data.select_dtypes(include=np.number).corr()
ax = sns.heatmap(numeric_corr, annot=True)
plt.show()

`payments` and `amount_left` are highly correlated, so only one of them goes into the model: we keep `payments` and drop `amount_left`.

In [None]:
# Distribution of the account balance before any scaling.
# sns.distplot is deprecated (seaborn >= 0.11); a density-scaled histplot with
# a KDE overlay is its documented replacement.
# Swap in 't_amount' or 'payments' to inspect those columns the same way.
sns.histplot(data['balance'], kde=True, stat='density')
plt.show()

### Preprocessing the numerical variables

In [None]:
# Standardise the numeric features (StandardScaler: zero mean, unit variance
# per column). `amount_left` is left out of the numeric feature set.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test-set statistics influence the scaling - confirm this
# is acceptable for the analysis.
from sklearn.preprocessing import StandardScaler

data_num = data.select_dtypes(include=np.number).drop(columns='amount_left')
print(data_num.head())

# Fit and apply the scaler in one step; `transformer` keeps the fitted scaler.
transformer = StandardScaler()
data_num_standardized = transformer.fit_transform(data_num)
x = pd.DataFrame(data_num_standardized, columns=data_num.columns)
print(x.head())

In [None]:
# Raw balance distribution, shown again for comparison with the standardised one.
# sns.distplot is deprecated (seaborn >= 0.11); a density-scaled histplot with
# a KDE overlay is its documented replacement.
sns.histplot(data['balance'], kde=True, stat='density')
plt.show()

In [None]:
# Standardised balance: the shape of the distribution is unchanged, only the
# scale of the axis differs.
# sns.distplot is deprecated (seaborn >= 0.11); a density-scaled histplot with
# a KDE overlay is its documented replacement.
sns.histplot(x['balance'], kde=True, stat='density')
plt.show()

### Getting dummies for the categorical variables

In [None]:
# One-hot encode the categorical (object-dtype) predictors so that each
# category becomes its own 0/1 dummy column. `status` is the dependent
# variable and is therefore excluded from the predictors.
from sklearn.preprocessing import OneHotEncoder

# The builtin `object` replaces the `np.object` alias, which was removed in
# NumPy 1.24 (it had been deprecated since 1.20).
cat = data.select_dtypes(include=object).drop(columns='status')
enc = OneHotEncoder()
encoded = enc.fit_transform(cat).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old method only on releases that predate it. Passing the
# column names labels each dummy as `<column>_<category>` rather than with the
# positional `x0_`, `x1_`, ... prefixes.
if hasattr(enc, 'get_feature_names_out'):
    dummy_names = enc.get_feature_names_out(cat.columns)
else:
    dummy_names = enc.get_feature_names(cat.columns)

categorical = pd.DataFrame(encoded, columns=dummy_names)
categorical.head()

In [None]:
# Sanity check: every row has exactly one active dummy per encoded variable,
# so each row sum should equal the number of categorical predictors.
categorical.sum(axis = 1)

In [None]:
# Assemble the modelling inputs: `y` is the target column, `X` places the
# standardised numeric features and the dummy columns side by side.
y = data['status']
X = pd.concat([x, categorical], axis='columns')

#### Splitting into train and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing. Fixing random_state seeds the shuffle,
# so the same train/test split is produced on every run.
# (Time-series data should instead be split by taking a contiguous subset,
# not by shuffling.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=100,
)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split; max_iter=500 caps the
# number of solver iterations.
logisticr = (
    LogisticRegression(max_iter=500)
    .fit(X_train, y_train)
)



In [None]:
# Predict the test-set classes, then report the model's score on the test split.
predictions = logisticr.predict(X_test)
logisticr.score(X_test, y_test)
# The score is accuracy, i.e. the share of test rows classified correctly
# (noted as 88% in the original run - re-check after re-executing).

In [None]:
# Actual class distribution in the test set, for comparison with the predictions.
print(y_test.value_counts())

In [None]:
# Predicted class distribution; compare with the actual counts of y_test.
# NOTE(review): the original note here read "more likely" - presumably the
# model favours the more frequent class; confirm against the output.
pd.Series(predictions).value_counts()

What happened in terms of the classes?

### Decision Tree

In [None]:
# Unlike logistic regression, a decision tree can capture non-linear relationships.
from sklearn.tree import DecisionTreeClassifier

# Fix random_state (same seed as the train/test split): the tree permutes the
# candidate features at each split, so without a seed the fitted tree and its
# feature importances can differ from run to run.
decisiontree = DecisionTreeClassifier(random_state=100).fit(X_train, y_train)

In [None]:
# Predict the test-set classes with the tree and report its score on the test split.
# Note: this rebinds `predictions`, replacing the logistic-regression predictions.
predictions = decisiontree.predict(X_test)
decisiontree.score(X_test, y_test)

In [None]:
# Class distribution of the current `predictions` array.
pd.Series(predictions).value_counts()

In [None]:
# Feature importances: one score per column of X, in column order. A higher
# value means the feature mattered more to the fitted tree. They answer a
# question loosely similar to significance in a regression ("which features
# matter most?"), but they are not p-values.
decisiontree.feature_importances_

In [None]:
# Pair each feature name with its importance score for easier reading.
importance_by_feature = {
    feature: score
    for feature, score in zip(X.columns, decisiontree.feature_importances_)
}
print(importance_by_feature)

In [None]:
# Bar chart of the importances, one bar per feature. The default figure size is
# widened first (a global setting, so it also applies to figures created
# afterwards) to make room for the feature labels.
plt.rcParams.update({'figure.figsize': [25, 6]})
plt.bar(X.columns, decisiontree.feature_importances_)
plt.show()