## Loading libraries

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.datasets import load_boston

In [2]:
# Read the three CSV files (numerical features, categorical features, targets)
# in one pass; the names on the left are the ones the following cells rely on.
numerical, categorical, targets = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)
# Show how many rows fall into each value of the binary target TARGET_B.
print(targets['TARGET_B'].value_counts())

0    90569
1     4843
Name: TARGET_B, dtype: int64


As we can see, the two categories are heavily imbalanced: category 1 appears only 4,843 times (about 5% of the rows), while category 0 appears 90,569 times.


## Downsampling

In [3]:
# Put features and targets side by side so that rows can be filtered by class.
data = pd.concat([numerical, categorical, targets], axis=1)

category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]

# Downsampling: pick a random sample of the observations belonging to "category_0",
# of the same size as the number of observations belonging to "category_1".
# random_state pins the draw so a Restart & Run All reproduces the same subset.
category_0 = category_0.sample(len(category_1), random_state=0)
print(category_0.shape)
print(category_1.shape)

data2 = pd.concat([category_0, category_1], axis=0)
# Shuffle the rows so the two classes are interleaved (seeded for reproducibility).
data2 = data2.sample(frac=1, random_state=0)
data2['TARGET_B'].value_counts()

(4843, 339)
(4843, 339)


1    4843
0    4843
Name: TARGET_B, dtype: int64

## Upsampling

In [4]:
# Remove TARGET_D from the working frame. errors='ignore' keeps this cell
# re-runnable: on a second run `data` no longer has the column and a plain
# drop would raise KeyError.
data = data.drop(columns=['TARGET_D'], errors='ignore')
category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]

# Upsampling: "category_1" is resampled until it has as many rows as "category_0".
# Because observations are repeated, the same row has to be picked more than once,
# therefore we need the keyword replace=True.
# random_state pins the draw so the result is reproducible.
category_1 = category_1.sample(len(category_0), replace=True, random_state=0)
print(category_1.shape)

data3 = pd.concat([category_0, category_1], axis=0)
# Shuffle the rows so the two classes are interleaved (seeded for reproducibility).
data3 = data3.sample(frac=1, random_state=0)
print(data3['TARGET_B'].value_counts())

(90569, 338)
1    90569
0    90569
Name: TARGET_B, dtype: int64


## SMOTE

In [5]:
# Separate the binary target from the features.
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

# Split the features by dtype: numeric columns are used as they are,
# object (string) columns are one-hot encoded below.
numericalX = X.select_dtypes(include=np.number)
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` selects exactly the same columns.
categorcalX = X.select_dtypes(include=object)

from sklearn.preprocessing import OneHotEncoder

# drop='first' leaves out the first category of every feature.
encoder = OneHotEncoder(drop='first').fit(categorcalX)
encoded_categorical = encoder.transform(categorcalX).toarray()
# Carry over the original row index: pd.concat(axis=1) aligns on the index, so a
# fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever `data`
# does not already have a default RangeIndex.
encoded_categorical = pd.DataFrame(encoded_categorical, index=categorcalX.index)
X = pd.concat([numericalX, encoded_categorical], axis=1)

In [6]:
from imblearn.over_sampling import SMOTE

# SMOTE can only handle numerical features. Therefore, if we wanted to use the
# categoricals they would need to be dummified first. However, for the sake of
# simplicity we do not include the categoricals here.
#
# NOTE(review): the oversampling below is applied to the whole data set, i.e.
# before any train/test split, and TARGET_D is still one of the columns of X at
# this point. Both can make a later hold-out score look better than it is;
# consider splitting first and oversampling only the training part.

data = pd.concat([numerical, targets], axis=1)
# random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state=0)
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)
# fit_resample replaces the old fit_sample alias, which was removed in
# imbalanced-learn 0.8.
X_sm, y_sm = smote.fit_resample(X, y)

# Print the number of rows per class after resampling.
unique, counts = np.unique(y_sm, return_counts=True)
print(np.asarray((unique, counts)).T)

[[    0 90569]
 [    1 90569]]


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the resampled rows for testing; random_state=0 fixes the
# partition so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.25,
    random_state=0,
)

In [8]:
# Make sure both feature sets are DataFrames so columns can be addressed by name.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

# Keep TARGET_D aside under the *_regression names before it is removed from the features.
y_train_regression, y_test_regression = X_train['TARGET_D'], X_test['TARGET_D']

# TARGET_D must not stay among the features, so remove it from both sets.
X_train = X_train.drop(columns=['TARGET_D'])
X_test = X_test.drop(columns=['TARGET_D'])

In [9]:
# Display the (rows, columns) shape of the training features as a quick sanity check.
X_train.shape

(135853, 315)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from scipy.stats import t, norm

# Hold-out evaluation: fit on the training part, score on the test part.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print("The accuracy of the Random forest is: {:4.2f}".format(clf.score(X_test, y_test)))
print()

alpha = 0.05  # significance level -> (1 - alpha) confidence interval
K = 10        # number of cross-validation folds
# For cross validation
clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=K)

# Standard error of the mean fold accuracy. The fold scores computed above are
# reused instead of running the whole cross-validation a second time, and the
# sample standard deviation (ddof=1) is used because the spread is estimated
# from only K scores.
standard_error = np.std(cross_val_scores, ddof=1) / np.sqrt(K)

if (K < 30):
    # Few folds: use the Student t distribution with K-1 degrees of freedom.
    t_critical = t.ppf(1 - alpha / 2, K - 1)
    interval = t_critical * standard_error
else:
    # Many folds: the normal approximation is used instead.
    z_critical = norm.ppf(1 - alpha / 2)
    interval = z_critical * standard_error
print("The accuracy of the Random Forest model (CV witk K={}) is: {:4.2f} +/- {:4.2f}".format(K,np.mean(cross_val_scores),interval))

The accuracy of the Random forest is: 0.76

The accuracy of the Random Forest model (CV witk K=10) is: 0.76 +/- 0.00
