Import necessary libraries.

In [1]:
import time
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

Load data from the input csv file.

In [2]:
df = pd.read_csv('../data/data_exercice.csv', header=0)
df.head()

Unnamed: 0,TARGET,F_NB_CONTRACT_PL,F_NB_PREVIOUS_PL,F_DELAY_LAST_PL,F_DELAY_FIRST_PL,F_TODU_B2C_SUM,F_TODU_B2B_SUM,F_TODU_BANKING PARTNER_SUM,F_TODU_BROKER_SUM,F_TODU_AUTOMOTIVE_SUM,...,F_DELAY_LAST_RC,F_DELAY_FIRST_RC,EDUCATION,MARITAL_STATUS,MONTHLY_INCOME,DWELLING,AGE,JOB_TITLE,SCO_CRIF_B2B_LOAN,SCO_CRIF_B2C_LOAN
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,university degree,married,3139.0,owner,67.0,pensioner,3.0,2.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,university degree,married,3139.0,owner,68.0,pensioner,3.0,2.0
2,0,1.0,1.0,186.0,186.0,2085.04,0.0,0.0,0.0,0.0,...,66.0,21.0,unknown,married,1510.0,owner with mortage,59.0,labourer,2.0,3.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,unknown,married,3150.0,owner with mortage,59.0,teacher,3.0,2.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,unknown,married,3150.0,owner with mortage,59.0,teacher,3.0,2.0


Use Pandas *load_dummies* to transform category columns into indicators.

In [3]:
data = pd.get_dummies(df)
data.head()
headers = list(data.columns.values)

Get the labels for the target column (*'TARGET'*) and for the training data (*all other columns*).

Use *train_test_split* to randomly select 35% of the available data to be use for training, leaving 75% of the data for checking the accuracy of the classifier.

Missing values in both the training and test data will be filled using the mean of the corresponding column in the training data.

In [4]:
y = data['TARGET']
X = data[[label for label in headers if label != 'TARGET']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_train.mean())

Initialize the RandomForestClassifier (150 trees with a max tree depth of 25) and train it using the selected training data.

In [5]:
st = time.time()
classifier = RandomForestClassifier(
    n_estimators=150,
    max_depth=25,
)

classifier.fit(X_train, y_train)

print(f'Time spent training: {round(time.time() - st, 2)}s')

Time spent training: 25.19s


Use the classifier to predict the TARGET for the test data.

Evaluate the accuracy of the prediction using *metrics.accuracy_score* to compare the actual and predicted TARGET values.


In [6]:
y_pred = classifier.predict(X_test)

print(f'RFC Accuracy: {metrics.accuracy_score(y_test, y_pred)}')

from collections import Counter
expected_cnt = Counter(y_test)
print(f'Expected {expected_cnt[1]}/{len(y_test)} 1s')
predicted_cnt = Counter(y_pred)
print(f'Got {predicted_cnt[1]}/{len(y_pred)} 1s')

print(f'"Just guess 0" Accuracy: {metrics.accuracy_score(y_test, [0]*len(y_pred))}')

RFC Accuracy: 0.9691134234191817
Expected 2605/84956 1s
Got 19/84956 1s
"Just guess 0" Accuracy: 0.9693370686002166


Get the prediction probability values and use them with the expected TARGET values to compute the *Area Under the ROC Curve*.

In [7]:
probs = classifier.predict_proba(X_test)
auc = metrics.roc_auc_score(y_test, probs[:, 1])
print(f'ROC AUC: {auc}')



ROC AUC: 0.7352171528496148
