In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('whitegrid')
PALETTE='RdBu_r'

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_excel('../output/reduced_data.xlsx')

# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=['Target'], axis=1),
    df['Target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((2081, 12), (892, 12))

##### Logistic Regression

In [67]:
scaler = StandardScaler()
lr = LogisticRegression(solver = 'lbfgs', random_state=42)
lr.fit(scaler.fit_transform(X_train), y_train)
y_pred = lr.predict(X_train)

print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.92      0.79      1373
           1       0.54      0.17      0.26       708

    accuracy                           0.67      2081
   macro avg       0.61      0.55      0.52      2081
weighted avg       0.63      0.67      0.61      2081

[[1270  103]
 [ 588  120]]


##### Random Forest Classifier

In [70]:
rf = RandomForestClassifier(criterion='gini',
                           n_estimators=10,
                           random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1373
           1       0.99      0.95      0.97       708

    accuracy                           0.98      2081
   macro avg       0.98      0.97      0.98      2081
weighted avg       0.98      0.98      0.98      2081



##### Support Vector Machine

In [71]:
from sklearn.svm import SVC

In [82]:
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(scaler.fit_transform(X_train), y_train)

y_pred = svm.predict(X_train)

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.92      0.79      1373
           1       0.54      0.17      0.26       708

    accuracy                           0.67      2081
   macro avg       0.61      0.55      0.52      2081
weighted avg       0.63      0.67      0.61      2081



##### KNN

In [75]:
from sklearn.neighbors import KNeighborsClassifier

In [79]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(scaler.fit_transform(X_train), y_train)
y_pred = knn.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.95      0.80      1373
           1       0.67      0.19      0.29       708

    accuracy                           0.69      2081
   macro avg       0.68      0.57      0.55      2081
weighted avg       0.69      0.69      0.63      2081



In [80]:
knn.score(X_train, y_train)

0.691975012013455

In [81]:
svm.score(X_train, y_train)

0.6674675636713119

In [83]:
df

Unnamed: 0,v2a1,rooms,escolari,overcrowding,SQBdependency,roof,escolari-min,escolari-max,escolari-sum,escolari-std,age-min,age-std,Target
0,190000,3,10,1.000000,0.000000,0,10,10,10,0.000000,43,0.000000,0
1,135000,4,12,1.000000,64.000000,1,12,12,12,0.000000,67,0.000000,0
2,0,8,11,0.500000,64.000000,2,11,11,11,0.000000,92,0.000000,0
3,180000,5,11,1.333333,1.000000,2,2,11,33,4.272002,8,14.899664,0
4,130000,2,9,4.000000,1.000000,0,0,11,23,5.123475,7,11.690452,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2968,0,4,5,2.500000,2.250000,1,0,5,9,2.489980,6,11.489125,1
2969,0,3,2,2.500000,2.250000,0,0,6,17,2.607681,2,16.133815,1
2970,46500,5,2,2.333333,0.444444,1,0,11,30,5.049752,2,18.753666,0
2971,0,3,0,2.000000,1.000000,2,0,6,6,4.242641,61,4.242641,1
