In [135]:
import pandas as pd
from pandas import get_dummies

import numpy as np

from imblearn.over_sampling import SMOTE 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [136]:
df = pd.read_csv("train.csv")
train_labels = df['Risk']
train_data = df.drop('Risk',axis = 'columns')
train_data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,47,male,2,free,,little,8335,36,car
1,38,male,2,own,little,,804,12,radio/TV
2,28,male,2,own,little,little,5371,36,furniture/equipment
3,29,female,0,own,,moderate,3990,36,domestic appliances
4,24,female,2,own,,moderate,8487,48,car


In [137]:
test_df = pd.read_csv("test.csv")
test_labels = test_df['Risk']
test_data = test_df.drop('Risk', axis='columns')
test_data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,36,male,2,own,rich,moderate,1913,18,business
1,34,male,3,own,little,moderate,1860,12,car
2,48,male,2,own,little,little,1024,24,radio/TV
3,31,male,2,own,little,little,3104,18,business
4,23,male,1,own,quite rich,moderate,2520,27,radio/TV
...,...,...,...,...,...,...,...,...,...
195,53,male,2,free,little,little,7119,48,furniture/equipment
196,42,male,2,own,,moderate,2427,18,business
197,25,male,2,own,little,,1262,12,radio/TV
198,31,male,2,own,,little,6350,30,furniture/equipment


In [138]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               800 non-null    int64 
 1   Sex               800 non-null    object
 2   Job               800 non-null    int64 
 3   Housing           800 non-null    object
 4   Saving accounts   656 non-null    object
 5   Checking account  484 non-null    object
 6   Credit amount     800 non-null    int64 
 7   Duration          800 non-null    int64 
 8   Purpose           800 non-null    object
dtypes: int64(4), object(5)
memory usage: 56.4+ KB


In [139]:
missing_value_cols = ['Saving accounts','Checking account']

In [140]:
train_data[missing_value_cols] = train_data[missing_value_cols].replace(np.nan,'NONE')
test_data[missing_value_cols] = test_data[missing_value_cols].replace(np.nan,'NONE')

In [141]:
train_data.dtypes

Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
dtype: object

In [142]:
# for value in train_data['Credit amount']:
#     if value > 10000: value = 10000


# train_data['Credit amount'] = train_data['Credit amount'].map(lambda x: 10000 if x > 10000 else x)
# train_data['Credit amount'] = np.log(train_data['Credit amount']+1)

# print(train_data['Credit amount'].unique())

In [143]:
numeric_cols = ['Job','Credit amount','Duration']
categorical_cols = ['Sex','Housing','Saving accounts','Checking account','Purpose', 'Age']

In [144]:
def age_transform(value):
    if value <= 25: value = 1
    elif value <= 40 : value = 2
    elif value <50: value = 3
    elif value <80: value = 4
    else:
        value = 0
    return value

train_data['Age'] = train_data['Age'].map(age_transform)
test_data['Age'] = test_data['Age'].map(age_transform)



# def job_transform(value):
#     if value < 1 : value = 'No rent'
#     elif value < 2 : value = 'None skill - No rent'
#     elif value < 3: value = 'Skill - Rent'
#     elif value < 4 : value = 'High skill'
#     else: value = 'None'
#     return value

# train_data['Job'] = train_data['Job'].map(job_transform)
# test_data['Job'] = test_data['Job'].map(job_transform)


In [145]:
for col in categorical_cols:
    print(col,train_data[col].unique())

Sex ['male' 'female']
Housing ['free' 'own' 'rent']
Saving accounts ['NONE' 'little' 'moderate' 'rich' 'quite rich']
Checking account ['little' 'NONE' 'moderate' 'rich']
Purpose ['car' 'radio/TV' 'furniture/equipment' 'domestic appliances' 'education'
 'repairs' 'vacation/others' 'business']
Age [3 2 1 4]


In [146]:
train_data['Housing'] = train_data['Housing'].apply(lambda x: 0 if x == 'free' else (2 if x == 'own' else 1))
train_data['Saving accounts'] = train_data['Saving accounts'].apply(lambda x: 2 if x == 'nan' 
                                                                    else(0 if x == 'little' 
                                                                    else(1 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))
                                                                    
train_data['Checking account'] = train_data['Checking account'].apply(lambda x: 2 if x == 'nan' 
                                                                    else(0 if x == 'little' 
                                                                    else(1 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))

In [147]:
test_data['Housing'] = test_data['Housing'].apply(lambda x: 0 if x == 'free' else (2 if x == 'own' else 1))
test_data['Saving accounts'] = test_data['Saving accounts'].apply(lambda x: 0 if x == 'nan' 
                                                                    else(1 if x == 'little' 
                                                                    else(2 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))
                                                                    
test_data['Checking account'] = test_data['Checking account'].apply(lambda x: 0 if x == 'nan' 
                                                                    else(1 if x == 'little' 
                                                                    else(2 if x == 'moderate'
                                                                    else(3 if x == 'quite rich' 
                                                                    else 4))))

In [148]:
# train_data['Purpose'] = train_data['Purpose'].apply(lambda x: 0 if x == 'education' 
#                                                                     else(2 if x == 'radio/TV'                                                             
#                                                                     else 1))
# test_data['Purpose'] = test_data['Purpose'].apply(lambda x: 0 if x == 'education' 
#                                                                 else(2 if x == 'radio/TV'                                                             
#                                                                 else 1))

In [149]:
# train_data['Sex'] = train_data['Sex'].apply(lambda x: 0 if x == 'female' else 1)
# test_data['Sex'] = test_data['Sex'].apply(lambda x: 0 if x == 'female' else 1)


In [150]:
numeric_cols = ['Age','Job','Credit amount','Duration','Housing','Saving accounts','Checking account']
categorical_cols = ['Sex','Purpose']

In [151]:
print(np.min(train_data['Credit amount']),np.max(train_data['Credit amount']))
print(np.min(test_data['Credit amount']),np.max(test_data['Credit amount']))

276 15945
250 18424


In [152]:
scaler = MinMaxScaler().fit(train_data[numeric_cols])
train_data[numeric_cols] = scaler.transform(train_data[numeric_cols])
train_data = get_dummies(train_data, categorical_cols)
# train_data, train_labels = SMOTE().fit_resample(train_data, train_labels)
train_data

Unnamed: 0,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Sex_female,Sex_male,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0.666667,0.666667,0.0,1.00,0.00,0.514328,0.571429,0,1,0,1,0,0,0,0,0,0
1,0.333333,0.666667,1.0,0.00,1.00,0.033697,0.142857,0,1,0,0,0,0,0,1,0,0
2,0.333333,0.666667,1.0,0.00,0.00,0.325164,0.571429,0,1,0,0,0,0,1,0,0,0
3,0.333333,0.000000,1.0,1.00,0.25,0.237029,0.571429,1,0,0,0,1,0,0,0,0,0
4,0.000000,0.666667,1.0,1.00,0.25,0.524028,0.785714,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.333333,0.666667,1.0,0.25,1.00,0.061331,0.035714,1,0,0,0,0,0,0,1,0,0
796,0.666667,0.666667,0.0,0.00,0.00,0.067458,0.357143,0,1,0,1,0,0,0,0,0,0
797,0.666667,0.666667,1.0,0.00,1.00,0.142894,0.250000,0,1,0,0,0,0,1,0,0,0
798,0.000000,0.666667,0.5,1.00,0.00,0.523007,0.250000,1,0,0,0,0,1,0,0,0,0


In [153]:
for col in train_data.columns:
    print(col,train_data[col].unique())

Age [0.66666667 0.33333333 0.         1.        ]
Job [0.66666667 0.         1.         0.33333333]
Housing [0.  1.  0.5]
Saving accounts [1.   0.   0.25 0.75]
Checking account [0.   1.   0.25]
Credit amount [0.51432765 0.03369711 0.32516434 0.23702853 0.52402834 0.13210798
 0.39517519 0.02712362 0.75901461 0.03573936 0.14691429 0.25355798
 0.22713638 0.0723722  0.21532963 0.07900951 0.14870126 0.09387963
 0.12604506 0.09853852 0.04926926 0.42906376 0.46627098 0.12936371
 0.22254132 0.06394792 0.10517582 0.1308954  0.20633097 0.1359372
 0.16937903 0.16184824 0.07486119 0.04167464 0.13127832 0.08430659
 0.11691876 0.06126747 0.13491608 0.06075691 0.21150041 0.11143021
 0.02648542 0.03765397 0.09585806 0.28942498 0.10581403 0.05207735
 0.10249537 0.15852958 0.13453315 0.03605846 0.22515796 0.10779246
 0.11774842 0.20690535 0.2712362  0.10096369 0.05667241 0.08162614
 0.1538707  0.23211437 0.74427213 0.03905801 0.07913715 0.40155721
 0.51758249 0.06369264 0.43276533 0.08022209 0.10626077 

In [154]:

test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])
test_data = get_dummies(test_data, categorical_cols)
test_data


Unnamed: 0,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Sex_female,Sex_male,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0.333333,0.666667,1.0,1.00,0.50,0.104474,0.250000,0,1,1,0,0,0,0,0,0,0
1,0.333333,1.000000,1.0,0.25,0.50,0.101091,0.142857,0,1,0,1,0,0,0,0,0,0
2,0.666667,0.666667,1.0,0.25,0.25,0.047738,0.357143,0,1,0,0,0,0,0,1,0,0
3,0.333333,0.666667,1.0,0.25,0.25,0.180484,0.250000,0,1,1,0,0,0,0,0,0,0
4,0.000000,0.333333,1.0,0.75,0.50,0.143213,0.410714,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.000000,0.666667,0.0,0.25,0.25,0.436722,0.785714,0,1,0,0,0,0,1,0,0,0
196,0.666667,0.666667,1.0,1.00,0.50,0.137277,0.250000,0,1,1,0,0,0,0,0,0,0
197,0.000000,0.666667,1.0,0.25,1.00,0.062927,0.142857,0,1,0,0,0,0,0,1,0,0
198,0.333333,0.666667,1.0,1.00,0.25,0.387644,0.464286,0,1,0,0,0,0,1,0,0,0


In [155]:
# feat_weight = np.full(17,1.0)
# feat_weight[0:7] = [1,1,1,2,2,2,1.5]
# feat_weight[7:9] = [0.5,0.5]
# feat_weight[9:17] = [1/3]*8
# feat_weight

In [156]:
# i = 0
# for col in train_data.columns:
#     train_data[col] =  train_data[col]*feat_weight[i]
#     test_data[col] =  test_data[col]*feat_weight[i]
#     i+=1

In [157]:
train_data

Unnamed: 0,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Sex_female,Sex_male,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0.666667,0.666667,0.0,1.00,0.00,0.514328,0.571429,0,1,0,1,0,0,0,0,0,0
1,0.333333,0.666667,1.0,0.00,1.00,0.033697,0.142857,0,1,0,0,0,0,0,1,0,0
2,0.333333,0.666667,1.0,0.00,0.00,0.325164,0.571429,0,1,0,0,0,0,1,0,0,0
3,0.333333,0.000000,1.0,1.00,0.25,0.237029,0.571429,1,0,0,0,1,0,0,0,0,0
4,0.000000,0.666667,1.0,1.00,0.25,0.524028,0.785714,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.333333,0.666667,1.0,0.25,1.00,0.061331,0.035714,1,0,0,0,0,0,0,1,0,0
796,0.666667,0.666667,0.0,0.00,0.00,0.067458,0.357143,0,1,0,1,0,0,0,0,0,0
797,0.666667,0.666667,1.0,0.00,1.00,0.142894,0.250000,0,1,0,0,0,0,1,0,0,0
798,0.000000,0.666667,0.5,1.00,0.00,0.523007,0.250000,1,0,0,0,0,1,0,0,0,0


In [158]:
test_data

Unnamed: 0,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Sex_female,Sex_male,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0.333333,0.666667,1.0,1.00,0.50,0.104474,0.250000,0,1,1,0,0,0,0,0,0,0
1,0.333333,1.000000,1.0,0.25,0.50,0.101091,0.142857,0,1,0,1,0,0,0,0,0,0
2,0.666667,0.666667,1.0,0.25,0.25,0.047738,0.357143,0,1,0,0,0,0,0,1,0,0
3,0.333333,0.666667,1.0,0.25,0.25,0.180484,0.250000,0,1,1,0,0,0,0,0,0,0
4,0.000000,0.333333,1.0,0.75,0.50,0.143213,0.410714,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.000000,0.666667,0.0,0.25,0.25,0.436722,0.785714,0,1,0,0,0,0,1,0,0,0
196,0.666667,0.666667,1.0,1.00,0.50,0.137277,0.250000,0,1,1,0,0,0,0,0,0,0
197,0.000000,0.666667,1.0,0.25,1.00,0.062927,0.142857,0,1,0,0,0,0,0,1,0,0
198,0.333333,0.666667,1.0,1.00,0.25,0.387644,0.464286,0,1,0,0,0,0,1,0,0,0


In [159]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_data,train_labels)

In [160]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7 ,weights='distance')
knn.fit(train_data, train_labels)


In [161]:
pred1 = gnb.predict(test_data)
print(pred1,len(np.where(pred1==1)[0]))
from sklearn.metrics import classification_report
print(classification_report(y_pred=pred1, y_true = test_labels))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0
 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0
 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0] 46
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       140
           1       0.48      0.37      0.42        60

    accuracy                           0.69       200
   macro avg       0.62      0.60      0.60       200
weighted avg       0.67      0.69      0.68       200



In [162]:
pred = knn.predict(test_data)
print(pred,len(np.where(pred1==1)[0]))
from sklearn.metrics import classification_report
print(classification_report(y_pred=pred, y_true = test_labels))

[0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0] 46
              precision    recall  f1-score   support

           0       0.75      0.92      0.83       140
           1       0.61      0.28      0.39        60

    accuracy                           0.73       200
   macro avg       0.68      0.60      0.61       200
weighted avg       0.71      0.73      0.69       200



In [163]:
rf = RandomForestClassifier()
rf.fit(train_data,train_labels)
pred2 = rf.predict(test_data)
print(classification_report(y_pred=pred2, y_true = test_labels))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       140
           1       0.62      0.40      0.48        60

    accuracy                           0.74       200
   macro avg       0.70      0.65      0.66       200
weighted avg       0.73      0.74      0.73       200

