In [34]:
import pandas as pd
from pandas import get_dummies

import numpy as np

from imblearn.over_sampling import SMOTE 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Lift pandas' display limits so wide frames and long cell values render in full.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [35]:
# Load the training split; 'Risk' is the target, every other column is a feature.
df = pd.read_csv("train.csv")
train_data = df.drop(columns=['Risk'])
train_labels = df['Risk']
train_data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,47,male,2,free,,little,8335,36,car
1,38,male,2,own,little,,804,12,radio/TV
2,28,male,2,own,little,little,5371,36,furniture/equipment
3,29,female,0,own,,moderate,3990,36,domestic appliances
4,24,female,2,own,,moderate,8487,48,car


In [36]:
# Load the held-out split and separate features from the 'Risk' target,
# mirroring what is done for the training data.
test_df = pd.read_csv("test.csv")
test_data = test_df.drop(columns=['Risk'])
test_labels = test_df['Risk']
test_data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,36,male,2,own,rich,moderate,1913,18,business
1,34,male,3,own,little,moderate,1860,12,car
2,48,male,2,own,little,little,1024,24,radio/TV
3,31,male,2,own,little,little,3104,18,business
4,23,male,1,own,quite rich,moderate,2520,27,radio/TV
...,...,...,...,...,...,...,...,...,...
195,53,male,2,free,little,little,7119,48,furniture/equipment
196,42,male,2,own,,moderate,2427,18,business
197,25,male,2,own,little,,1262,12,radio/TV
198,31,male,2,own,,little,6350,30,furniture/equipment


In [37]:
# Summarise dtypes and non-null counts of the training features to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               800 non-null    int64 
 1   Sex               800 non-null    object
 2   Job               800 non-null    int64 
 3   Housing           800 non-null    object
 4   Saving accounts   656 non-null    object
 5   Checking account  484 non-null    object
 6   Credit amount     800 non-null    int64 
 7   Duration          800 non-null    int64 
 8   Purpose           800 non-null    object
dtypes: int64(4), object(5)
memory usage: 56.4+ KB


In [38]:
# Columns whose missing entries get replaced by a placeholder category in the next cell.
missing_value_cols = [
    'Saving accounts',
    'Checking account',
]

In [39]:
# Treat "no value recorded" as its own category ('NONE') in both splits,
# so the account columns stay fully categorical.
for frame in (train_data, test_data):
    frame[missing_value_cols] = frame[missing_value_cols].replace(np.nan, 'NONE')

In [40]:
# Check the column dtypes after replacing the missing account values.
train_data.dtypes

Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
dtype: object

In [41]:
# for value in train_data['Credit amount']:
#     if value > 10000: value = 10000


# train_data['Credit amount'] = train_data['Credit amount'].map(lambda x: 10000 if x > 10000 else x)
# train_data['Credit amount'] = np.log(train_data['Credit amount']+1)

# print(train_data['Credit amount'].unique())

In [42]:
def age_transform(value):
    """Map a numeric age onto a coarse age-group label.

    Returns:
        'Young'        for value <= 29
        'Young_Adult'  for value <= 40
        'Adult'        for value < 55
        'Elder'        for value < 80
        0 (an int, not a string) when none of the comparisons hold,
        e.g. for values of 80 or above.
    """
    if value <= 29:
        return 'Young'
    if value <= 40:
        return 'Young_Adult'
    if value < 55:
        return 'Adult'
    if value < 80:
        return 'Elder'
    # Fallback keeps the original sentinel: integer zero rather than a label.
    return 0

# Replace the numeric age with its age-group label in both splits.
for frame in (train_data, test_data):
    frame['Age'] = frame['Age'].map(age_transform)

In [43]:
# Features that are still numeric at this point.
numeric_cols = [
    'Job',
    'Credit amount',
    'Duration',
]
# Features holding category labels ('Age' is categorical after the binning above).
categorical_cols = [
    'Sex',
    'Housing',
    'Saving accounts',
    'Checking account',
    'Purpose',
    'Age',
]

In [44]:
import category_encoders as ce

# Target-encode the categorical features: every category label is replaced by
# a statistic of the training target for that category.
#
# `cols` must be passed by keyword. TargetEncoder's first positional parameter
# is `verbose`, so a positional list leaves `cols` unset and the encoder falls
# back to auto-detecting object-dtype columns instead of using the intended list.
encoder = ce.TargetEncoder(cols=categorical_cols)
# Fit on the training split only; the test split is transformed with the
# statistics learned here.
encoder.fit(X=train_data, y=train_labels)
train_data = encoder.transform(train_data)
test_data = encoder.transform(test_data)



In [45]:
# Inspect the training features after target encoding.
train_data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0.256579,0.274047,2,0.406977,0.166667,0.479452,8335,36,0.309353
1,0.261745,0.274047,2,0.259649,0.364211,0.120253,804,12,0.213953
2,0.352740,0.274047,2,0.259649,0.364211,0.479452,5371,36,0.328859
3,0.352740,0.357430,0,0.259649,0.166667,0.392523,3990,36,0.300000
4,0.352740,0.357430,2,0.259649,0.166667,0.392523,8487,48,0.309353
...,...,...,...,...,...,...,...,...,...
795,0.352740,0.357430,2,0.259649,0.333333,0.120253,1237,6,0.213953
796,0.256579,0.274047,2,0.406977,0.364211,0.479452,1333,24,0.309353
797,0.256579,0.274047,2,0.259649,0.364211,0.120253,2515,18,0.328859
798,0.352740,0.357430,2,0.395833,0.166667,0.479452,8471,18,0.466667


In [46]:
# List the distinct values of every feature to sanity-check the encoding.
for column_name in train_data.columns:
    distinct_values = train_data[column_name].unique()
    print(column_name, distinct_values)

Age [0.25657895 0.26174497 0.35273973 0.34482759]
Sex [0.27404719 0.35742972]
Job [2 0 3 1]
Housing [0.40697674 0.25964912 0.39583333]
Saving accounts [0.16666667 0.36421053 0.33333333 0.12195122 0.16      ]
Checking account [0.47945205 0.12025316 0.39252336 0.25490196]
Credit amount [ 8335   804  5371  3990  8487  2346  6468   701 12169   836  2578  4249
  3835  1410  3650  1514  2606  1747  2251  1820  1048  6999  7582  2303
  3763  1278  1924  2327  3509  2406  2930  2812  1449   929  2333  1597
  2108  1236  2390  1228  3590  2022   691   866  1778  4811  1934  1092
  1882  2760  2384   841  3804  1965  2121  3518  4526  1858  1164  1555
  2687  3913 11938   888  1516  6568  8386  1274  7057  1533  1941  1872
  1393  4241 11560  4370  6289  2671  2576  1953  3349  1433  3973  2522
   585  1867  1471  4583  2439  4576  3447  1169  3832  1207  5965  1908
  5800  1543  1193  3863   601   741  2284   428  2235  3939  1264  1216
  1245  7238  3017  2762  9857  2210  1961 14318  4771   5

In [47]:

# def job_transform(value):
#     if value < 1 : value = 'No rent'
#     elif value < 2 : value = 'None skill - No rent'
#     elif value < 3: value = 'Skill - Rent'
#     elif value < 4 : value = 'High skill'
#     else: value = 'None'
#     return value

# train_data['Job'] = train_data['Job'].map(job_transform)
# test_data['Job'] = test_data['Job'].map(job_transform)


In [48]:
# Same check, restricted to the columns that were target-encoded.
for column_name in categorical_cols:
    encoded_levels = train_data[column_name].unique()
    print(column_name, encoded_levels)

Sex [0.27404719 0.35742972]
Housing [0.40697674 0.25964912 0.39583333]
Saving accounts [0.16666667 0.36421053 0.33333333 0.12195122 0.16      ]
Checking account [0.47945205 0.12025316 0.39252336 0.25490196]
Purpose [0.30935252 0.21395349 0.32885906 0.3        0.46666667 0.38888889
 0.33311024 0.32911392]
Age [0.25657895 0.26174497 0.35273973 0.34482759]


In [49]:
# train_data['Housing'] = train_data['Housing'].apply(lambda x: 0 if x == 'free' else (2 if x == 'own' else 1))
# train_data['Saving accounts'] = train_data['Saving accounts'].apply(lambda x: 2 if x == 'nan' 
#                                                                     else(0 if x == 'little' 
#                                                                     else(1 if x == 'moderate'
#                                                                     else(3 if x == 'quite rich' 
#                                                                     else 4))))
                                                                    
# train_data['Checking account'] = train_data['Checking account'].apply(lambda x: 2 if x == 'nan' 
#                                                                     else(0 if x == 'little' 
#                                                                     else(1 if x == 'moderate'
#                                                                     else(3 if x == 'quite rich' 
#                                                                     else 4))))

In [50]:
# test_data['Housing'] = test_data['Housing'].apply(lambda x: 0 if x == 'free' else (2 if x == 'own' else 1))
# test_data['Saving accounts'] = test_data['Saving accounts'].apply(lambda x: 0 if x == 'nan' 
#                                                                     else(1 if x == 'little' 
#                                                                     else(2 if x == 'moderate'
#                                                                     else(3 if x == 'quite rich' 
#                                                                     else 4))))
                                                                    
# test_data['Checking account'] = test_data['Checking account'].apply(lambda x: 0 if x == 'nan' 
#                                                                     else(1 if x == 'little' 
#                                                                     else(2 if x == 'moderate'
#                                                                     else(3 if x == 'quite rich' 
#                                                                     else 4))))

In [51]:
# train_data['Purpose'] = train_data['Purpose'].apply(lambda x: 0 if x == 'education' 
#                                                                     else(2 if x == 'radio/TV'                                                             
#                                                                     else 1))
# test_data['Purpose'] = test_data['Purpose'].apply(lambda x: 0 if x == 'education' 
#                                                                 else(2 if x == 'radio/TV'                                                             
#                                                                 else 1))

In [52]:
# train_data['Sex'] = train_data['Sex'].apply(lambda x: 0 if x == 'female' else 1)
# test_data['Sex'] = test_data['Sex'].apply(lambda x: 0 if x == 'female' else 1)


In [53]:
# Every feature is numeric after target encoding, so all nine columns are listed for scaling.
numeric_cols = [
    'Age',
    'Job',
    'Credit amount',
    'Duration',
    'Housing',
    'Saving accounts',
    'Checking account',
    'Sex',
    'Purpose',
]
# In the visible cells this list is only referenced by commented-out get_dummies calls.
categorical_cols = [
    'Sex',
    'Purpose',
]

In [54]:
# Compare the 'Credit amount' range of the two splits (train first, then test).
for frame in (train_data, test_data):
    credit_amounts = frame['Credit amount']
    print(np.min(credit_amounts), np.max(credit_amounts))

276 15945
250 18424


In [55]:
# Fit the min-max scaler on the training features only; the same fitted scaler
# is reused for the test split in a later cell.
scaler = MinMaxScaler().fit(train_data[numeric_cols])
train_data[numeric_cols] = scaler.transform(train_data[numeric_cols])
# train_data = get_dummies(train_data, categorical_cols)

# Oversample the minority class with SMOTE. The seed is fixed so the synthetic
# rows, and therefore every downstream metric, are reproducible across runs.
# NOTE: this cell overwrites train_data / train_labels, so re-running it
# without re-running the cells above scales and resamples the data again.
train_data, train_labels = SMOTE(random_state=42).fit_resample(train_data, train_labels)
train_data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0.000000,0.0,0.666667,1.000000,0.184577,1.000000,0.514328,0.571429,0.377499
1,0.053723,0.0,0.666667,0.000000,1.000000,0.000000,0.033697,0.142857,0.000000
2,1.000000,0.0,0.666667,0.000000,1.000000,1.000000,0.325164,0.571429,0.454688
3,1.000000,1.0,0.000000,0.000000,0.184577,0.757993,0.237029,0.571429,0.340491
4,1.000000,1.0,0.666667,0.000000,0.184577,0.757993,0.524028,0.785714,0.377499
...,...,...,...,...,...,...,...,...,...
1115,1.000000,1.0,0.666667,0.924363,0.183792,0.757993,0.109159,0.253055,0.452487
1116,1.000000,1.0,0.666667,0.924363,1.000000,1.000000,0.166960,0.264219,0.444444
1117,1.000000,1.0,0.333333,0.000000,1.000000,0.877745,0.173855,0.251108,0.416493
1118,0.988931,0.0,0.621822,0.000000,0.180875,0.967442,0.154444,0.328314,0.393517


In [56]:
# List the distinct values per feature again, now on the scaled and resampled training set.
for column_name in train_data.columns:
    distinct_values = train_data[column_name].unique()
    print(column_name, distinct_values)

Age [0.00000000e+00 5.37227251e-02 1.00000000e+00 9.17719678e-01
 2.10573308e-02 2.86867155e-02 2.28803042e-02 3.44404479e-02
 9.20867581e-01 9.98782111e-01 9.89790819e-01 9.95231800e-01
 9.47933654e-01 9.80187300e-01 9.98546880e-01 9.42479639e-01
 1.33169834e-04 4.56143827e-02 2.85241904e-02 3.71589162e-02
 1.34758639e-02 9.24270096e-01 3.64730886e-03 1.01221630e-02
 1.30639541e-02 4.28669898e-03 5.27470021e-02 7.90582499e-03
 6.50961492e-03 9.77326593e-03 2.24486241e-02 1.53946846e-02
 9.25063382e-01 4.24634618e-03 4.71489822e-02 7.31519490e-01
 9.64153045e-01 9.77459797e-01 3.39860370e-02 1.70437989e-04
 4.12665278e-02 9.88600662e-01 4.59051169e-02 4.02767600e-02
 9.18010458e-03 9.69694726e-03 7.44753956e-03 9.68863341e-01
 4.15255827e-02 9.40664493e-01 5.25092337e-02 3.26553689e-02
 2.24013686e-02 9.40404308e-01 5.19837642e-02 1.28208269e-02
 2.30575507e-03 6.67392323e-03 1.54528717e-02 2.25499115e-02
 1.51528002e-02 9.63442934e-01 9.54370144e-01 9.32358595e-01
 2.33546267e-02 9.90

In [57]:

# Apply the scaler fitted on the training split to the test features (no refit).
test_scaled = scaler.transform(test_data[numeric_cols])
test_data[numeric_cols] = test_scaled
# test_data = get_dummies(test_data, categorical_cols)
test_data


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0.053723,0.0,0.666667,0.0,0.000000,0.757993,0.104474,0.250000,0.455696
1,0.053723,0.0,1.000000,0.0,1.000000,0.757993,0.101091,0.142857,0.377499
2,0.000000,0.0,0.666667,0.0,1.000000,1.000000,0.047738,0.357143,0.000000
3,0.053723,0.0,0.666667,0.0,1.000000,1.000000,0.180484,0.250000,0.455696
4,1.000000,0.0,0.333333,0.0,0.157058,0.757993,0.143213,0.410714,0.000000
...,...,...,...,...,...,...,...,...,...
195,0.000000,0.0,0.666667,1.0,1.000000,1.000000,0.436722,0.785714,0.454688
196,0.000000,0.0,0.666667,0.0,0.184577,0.757993,0.137277,0.250000,0.455696
197,1.000000,0.0,0.666667,0.0,1.000000,0.000000,0.062927,0.142857,0.000000
198,0.053723,0.0,0.666667,0.0,0.184577,1.000000,0.387644,0.464286,0.454688


In [58]:
# feat_weight = np.full(17,1.0)
# feat_weight[0:7] = [1,1,1,2,2,2,1.5]
# feat_weight[7:9] = [0.5,0.5]
# feat_weight[9:17] = [1/3]*8
# feat_weight

In [59]:
# i = 0
# for col in train_data.columns:
#     train_data[col] =  train_data[col]*feat_weight[i]
#     test_data[col] =  test_data[col]*feat_weight[i]
#     i+=1

In [60]:
# Show the training features that the models are fitted on (the re-weighting cells above are commented out).
train_data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0.000000,0.0,0.666667,1.000000,0.184577,1.000000,0.514328,0.571429,0.377499
1,0.053723,0.0,0.666667,0.000000,1.000000,0.000000,0.033697,0.142857,0.000000
2,1.000000,0.0,0.666667,0.000000,1.000000,1.000000,0.325164,0.571429,0.454688
3,1.000000,1.0,0.000000,0.000000,0.184577,0.757993,0.237029,0.571429,0.340491
4,1.000000,1.0,0.666667,0.000000,0.184577,0.757993,0.524028,0.785714,0.377499
...,...,...,...,...,...,...,...,...,...
1115,1.000000,1.0,0.666667,0.924363,0.183792,0.757993,0.109159,0.253055,0.452487
1116,1.000000,1.0,0.666667,0.924363,1.000000,1.000000,0.166960,0.264219,0.444444
1117,1.000000,1.0,0.333333,0.000000,1.000000,0.877745,0.173855,0.251108,0.416493
1118,0.988931,0.0,0.621822,0.000000,0.180875,0.967442,0.154444,0.328314,0.393517


In [61]:
# Show the scaled test features that the models are evaluated on.
test_data

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0.053723,0.0,0.666667,0.0,0.000000,0.757993,0.104474,0.250000,0.455696
1,0.053723,0.0,1.000000,0.0,1.000000,0.757993,0.101091,0.142857,0.377499
2,0.000000,0.0,0.666667,0.0,1.000000,1.000000,0.047738,0.357143,0.000000
3,0.053723,0.0,0.666667,0.0,1.000000,1.000000,0.180484,0.250000,0.455696
4,1.000000,0.0,0.333333,0.0,0.157058,0.757993,0.143213,0.410714,0.000000
...,...,...,...,...,...,...,...,...,...
195,0.000000,0.0,0.666667,1.0,1.000000,1.000000,0.436722,0.785714,0.454688
196,0.000000,0.0,0.666667,0.0,0.184577,0.757993,0.137277,0.250000,0.455696
197,1.000000,0.0,0.666667,0.0,1.000000,0.000000,0.062927,0.142857,0.000000
198,0.053723,0.0,0.666667,0.0,0.184577,1.000000,0.387644,0.464286,0.454688


In [62]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the scaled, SMOTE-resampled training set.
gnb = GaussianNB()
gnb.fit(X=train_data, y=train_labels)

In [63]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k=7 and distance-based vote weighting.
knn_settings = {'n_neighbors': 7, 'weights': 'distance'}
knn = KNeighborsClassifier(**knn_settings)
knn.fit(X=train_data, y=train_labels)


In [64]:
from sklearn.metrics import classification_report

# Evaluate the naive Bayes model on the test split: print the raw predictions,
# the number of rows predicted as class 1, and the per-class metrics.
pred1 = gnb.predict(test_data)
predicted_positive_idx = np.where(pred1 == 1)[0]
print(pred1, len(predicted_positive_idx))
print(classification_report(y_true=test_labels, y_pred=pred1))

[0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1
 1 0 1 1 1 0 1 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0
 1 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 1 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 1 1 0 0
 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 0 1 1 1 0
 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0] 87
              precision    recall  f1-score   support

           0       0.87      0.70      0.77       140
           1       0.52      0.75      0.61        60

    accuracy                           0.71       200
   macro avg       0.69      0.72      0.69       200
weighted avg       0.76      0.71      0.73       200



In [65]:
from sklearn.metrics import classification_report

# Evaluate the KNN model on the test split: print the raw predictions,
# the number of rows predicted as class 1, and the per-class metrics.
pred = knn.predict(test_data)
predicted_positive_idx = np.where(pred == 1)[0]
print(pred, len(predicted_positive_idx))
print(classification_report(y_true=test_labels, y_pred=pred))

[0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0
 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 1 1 0 1 0 0 0 1 0 0 0 1 0
 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 1
 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0] 84
              precision    recall  f1-score   support

           0       0.81      0.67      0.73       140
           1       0.45      0.63      0.53        60

    accuracy                           0.66       200
   macro avg       0.63      0.65      0.63       200
weighted avg       0.70      0.66      0.67       200



In [66]:
# Random forest fitted on the resampled training set and scored on the test split.
# The seed is fixed so the forest, and the reported metrics, are the same on
# every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_data, train_labels)
pred2 = rf.predict(test_data)
print(classification_report(y_pred=pred2, y_true=test_labels))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       140
           1       0.56      0.53      0.55        60

    accuracy                           0.73       200
   macro avg       0.68      0.68      0.68       200
weighted avg       0.73      0.73      0.73       200

