In [2]:
import pandas as pd

from sklearn.linear_model import LinearRegression


In [3]:
data = pd.read_csv('../raw_data/dataset.csv',sep=';')
data = data.copy()
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99976 entries, 0 to 99975
Data columns (total 43 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   uuid                                 99976 non-null  object 
 1   default                              89976 non-null  float64
 2   account_amount_added_12_24m          99976 non-null  int64  
 3   account_days_in_dc_12_24m            88140 non-null  float64
 4   account_days_in_rem_12_24m           88140 non-null  float64
 5   account_days_in_term_12_24m          88140 non-null  float64
 6   account_incoming_debt_vs_paid_0_24m  40661 non-null  float64
 7   account_status                       45603 non-null  float64
 8   account_worst_status_0_3m            45603 non-null  float64
 9   account_worst_status_12_24m          33215 non-null  float64
 10  account_worst_status_3_6m            42274 non-null  float64
 11  account_worst_status_6_12m  

## 1. Preprocessing workflow 

### 1.1. Duplicates

In [5]:
# Duplicates
duplicate_count = data.duplicated().sum()
duplicate_count

0

 ### 1.2. Missing values 

In [6]:
# Missing data percentage
round((data.isnull().sum()/len(data)).sort_values(ascending=False),2)

worst_status_active_inv                0.70
account_worst_status_12_24m            0.67
account_worst_status_6_12m             0.60
account_incoming_debt_vs_paid_0_24m    0.59
account_worst_status_3_6m              0.58
account_status                         0.54
account_worst_status_0_3m              0.54
avg_payment_span_0_3m                  0.49
avg_payment_span_0_12m                 0.24
num_active_div_by_paid_inv_0_12m       0.23
num_arch_written_off_12_24m            0.18
num_arch_written_off_0_12m             0.18
account_days_in_rem_12_24m             0.12
account_days_in_term_12_24m            0.12
account_days_in_dc_12_24m              0.12
default                                0.10
sum_paid_inv_0_12m                     0.00
sum_capital_paid_account_12_24m        0.00
sum_capital_paid_account_0_12m         0.00
recovery_debt                          0.00
status_max_archived_0_24_months        0.00
status_max_archived_0_6_months         0.00
status_3rd_last_archived_0_24m  

#### we do have a few features having a high percetange of missing values 
#### Let's investigate the meaning of these features and the besty way to deal with mising values

In [7]:
# unique values in this feature
data['worst_status_active_inv'].unique()

array([ 1., nan,  2.,  3.])

In [8]:
# account_worst_status_12_24m
data['account_worst_status_12_24m'].unique()

array([nan,  1.,  2.,  3.,  4.])

In [9]:
data['account_worst_status_12_24m'].isna().sum() / len(data['account_worst_status_12_24m'])

0.6677702648635673

In [10]:
_ = round((data.isnull().sum()/len(data)).sort_values(ascending=False),2)
miss_val_features_df = pd.DataFrame(_,columns=['percentage']).reset_index(names=['features'])
miss_val_features_df[miss_val_features_df['percentage'] >= 0.12]


Unnamed: 0,features,percentage
0,worst_status_active_inv,0.7
1,account_worst_status_12_24m,0.67
2,account_worst_status_6_12m,0.6
3,account_incoming_debt_vs_paid_0_24m,0.59
4,account_worst_status_3_6m,0.58
5,account_status,0.54
6,account_worst_status_0_3m,0.54
7,avg_payment_span_0_3m,0.49
8,avg_payment_span_0_12m,0.24
9,num_active_div_by_paid_inv_0_12m,0.23


In [11]:
# let's drop this worst_status_active_inv feature it has a too high missing values 
del data['worst_status_active_inv']
type(data)

pandas.core.frame.DataFrame

In [12]:
# let's try the follwing approach in order to clean this features out
# if the worst status for 12-24 is let's say 1 - 'account_worst_status_12_24m' and if status for 'account_worst_status_6_12m' is NaN 
# then I will replace by 1. Basically, we're saying that nothing has changed.

data[['account_worst_status_12_24m','account_worst_status_6_12m', 'account_worst_status_3_6m']]


Unnamed: 0,account_worst_status_12_24m,account_worst_status_6_12m,account_worst_status_3_6m
0,,,1.0
1,1.0,1.0,1.0
2,,,
3,,,
4,,,
...,...,...,...
99971,,,
99972,,1.0,1.0
99973,2.0,1.0,1.0
99974,1.0,2.0,2.0


## Steps to Predicting the Missing data 

In [13]:
import pandas as pd

# Example dataframe
df = pd.DataFrame({'A': [1, 2, None, 4, 5],
                     'B': [None, 2, 3, 4, None]})

# Separate the null values from the dataframe
null_data = df[df.isnull().any(axis=1)]

# Create a variable "test data"
null_data


Unnamed: 0,A,B
0,1.0,
2,,3.0
4,5.0,


In [14]:
missing_feat = data[data.columns[data.isnull().any()].tolist()]
missing_feat


Unnamed: 0,default,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,account_worst_status_3_6m,account_worst_status_6_12m,avg_payment_span_0_12m,avg_payment_span_0_3m,num_active_div_by_paid_inv_0_12m,num_arch_written_off_0_12m,num_arch_written_off_12_24m
0,0.0,0.0,0.0,0.0,0.000000,1.0,1.0,,1.0,,12.692308,8.333333,0.153846,0.0,0.0
1,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,1.0,1.0,25.833333,25.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,,,,,,,20.000000,18.000000,0.071429,0.0,0.0
3,0.0,,,,,,,,,,4.687500,4.888889,0.031250,0.0,0.0
4,0.0,0.0,0.0,0.0,,,,,,,13.000000,13.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99971,,0.0,0.0,0.0,,1.0,1.0,,,,10.333333,,0.000000,0.0,0.0
99972,,0.0,0.0,0.0,0.004044,1.0,1.0,,1.0,1.0,36.000000,,0.000000,0.0,0.0
99973,,0.0,20.0,0.0,0.705078,2.0,2.0,2.0,1.0,1.0,,,,,
99974,,0.0,0.0,0.0,0.064175,1.0,2.0,1.0,2.0,2.0,17.500000,,0.000000,0.0,0.0


In [15]:
# Step 1: Separate the null values from the dataframe (data) and create a variable “test data”
test_data = data[data[missing_feat.columns].isnull().any(axis=1)]
test_data

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.000000,1.0,1.0,,...,1,1,1,1,1,0,0,0,178839,9.653333
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,1,2,2,0,0,0,49014,13.181389
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,1,2,2,0,0,0,124839,11.561944
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,1,0,0,0,324676,15.751111
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,0,1,1,1,0,0,0,7100,12.698611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99971,5c03bc63-ea65-4ffd-aa7b-95ea9a46db34,,0,0.0,0.0,0.0,,1.0,1.0,,...,1,1,1,1,1,0,0,0,60127,10.765556
99972,f8db22f4-9819-420c-abbc-9ddf1843176e,,0,0.0,0.0,0.0,0.004044,1.0,1.0,,...,1,1,0,1,1,0,7948,0,4740,21.708333
99973,b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8,,45671,0.0,20.0,0.0,0.705078,2.0,2.0,2.0,...,0,0,0,0,0,0,17447,19627,3100,2.185278
99974,bafcab15-9898-479c-b729-c9dda7edb78f,,56102,0.0,0.0,0.0,0.064175,1.0,2.0,1.0,...,1,1,1,1,1,0,18339,56180,34785,9.725278


In [16]:
pd.DataFrame(missing_feat.columns, columns = ['Missing data'])

Unnamed: 0,Missing data
0,default
1,account_days_in_dc_12_24m
2,account_days_in_rem_12_24m
3,account_days_in_term_12_24m
4,account_incoming_debt_vs_paid_0_24m
5,account_status
6,account_worst_status_0_3m
7,account_worst_status_12_24m
8,account_worst_status_3_6m
9,account_worst_status_6_12m


In [17]:
# Step 2: Drop the null values from the dataframe (data) and represent them as ‘train data”
data.dropna(inplace = True)


In [18]:
# Step 3: Create “x_train” & “y_train” from train data.
X_train = data.drop(missing_feat.columns, axis=1)

In [19]:
# Step 3: Create “x_train” & “y_train” from train data
y_train = data[missing_feat.columns]

In [37]:
y_train['default']

(16934,)

In [21]:
# necessary to covert to float
X_train = X_train.reset_index()
X_train = X_train.drop(['uuid'], axis=1)

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler

In [23]:
# I have to select the object type of features 
# see slides and apply OneHotEncoder
cat_preprocessor = ColumnTransformer([
    ('num_encoder', MinMaxScaler(), make_column_selector(dtype_include="int64")),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False), make_column_selector(dtype_include="object")),
    ('binary_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False, drop="if_binary"), make_column_selector(dtype_include="bool"))
])
cat_preprocessor

In [24]:
encode_cat = pd.DataFrame(cat_preprocessor.fit_transform(X_train))
encode_cat.shape



(16934, 95)

In [25]:
encode_cat.shape

(16934, 95)

In [26]:
# Step 4: Build the linear regression model
# Build the linear refuressinn model 
# instatiate the model 
lr = LinearRegression()
# fitting the model 
lr.fit(encode_cat, y_train)

In [27]:
# Step 5: Create the x_test from test data
X_test = test_data[X_train.columns[1:]]
X_test

Unnamed: 0,account_amount_added_12_24m,age,merchant_category,merchant_group,has_paid,max_paid_inv_0_12m,max_paid_inv_0_24m,name_in_email,num_active_inv,num_arch_dc_0_12m,...,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours
0,0,20,Dietary supplements,Health & Beauty,True,31638.0,31638.0,no_match,2,0,...,1,1,1,1,1,0,0,0,178839,9.653333
1,0,50,Books & Magazines,Entertainment,True,13749.0,13749.0,F+L,0,0,...,1,1,1,2,2,0,0,0,49014,13.181389
2,0,22,Diversified entertainment,Entertainment,True,29890.0,29890.0,L1+F,1,0,...,1,1,1,2,2,0,0,0,124839,11.561944
3,0,36,Diversified entertainment,Entertainment,True,40040.0,40040.0,F1+L,1,0,...,1,1,1,1,1,0,0,0,324676,15.751111
4,0,25,Electronic equipment & Related accessories,Electronics,True,7100.0,7100.0,F+L,0,0,...,0,0,1,1,1,0,0,0,7100,12.698611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99971,0,33,Electronic equipment & Related accessories,Electronics,True,35195.0,35195.0,F1+L,0,0,...,1,1,1,1,1,0,0,0,60127,10.765556
99972,0,44,Body & Hair Care,Health & Beauty,True,4740.0,4740.0,F1+L,0,0,...,1,1,0,1,1,0,7948,0,4740,21.708333
99973,45671,24,Jewelry & Watches,Jewelry & Accessories,True,1200.0,1200.0,Nick,0,0,...,0,0,0,0,0,0,17447,19627,3100,2.185278
99974,56102,31,Decoration & Art,Home & Garden,True,15000.0,15000.0,Nick,0,0,...,1,1,1,1,1,0,18339,56180,34785,9.725278


In [28]:
encoded_X_test = pd.DataFrame(cat_preprocessor.fit_transform(X_test))
encoded_X_test



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0.000000,0.024390,0.052632,0.0,0.0,0.058824,0.044728,0.000000,0.012346,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.000000,0.390244,0.000000,0.0,0.0,0.040724,0.060703,0.071429,0.000000,0.2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.000000,0.048780,0.026316,0.0,0.0,0.049774,0.000000,0.071429,0.006173,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.000000,0.219512,0.026316,0.0,0.0,0.140271,0.067093,0.000000,0.006173,0.2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.000000,0.085366,0.000000,0.0,0.0,0.004525,0.000000,0.000000,0.000000,0.2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83037,0.000000,0.182927,0.000000,0.0,0.0,0.027149,0.006390,0.000000,0.000000,0.2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
83038,0.000000,0.317073,0.000000,0.0,0.0,0.004525,0.009585,0.000000,0.006173,0.2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
83039,0.047402,0.073171,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
83040,0.058229,0.158537,0.000000,0.0,0.0,0.009050,0.003195,0.000000,0.006173,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [29]:
# Step 6: Apply the model on x_test of test data to make predictions. here, we have created a new variable ‘y_pred’.
y_pred = lr.predict(X_test)
y_pred



ValueError: could not convert string to float: 'Dietary supplements'

# Filling missing values "account_worst_status"
data['account_worst_status_12_24m'].fillna(data['account_worst_status_6_12m'],inplace=True)

# Filling missing values "account_worst_status"
data['account_worst_status_6_12m'].fillna(data['account_worst_status_3_6m'],inplace=True)

In [None]:
# I have to study see how I could do clustering to deal with this missing values


### K-Nearest Neighbor 


In [None]:
encode_data = pd.DataFrame(cat_preprocessor.fit_transform(data))
encode_data.shape



(16934, 17028)

In [None]:
from fancyimpute import KNN
knn_imputer = KNN()
# imputing the missing value with knn imputer
data = knn_imputer.fit_transform(data)

NameError: name 'data' is not defined

In [None]:
data

NameError: name 'data' is not defined

In [41]:
df = pd.read_csv('../raw_data/data_clean.csv')

In [44]:
df

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,merchant_category,merchant_group,has_paid,name_in_email
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,1.0,1.5,...,1.0,0.0,0.0,0.0,178839.0,9.653333,Dietary supplements,Health & Beauty,True,no_match
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0.0,0.0,0.0,0.0,0.036851,1.0,1.0,1.0,...,2.0,0.0,0.0,0.0,49014.0,13.181389,Books & Magazines,Entertainment,True,F+L
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0.0,0.0,0.0,0.0,0.000079,1.0,1.0,1.0,...,2.0,0.0,0.0,0.0,124839.0,11.561944,Diversified entertainment,Entertainment,True,L1+F
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0.0,0.0,0.0,0.0,1.184280,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,324676.0,15.751111,Diversified entertainment,Entertainment,True,F1+L
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,1.0,1.5,...,1.0,0.0,0.0,0.0,7100.0,12.698611,Electronic equipment & Related accessories,Electronics,True,F+L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99971,5c03bc63-ea65-4ffd-aa7b-95ea9a46db34,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,1.0,2.0,...,1.0,0.0,0.0,0.0,60127.0,10.765556,Electronic equipment & Related accessories,Electronics,True,F1+L
99972,f8db22f4-9819-420c-abbc-9ddf1843176e,0.0,0.0,0.0,0.0,0.0,0.004044,1.0,1.0,1.0,...,1.0,0.0,7948.0,0.0,4740.0,21.708333,Body & Hair Care,Health & Beauty,True,F1+L
99973,b22e21ea-b1b2-4df3-b236-0ff6d5fdc0d8,0.0,45671.0,0.0,20.0,0.0,0.705078,2.0,2.0,2.0,...,0.0,0.0,17447.0,19627.0,3100.0,2.185278,Jewelry & Watches,Jewelry & Accessories,True,Nick
99974,bafcab15-9898-479c-b729-c9dda7edb78f,0.0,56102.0,0.0,0.0,0.0,0.064175,1.0,2.0,1.0,...,1.0,0.0,18339.0,56180.0,34785.0,9.725278,Decoration & Art,Home & Garden,True,Nick
