In [1056]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from fancyimpute import KNN
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [1057]:
data = pd.read_csv('../raw_data/dataset.csv',sep=';')
data = data.copy()
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [1058]:
data.shape

(99976, 43)

## 1. Preprocessing workflow 

### 1.1. Duplicates

In [1059]:
# Duplicates
duplicate_count = data.duplicated().sum()
duplicate_count

0

 ### 1.2. Missing values 

In [1060]:
# Missing data percentage
# round((data.isnull().sum()/len(data)).sort_values(ascending=False),2)

#### we do have a few features having a high percetange of missing values 
#### Let's investigate the meaning of these features and the besty way to deal with mising values

In [1061]:
# unique values in this feature
data['worst_status_active_inv'].unique()

array([ 1., nan,  2.,  3.])

In [1062]:
# account_worst_status_12_24m
data['account_worst_status_12_24m'].unique()

array([nan,  1.,  2.,  3.,  4.])

In [1063]:
data['account_worst_status_12_24m'].isna().sum() / len(data['account_worst_status_12_24m'])

0.6677702648635673

In [1065]:
# let's drop this worst_status_active_inv feature it has a too high missing values 
# del data['worst_status_active_inv']


##  Handling Missing Data with Imputation

## Preprocessing
### Pipeline 

In [1066]:
#########               ###############
# that was I what I did it 
# step 0 
### missing_feat = data[data.columns[data.isnull().any()].tolist()]
#### missing_feat
# # Step 1: Separate the null values from the dataframe (data) and create a variable “test data”
# test_data = data[data[missing_feat.columns].isnull().any(axis=1)]
# # Step 2: Drop the null values from the dataframe (data) and represent them as ‘train data”
# data.dropna(inplace = True)
# # Step 3: Create “x_train” & “y_train” from train data.
# X_train = data.drop(missing_feat.columns, axis=1)
# # Step 3: Create “x_train” & “y_train” from train data
# y_train = data[missing_feat.columns]
# # necessary to covert to float
# X_train = X_train.reset_index()
# X_train = X_train.drop(['uuid'], axis=1)


In [1067]:
# converting these columns to "object" type
list_float_to_obj = ["account_status","account_worst_status_0_3m",
                     "account_worst_status_12_24m", "account_worst_status_3_6m", "account_worst_status_6_12m",
                     "status_last_archived_0_24m", "status_2nd_last_archived_0_24m","status_3rd_last_archived_0_24m",
                     "status_max_archived_0_6_months","status_max_archived_0_12_months","status_max_archived_0_24_months",
                     "has_paid"]

_ = [data.__setitem__(feature, data[feature].astype("object")) for feature in list_float_to_obj]

In [1068]:
# step 0 
missing_feat = data[data.columns[data.isnull().any()].tolist()]

In [1069]:
# Step 1: Separate the null values from the dataframe (data) and create a variable “test data”
test_data = data[data[missing_feat.columns].isnull().any(axis=1)]


In [1070]:
# Step 2: Drop the null values from the dataframe (data) and represent them as ‘train data”
data.dropna(inplace = True)

In [1071]:
# Step 3: Create “x_train” & “y_train” from train data.
X_train = data.drop(missing_feat.columns, axis=1)
# # Step 3: Create “x_train” & “y_train” from train data
y_train = data[missing_feat.columns]

In [1072]:
from sklearn.preprocessing import OneHotEncoder

X_train_ohe = OneHotEncoder(sparse_output=False) # Instanciate One hot encoder

X_train_ohe.fit(X_train.iloc[:, 1:].select_dtypes(include='object')) # Fit one hot encoder

# Get column names after preprocessing
column_names = X_train_ohe.fit(X_train.iloc[:, 1:].select_dtypes(include='object')).get_feature_names_out()

# Create new DataFrame with transformed data and column names
X_train_prepro = pd.DataFrame(X_train_ohe.fit_transform(X_train.iloc[:, 1:].select_dtypes(include='object')), columns=column_names, dtype=np.object_)

In [1073]:
# Step 4: Build the linear regression model
# Build the linear refuressinn model 
# instatiate the model 
log_reg = DecisionTreeRegressor()
# fitting the model 
model_num = log_reg.fit(X_train_prepro, y_train['account_status'])
# Step 5: Create the x_test from test data
X_test = test_data[X_train.columns[1:]]

In [1074]:
X_test_ohe = OneHotEncoder(sparse_output=False) # Instanciate One hot encoder

X_test_ohe.fit(X_test.iloc[:, 1:].select_dtypes(include='object')) # Fit one hot encoder

# Get column names after preprocessing
column_names = X_test_ohe.fit(X_test.iloc[:, 1:].select_dtypes(include='object')).get_feature_names_out()

# Create new DataFrame with transformed data and column names
X_test_prepro = pd.DataFrame(X_test_ohe.fit_transform(X_test.iloc[:, 1:].select_dtypes(include='object')), columns=column_names, dtype=np.object_)

In [1075]:
# Step 6: Apply the model on x_test of test data to make predictions. here, we have created a new variable ‘y_pred’.
y_pred_num = model_num.predict(X_test_prepro[X_train_prepro.columns])
y_pred_num = pd.DataFrame(y_pred_num)
test_data['y_pred_num'] = y_pred_num

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['y_pred_num'] = y_pred_num


In [1076]:
r1_pred = {}
r1_pred['y_pred_num'], r1_pred['account_status'] = test_data[['y_pred_num','account_status']].dropna(axis=0).values.T


In [1077]:
r2_baseline = metrics.r2_score(r1_pred['account_status'], r1_pred['y_pred_num'])
r2_baseline

-0.6156272547752013

In [1078]:
# data_obj_bool = pd.DataFrame()
# data_obj_bool[['merchant_category ','merchant_group','has_paid ','name_in_email']] = data.select_dtypes(include=['object', 'bool'])

In [1079]:
# data.drop(columns=['merchant_category','merchant_group','has_paid','name_in_email'], inplace=True)

In [1080]:
# He's facing the same problem - see working code
# basic I have to scale the data 
# https://datascience.stackexchange.com/questions/77450/knn-imputation-utilize-mean-or-mode
# I have to take a look at what i did in the first version of notebook


In [1081]:
# # Load the data
# X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
# X.replace({None:np.nan}, inplace = True)
# # Some preprocessing to correct data types and replace None with nans for pipeline imputer
# X.drop(["name","home.dest"], axis = 1, inplace = True)
# X["embarked"] = X["embarked"].astype("object")
# X["sex"] = X["sex"].astype("object")
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

#########               ###############
# that was I what I did it 
# step 0 
### missing_feat = data[data.columns[data.isnull().any()].tolist()]
#### missing_feat
# # Step 1: Separate the null values from the dataframe (data) and create a variable “test data”
# test_data = data[data[missing_feat.columns].isnull().any(axis=1)]
# # Step 2: Drop the null values from the dataframe (data) and represent them as ‘train data”
# data.dropna(inplace = True)
# # Step 3: Create “x_train” & “y_train” from train data.
# X_train = data.drop(missing_feat.columns, axis=1)
# # Step 3: Create “x_train” & “y_train” from train data
# y_train = data[missing_feat.columns]
# # necessary to covert to float
# X_train = X_train.reset_index()
# X_train = X_train.drop(['uuid'], axis=1)

In [1082]:
# # removing coulumn "uuid" from the dataset
# data_id = data['uuid']
# #_ = data.reset_index()
# #data = _.drop(['uuid'], axis=1)
# data.drop(['uuid'], axis=1, inplace=True)
# # setting target and removing the "default" from dataset
# target = data['default'].dropna()
# data.dropna(subset=['default'], axis=0, inplace=True)
# data.drop(['default'], axis=1, inplace=True)
# # isna and isnull were not recognizing "nan" 
# data = data.replace('nan', np.NaN)

In [1083]:
# # converting these columns to "object" type
# list_float_to_obj = ["account_status","account_worst_status_0_3m",
#                      "account_worst_status_12_24m", "account_worst_status_3_6m", "account_worst_status_6_12m",
#                      "status_last_archived_0_24m", "status_2nd_last_archived_0_24m","status_3rd_last_archived_0_24m",
#                      "status_max_archived_0_6_months","status_max_archived_0_12_months","status_max_archived_0_24_months",
#                      "has_paid"]

# _ = [data.__setitem__(feature, data[feature].astype("object")) for feature in list_float_to_obj]

# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

In [1084]:

# cat_prepro = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# cont_prepro = Pipeline([
#     ("imputer", KNNImputer()),
#     ("scaler", RobustScaler())
# ])

# preproc_baseline = ColumnTransformer([
#     ('num_encoder', cont_prepro, make_column_selector(dtype_include="int64")),
#     ('cat_encoder', cat_prepro, make_column_selector(dtype_include="object"))
# ])

# # Get column names after preprocessing
# column_names = preproc_baseline.fit(X_train).get_feature_names_out()

# # Create new DataFrame with transformed data and column names
# X_train_prepro = pd.DataFrame(preproc_baseline.transform(X_train), columns=column_names, dtype=np.float16)


In [1085]:
# # lets take a look at the preprocessed 
# shape_preproc_baseline = preproc_baseline.fit_transform(X_train).shape
# shape_preproc_baseline

In [1086]:
# tree_reg= DecisionTreeClassifier()
# pipe_baseline = Pipeline([("preprocessor",preproc_baseline),
#                   ('model_dec_three', DecisionTreeClassifier())])
# pipe_baseline

In [1087]:
# # Cross - Validate
# score_baseline = cross_val_score(pipe_baseline, X_train, y_train, cv=5, scoring="roc_auc", error_score='raise').mean()
# score_baseline

In [1088]:
# # Predict y_pred_baseline
# pipe_baseline.fit(X_train,y_train)
# y_pred_baseline = pipe_baseline.predict(X_test)
# y_prob_baseline = pipe_baseline.predict_proba(X_test)

In [1089]:
# import matplotlib.pyplot as plt
# from sklearn import metrics
# from sklearn.metrics import roc_curve


In [1090]:
# import sklearn
# import sys

# print("Python version:", sys.version)
# print("scikit-learn version:", sklearn.__version__)


In [1091]:
# r2_baseline = metrics.r2_score(y_test, y_pred_baseline)
# r2_baseline

In [1092]:
# f"we have got a very poor result of {round(r2_baseline,2)} for the baseline model"