In [None]:
# This notebook is for clustering models (unsupervised learning).
# Planned: mini-batch K-Means, and maybe DBSCAN (Density-Based Spatial Clustering of Applications with Noise), possibly with Gower distance, to detect noise.
# Importing the libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.decomposition import PCA

In [25]:
# Importing data: the CSV location is read from the CLEANED_DATA_PATH environment variable.

from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

dataset_path = os.getenv('CLEANED_DATA_PATH')

# os.getenv returns None when the variable is unset; fail here with a clear
# message instead of letting pd.read_csv raise an opaque error on a None path.
if not dataset_path:
    raise RuntimeError(
        "CLEANED_DATA_PATH is not set; define it in the environment or in a .env file."
    )

df_cleaned = pd.read_csv(dataset_path)

In [26]:
# Clustering can be useful for segmentation and stratification. Anomalies can be detected with clustering algorithms, e.g. fraud detection.
# It could reveal hidden patterns in the data; from there we can go back to classification and add more features to the model.
# Preprocessing: scaling, encoding, flagging missingness (not the same as imputation), outlier detection

df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307507 entries, 0 to 307506
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307507 non-null  int64  
 1   TARGET                       307507 non-null  int64  
 2   NAME_CONTRACT_TYPE           307507 non-null  object 
 3   CODE_GENDER                  307507 non-null  object 
 4   FLAG_OWN_CAR                 307507 non-null  object 
 5   FLAG_OWN_REALTY              307507 non-null  object 
 6   CNT_CHILDREN                 307507 non-null  int64  
 7   AMT_GOODS_PRICE              307229 non-null  float64
 8   NAME_TYPE_SUITE              306215 non-null  object 
 9   NAME_INCOME_TYPE             307507 non-null  object 
 10  NAME_EDUCATION_TYPE          307507 non-null  object 
 11  NAME_FAMILY_STATUS           307507 non-null  object 
 12  NAME_HOUSING_TYPE            307507 non-null  object 
 13 

# PREPROCESSING - SCALING, ENCODING, PCA, FLAG NULLS

In [27]:
# Remove the identifier (SK_ID_CURR) and the label (TARGET) columns before clustering.
df_cleaned = df_cleaned.drop(columns=['SK_ID_CURR', 'TARGET'])

In [28]:
# Names of all object-dtype columns, treated here as the categorical features.
categorical_cols = list(df_cleaned.select_dtypes(include='object').columns)
categorical_cols

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'AMT_CREDIT_RANGE']

In [29]:
# Names of all int64/float64 columns, treated here as the numerical features.
numeric_dtypes = ['int64', 'float64']
numerical_cols = df_cleaned.select_dtypes(include=numeric_dtypes).columns.to_list()
numerical_cols

['CNT_CHILDREN',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_3',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'CREDT_INCOME_RATIO',
 'EMP_AGE_RATIO',
 'ANNUITY_INCOME_RATIO']

In [66]:
# Keep only the non-categorical columns; df_cleaned itself is left untouched.
df_num = df_cleaned.drop(columns=categorical_cols)

In [67]:
# Display the frame left after dropping the categorical columns (before scaling).
df_num

Unnamed: 0,CNT_CHILDREN,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,FLAG_WORK_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,...,EXT_SOURCE_3,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_HOUR,CREDT_INCOME_RATIO,EMP_AGE_RATIO,ANNUITY_INCOME_RATIO
0,0,351000.0,0.018801,3648.0,2120,1,0,1.0,2,10,...,0.139376,2.0,2.0,2.0,1134.0,1,0.0,2.007889,0.069808,0.121978
1,0,1129500.0,0.003541,1186.0,291,1,0,2.0,1,11,...,,0.0,1.0,0.0,828.0,1,0.0,4.790750,0.072329,0.132217
2,0,135000.0,0.010032,4260.0,2531,1,1,1.0,2,9,...,0.729567,0.0,0.0,0.0,815.0,0,0.0,2.000000,0.011855,0.100000
3,0,297000.0,0.008019,9833.0,2437,1,0,2.0,2,17,...,,0.0,2.0,0.0,617.0,1,,2.316167,0.160116,0.219900
4,0,513000.0,0.028663,4311.0,3458,1,0,1.0,2,11,...,,0.0,0.0,0.0,1106.0,0,0.0,4.222222,0.154135,0.179963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0,225000.0,0.032561,8456.0,1982,1,0,1.0,1,15,...,,0.0,0.0,0.0,273.0,0,,1.617143,0.025863,0.174971
307503,0,225000.0,0.025164,4388.0,4090,0,0,1.0,2,8,...,,0.0,0.0,0.0,0.0,1,,3.743750,17.869031,0.166687
307504,0,585000.0,0.005002,6737.0,5150,1,0,1.0,3,9,...,0.218859,0.0,6.0,0.0,1909.0,1,1.0,4.429176,0.529302,0.195941
307505,0,319500.0,0.005313,2562.0,931,1,0,2.0,2,9,...,0.661024,0.0,0.0,0.0,322.0,1,0.0,2.164368,0.409760,0.118158


In [68]:
# Scale the numerical columns with RobustScaler and write the result back into df_num.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(df_num[numerical_cols])
df_num[numerical_cols] = scaled_values

In [69]:
# Display df_num after RobustScaler was applied to the numerical columns.
df_num

Unnamed: 0,CNT_CHILDREN,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,FLAG_WORK_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,...,EXT_SOURCE_3,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_HOUR,CREDT_INCOME_RATIO,EMP_AGE_RATIO,ANNUITY_INCOME_RATIO
0,0.0,-0.224490,-0.002626,-0.156504,-0.439705,0.0,0.0,-1.0,0.0,-0.50,...,-1.326713,2.0,1.0,2.0,0.290895,0.0,0.0,-0.400217,-0.290838,-0.357494
1,0.0,1.540816,-0.820550,-0.606637,-1.148895,0.0,0.0,0.0,-1.0,-0.25,...,,0.0,0.5,0.0,0.054784,0.0,0.0,0.485694,-0.282515,-0.267902
2,0.0,-0.714286,-0.472638,-0.044611,-0.280341,0.0,1.0,-1.0,0.0,-0.75,...,0.651092,0.0,0.0,0.0,0.044753,-1.0,0.0,-0.402728,-0.482196,-0.549803
3,0.0,-0.346939,-0.580533,0.974312,-0.316789,0.0,0.0,0.0,0.0,1.25,...,,0.0,1.0,0.0,-0.108025,0.0,,-0.302078,0.007352,0.499344
4,0.0,0.142857,0.525969,-0.035287,0.079100,0.0,0.0,-1.0,0.0,-0.25,...,,0.0,0.0,0.0,0.269290,-1.0,0.0,0.304706,-0.012397,0.149887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0.0,-0.510204,0.734898,0.722552,-0.493214,0.0,0.0,-1.0,-1.0,0.75,...,,0.0,0.0,0.0,-0.373457,-1.0,,-0.524609,-0.435941,0.106211
307503,0.0,-0.510204,0.338425,-0.021209,0.324157,-1.0,0.0,-1.0,0.0,-1.00,...,,0.0,0.0,0.0,-0.584105,0.0,,0.152387,58.480877,0.033725
307504,0.0,0.306122,-0.742242,0.408264,0.735169,0.0,0.0,-1.0,1.0,-0.75,...,-1.060354,0.0,3.0,0.0,0.888889,0.0,1.0,0.370589,1.226376,0.289700
307505,0.0,-0.295918,-0.725572,-0.355060,-0.900737,0.0,0.0,0.0,0.0,-0.75,...,0.421395,0.0,0.0,0.0,-0.335648,0.0,0.0,-0.350402,0.831659,-0.390918


In [70]:
# Replace missing values in the numerical columns with each column's median.
# No missing-value indicator columns are added (add_indicator=False); any other
# column is passed through unchanged. With pandas output enabled the result is a
# DataFrame whose imputed columns carry the 'numimp__' prefix.
imputer = ColumnTransformer(
    transformers=[
        ('numimp', SimpleImputer(strategy='median', add_indicator=False), numerical_cols),
    ],
    remainder="passthrough",
)
imputer.set_output(transform='pandas')

df_num = imputer.fit_transform(df_num)

In [71]:
# Display df_num after median imputation; column names now carry the 'numimp__' prefix.
df_num

Unnamed: 0,numimp__CNT_CHILDREN,numimp__AMT_GOODS_PRICE,numimp__REGION_POPULATION_RELATIVE,numimp__DAYS_REGISTRATION,numimp__DAYS_ID_PUBLISH,numimp__FLAG_EMP_PHONE,numimp__FLAG_WORK_PHONE,numimp__CNT_FAM_MEMBERS,numimp__REGION_RATING_CLIENT_W_CITY,numimp__HOUR_APPR_PROCESS_START,...,numimp__EXT_SOURCE_3,numimp__DEF_30_CNT_SOCIAL_CIRCLE,numimp__OBS_60_CNT_SOCIAL_CIRCLE,numimp__DEF_60_CNT_SOCIAL_CIRCLE,numimp__DAYS_LAST_PHONE_CHANGE,numimp__FLAG_DOCUMENT_3,numimp__AMT_REQ_CREDIT_BUREAU_HOUR,numimp__CREDT_INCOME_RATIO,numimp__EMP_AGE_RATIO,numimp__ANNUITY_INCOME_RATIO
0,0.0,-0.224490,-0.002626,-0.156504,-0.439705,0.0,0.0,-1.0,0.0,-0.50,...,-1.326713,2.0,1.0,2.0,0.290895,0.0,0.0,-0.400217,-0.290838,-0.357494
1,0.0,1.540816,-0.820550,-0.606637,-1.148895,0.0,0.0,0.0,-1.0,-0.25,...,0.000000,0.0,0.5,0.0,0.054784,0.0,0.0,0.485694,-0.282515,-0.267902
2,0.0,-0.714286,-0.472638,-0.044611,-0.280341,0.0,1.0,-1.0,0.0,-0.75,...,0.651092,0.0,0.0,0.0,0.044753,-1.0,0.0,-0.402728,-0.482196,-0.549803
3,0.0,-0.346939,-0.580533,0.974312,-0.316789,0.0,0.0,0.0,0.0,1.25,...,0.000000,0.0,1.0,0.0,-0.108025,0.0,0.0,-0.302078,0.007352,0.499344
4,0.0,0.142857,0.525969,-0.035287,0.079100,0.0,0.0,-1.0,0.0,-0.25,...,0.000000,0.0,0.0,0.0,0.269290,-1.0,0.0,0.304706,-0.012397,0.149887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0.0,-0.510204,0.734898,0.722552,-0.493214,0.0,0.0,-1.0,-1.0,0.75,...,0.000000,0.0,0.0,0.0,-0.373457,-1.0,0.0,-0.524609,-0.435941,0.106211
307503,0.0,-0.510204,0.338425,-0.021209,0.324157,-1.0,0.0,-1.0,0.0,-1.00,...,0.000000,0.0,0.0,0.0,-0.584105,0.0,0.0,0.152387,58.480877,0.033725
307504,0.0,0.306122,-0.742242,0.408264,0.735169,0.0,0.0,-1.0,1.0,-0.75,...,-1.060354,0.0,3.0,0.0,0.888889,0.0,1.0,0.370589,1.226376,0.289700
307505,0.0,-0.295918,-0.725572,-0.355060,-0.900737,0.0,0.0,0.0,0.0,-0.75,...,0.421395,0.0,0.0,0.0,-0.335648,0.0,0.0,-0.350402,0.831659,-0.390918


In [72]:
# Multicollinearity detection via the variance inflation factor (VIF)

# All columns of df_num are checked; narrow df_num here to test only suspicious variables.
df_vif = add_constant(df_num)
design_matrix = df_vif.values

vif = pd.DataFrame(
    {
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
        "variables": df_vif.columns,
    }
)

print(vif)
# Reading the VIF values:
#   1-5   -> ok, no strong correlation
#   > 5   -> strong correlation
#   inf   -> extremely high correlation (linear dependence)

          VIF                            variables
0    4.352047                                const
1    4.551642                 numimp__CNT_CHILDREN
2    1.971280              numimp__AMT_GOODS_PRICE
3    1.416663   numimp__REGION_POPULATION_RELATIVE
4    1.096243            numimp__DAYS_REGISTRATION
5    1.133248              numimp__DAYS_ID_PUBLISH
6   52.906738               numimp__FLAG_EMP_PHONE
7    1.075683              numimp__FLAG_WORK_PHONE
8    4.550826              numimp__CNT_FAM_MEMBERS
9    1.565395  numimp__REGION_RATING_CLIENT_W_CITY
10   1.099612      numimp__HOUR_APPR_PROCESS_START
11   2.428668   numimp__REG_REGION_NOT_LIVE_REGION
12   9.051204   numimp__REG_REGION_NOT_WORK_REGION
13   7.339518  numimp__LIVE_REGION_NOT_WORK_REGION
14   2.427155       numimp__REG_CITY_NOT_LIVE_CITY
15   7.479011       numimp__REG_CITY_NOT_WORK_CITY
16   6.015205      numimp__LIVE_CITY_NOT_WORK_CITY
17   1.179457                 numimp__EXT_SOURCE_2
18   1.049982                 n

In [73]:
# Remove the columns whose VIF exceeded 5 in the multicollinearity check.
high_vif_cols = [
    'numimp__FLAG_EMP_PHONE',
    'numimp__REG_REGION_NOT_WORK_REGION',
    'numimp__REG_CITY_NOT_WORK_CITY',
    'numimp__LIVE_CITY_NOT_WORK_CITY',
]
df_num = df_num.drop(columns=high_vif_cols)

In [74]:
# Display df_num after the four high-VIF columns were dropped.
df_num

Unnamed: 0,numimp__CNT_CHILDREN,numimp__AMT_GOODS_PRICE,numimp__REGION_POPULATION_RELATIVE,numimp__DAYS_REGISTRATION,numimp__DAYS_ID_PUBLISH,numimp__FLAG_WORK_PHONE,numimp__CNT_FAM_MEMBERS,numimp__REGION_RATING_CLIENT_W_CITY,numimp__HOUR_APPR_PROCESS_START,numimp__REG_REGION_NOT_LIVE_REGION,...,numimp__EXT_SOURCE_3,numimp__DEF_30_CNT_SOCIAL_CIRCLE,numimp__OBS_60_CNT_SOCIAL_CIRCLE,numimp__DEF_60_CNT_SOCIAL_CIRCLE,numimp__DAYS_LAST_PHONE_CHANGE,numimp__FLAG_DOCUMENT_3,numimp__AMT_REQ_CREDIT_BUREAU_HOUR,numimp__CREDT_INCOME_RATIO,numimp__EMP_AGE_RATIO,numimp__ANNUITY_INCOME_RATIO
0,0.0,-0.224490,-0.002626,-0.156504,-0.439705,0.0,-1.0,0.0,-0.50,0.0,...,-1.326713,2.0,1.0,2.0,0.290895,0.0,0.0,-0.400217,-0.290838,-0.357494
1,0.0,1.540816,-0.820550,-0.606637,-1.148895,0.0,0.0,-1.0,-0.25,0.0,...,0.000000,0.0,0.5,0.0,0.054784,0.0,0.0,0.485694,-0.282515,-0.267902
2,0.0,-0.714286,-0.472638,-0.044611,-0.280341,1.0,-1.0,0.0,-0.75,0.0,...,0.651092,0.0,0.0,0.0,0.044753,-1.0,0.0,-0.402728,-0.482196,-0.549803
3,0.0,-0.346939,-0.580533,0.974312,-0.316789,0.0,0.0,0.0,1.25,0.0,...,0.000000,0.0,1.0,0.0,-0.108025,0.0,0.0,-0.302078,0.007352,0.499344
4,0.0,0.142857,0.525969,-0.035287,0.079100,0.0,-1.0,0.0,-0.25,0.0,...,0.000000,0.0,0.0,0.0,0.269290,-1.0,0.0,0.304706,-0.012397,0.149887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0.0,-0.510204,0.734898,0.722552,-0.493214,0.0,-1.0,-1.0,0.75,0.0,...,0.000000,0.0,0.0,0.0,-0.373457,-1.0,0.0,-0.524609,-0.435941,0.106211
307503,0.0,-0.510204,0.338425,-0.021209,0.324157,0.0,-1.0,0.0,-1.00,0.0,...,0.000000,0.0,0.0,0.0,-0.584105,0.0,0.0,0.152387,58.480877,0.033725
307504,0.0,0.306122,-0.742242,0.408264,0.735169,0.0,-1.0,1.0,-0.75,0.0,...,-1.060354,0.0,3.0,0.0,0.888889,0.0,1.0,0.370589,1.226376,0.289700
307505,0.0,-0.295918,-0.725572,-0.355060,-0.900737,0.0,0.0,0.0,-0.75,0.0,...,0.421395,0.0,0.0,0.0,-0.335648,0.0,0.0,-0.350402,0.831659,-0.390918


In [75]:
# Multicollinearity detection: recompute the VIF on the current df_num

# All columns of df_num are checked; narrow df_num here to test only suspicious variables.
df_vif = add_constant(df_num)
vif_scores = [
    variance_inflation_factor(df_vif.values, position)
    for position in range(len(df_vif.columns))
]
vif = pd.DataFrame({"VIF": vif_scores, "variables": df_vif.columns})

print(vif)

         VIF                            variables
0   4.025164                                const
1   4.547696                 numimp__CNT_CHILDREN
2   1.963981              numimp__AMT_GOODS_PRICE
3   1.413996   numimp__REGION_POPULATION_RELATIVE
4   1.092057            numimp__DAYS_REGISTRATION
5   1.105969              numimp__DAYS_ID_PUBLISH
6   1.070220              numimp__FLAG_WORK_PHONE
7   4.542888              numimp__CNT_FAM_MEMBERS
8   1.564306  numimp__REGION_RATING_CLIENT_W_CITY
9   1.098939      numimp__HOUR_APPR_PROCESS_START
10  1.152151   numimp__REG_REGION_NOT_LIVE_REGION
11  1.044908  numimp__LIVE_REGION_NOT_WORK_REGION
12  1.153400       numimp__REG_CITY_NOT_LIVE_CITY
13  1.175595                 numimp__EXT_SOURCE_2
14  1.049501                 numimp__EXT_SOURCE_3
15  4.064887     numimp__DEF_30_CNT_SOCIAL_CIRCLE
16  1.132232     numimp__OBS_60_CNT_SOCIAL_CIRCLE
17  3.869816     numimp__DEF_60_CNT_SOCIAL_CIRCLE
18  1.067072       numimp__DAYS_LAST_PHONE_CHANGE


In [77]:
# Fit DBSCAN on the preprocessed numeric features.
# NOTE(review): the recorded output of this cell shows 0 clusters and all 307507
# rows labelled as noise, so eps=0.3 / min_samples=10 looks too strict for this
# feature space - TODO revisit these parameters.
db = DBSCAN(eps=0.3, min_samples=10)
db.fit(df_num)
labels = db.labels_

# Noise points carry the label -1; every other distinct label is one cluster.
is_noise = labels == -1
n_clusters_ = len(np.unique(labels[~is_noise]))
n_noise_ = int(np.count_nonzero(is_noise))

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 0
Estimated number of noise points: 307507


In [76]:
# Reduce dimensionality with PCA, keeping as many components as needed to
# explain 95% of the variance (n_components given as a fraction).
pca = PCA(n_components=0.95)
numpy_arr = pca.fit_transform(df_num)

# fit_transform returns a 2-D array of shape (n_samples, n_components).
# Keep that shape: flattening it would interleave all components of all rows
# into a single column and lose the row/component structure needed downstream.
df_reduced = pd.DataFrame(
    data=numpy_arr,
    index=df_num.index,  # keep rows aligned with df_num
    columns=[f"PC{i + 1}" for i in range(numpy_arr.shape[1])],
)

In [None]:
# ENCODING:

# DBSCAN
