In [None]:
# This notebook will be for clustering modeling (Unsupervised Learning)
# Mini Kmeans, maybe DBSCAN(Density-Based Spatial Clustering of Applications with Noise)(GOWER) to detect noise
# Importing the libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant
from sklearn.decomposition import PCA

In [8]:
# importing data

from dotenv import load_dotenv

load_dotenv()

dataset_path = os.getenv('CLEANED_DATA_PATH')

df_cleaned = pd.read_csv(dataset_path)

In [4]:
# Can be useful in segmentation and stratification. Annomalies can be detected using clustering algorithms, e.g. fraud detection
# Could find some hidden patterns in the data, from there can go back to classification and add more features to the model
# Preprocessing: scaling, encoding, flag missingness (not same as imputation), outlier detection

df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307507 entries, 0 to 307506
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307507 non-null  int64  
 1   TARGET                       307507 non-null  int64  
 2   NAME_CONTRACT_TYPE           307507 non-null  object 
 3   CODE_GENDER                  307507 non-null  object 
 4   FLAG_OWN_CAR                 307507 non-null  object 
 5   FLAG_OWN_REALTY              307507 non-null  object 
 6   CNT_CHILDREN                 307507 non-null  int64  
 7   AMT_GOODS_PRICE              307229 non-null  float64
 8   NAME_TYPE_SUITE              306215 non-null  object 
 9   NAME_INCOME_TYPE             307507 non-null  object 
 10  NAME_EDUCATION_TYPE          307507 non-null  object 
 11  NAME_FAMILY_STATUS           307507 non-null  object 
 12  NAME_HOUSING_TYPE            307507 non-null  object 
 13 

# PREPROCESSING - SCALING, ENCODING, PCA, FLAG NULLS

In [12]:
# dropping the target and id columns
df_cleaned = df_cleaned.drop(['SK_ID_CURR', 'TARGET'], axis=1)

In [13]:
# creating list of categorical columns
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns.tolist()
categorical_cols

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'AMT_CREDIT_RANGE']

In [14]:
# creating list of numerical columns
numerical_cols = df_cleaned.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols

['CNT_CHILDREN',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_3',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'CREDT_INCOME_RATIO',
 'EMP_AGE_RATIO',
 'ANNUITY_INCOME_RATIO']

In [17]:
df_num = df_cleaned[numerical_cols]

In [23]:
scaler = RobustScaler()
df_num = scaler.fit_transform(df_num)

imputer = SimpleImputer(strategy='median', add_indicator=True)

df_num = imputer.fit_transform(df_num)

df_num = pd.DataFrame(df_num)

In [24]:
# Multicollinearity detection
vif = pd.DataFrame()

# point here suspicious variables or just all variables
df_vif = add_constant(df_num)
vif["VIF"] = [variance_inflation_factor(df_vif.values, i) for i in range(df_vif.shape[1])]
vif["variables"] = df_vif.columns

print(vif)
# vif 1-5 - ok, there is no strong correlation
# if vif more than 5 - there is strong correlation
# if vif is inf - there is extremely high correlation (linear dependence)

  vif = 1. / (1. - r_squared_i)


          VIF variables
0    4.582794     const
1    4.553388         0
2    1.974923         1
3    1.417656         2
4    1.096292         3
5    1.141985         4
6   52.924449         5
7    1.078651         6
8    4.554773         7
9    1.565994         8
10   1.100579         9
11   2.428751        10
12   9.051767        11
13   7.339911        12
14   2.427481        13
15   7.479485        14
16   6.015409        15
17   1.180679        16
18   1.054605        17
19   4.065055        18
20   1.133214        19
21   3.870020        20
22   1.072305        21
23   1.122816        22
24   1.001281        23
25   4.225966        24
26  51.607520        25
27   2.908923        26
28   1.010488        27
29   1.007317        28
30   1.002699        29
31   2.728607        30
32        inf        31
33        inf        32
34        inf        33
35   1.001565        34
36   2.726269        35
37   1.000219        36


In [None]:

# Feature scaling
print(f'Applying feature scaling for X_train...')

# Scale numerical features
scaler = RobustScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
print(f'Scaled {len(numerical_cols)} numerical columns for X_train')


#Imputing missing values
print(f'Imputing missing values for X_train...')
imputer = ColumnTransformer(transformers=
                        [('numimp', SimpleImputer(strategy='median', add_indicator=True), numerical_cols),
                        ('catimp', SimpleImputer(strategy='most_frequent', add_indicator=True), categorical_cols)], 
                        remainder="passthrough").set_output(transform='pandas')
# num_imputer = SimpleImputer(strategy='mean', add_indicator=True)
X_train = imputer.fit_transform(X_train)
print(f'Imputed {len(numerical_cols)} numerical columns & {len(categorical_cols)} categorical columns for X_train')

# Verify no missing values remain
missing_after = X_train.isnull().sum().sum()
print(f'Missing values after imputation: {missing_after} for X_train')


categorical_cols_imp = X_train.select_dtypes(include=['object']).columns.tolist()
print(f'Categorical Columns: {len(categorical_cols_imp)}')

# finding cat columns that are binary
binary_cat_col_imp = []
for col in categorical_cols_imp:
    if X_train[col].nunique() == 2:
        binary_cat_col_imp.append(col)
print(f'Binary Categorical Columns: {len(binary_cat_col_imp)}')

# removing binary columns from list before OHE
for col in binary_cat_col_imp:
    categorical_cols_imp.remove(col)
print(f'Multiclass Categorical Columns: {len(categorical_cols_imp)}')
print('\n')
        

# label encoding binary categorical variables, converting to 0 or 1
le = LabelEncoder()
for col in binary_cat_col_imp:
    X_train[col] = le.fit_transform(X_train[col])

# onehot encoding multiclass category variables
ct = ColumnTransformer(transformers=[('ohe', OneHotEncoder(sparse_output=False), categorical_cols_imp)], 
                    remainder="passthrough").set_output(transform='pandas')

X_train = ct.fit_transform(X_train)



In [None]:
# ENCODING:


# DBSCAN
