In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))  

from src.data_loading_1 import data_loading
from src.data_cleaning_2 import data_cleaning
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = data_loading()
df = data_cleaning(df)
df.head()

Data loaded successfully.....
Data is cleaned...


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,619,France,Female,42.0,2,101348.88,0.0,1,Yes,Yes,1
1,619,France,Female,42.0,2,101348.88,0.0,1,Yes,Yes,1
2,608,Spain,Female,41.0,1,112542.58,83807.86,1,Yes,Yes,0
3,502,France,Female,42.0,8,113931.57,159660.8,3,No,No,1
4,699,France,Female,39.0,1,93826.63,0.0,2,No,No,0


In [3]:
numerical = []
categorical = []
for col in df.columns:
    if df[col].dtype == 'object':
        categorical.append(col)
    else:
        numerical.append(col)

In [4]:
if df.duplicated().sum() > 0:
    print(f"DataFrame has {df.duplicated().sum()} duplicate rows")
    df = df.drop_duplicates()
    print(f"DataFrame has {df.duplicated().sum()} duplicate rows after dropping duplicates")

DataFrame has 4 duplicate rows
DataFrame has 0 duplicate rows after dropping duplicates


In [5]:
for col in df.columns:
    if df[col].isnull().sum() > 0:
        if df[col].dtype == 'object':
            print(f"Column {col} has {df[col].isnull().sum()} missing values")
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            print(f"Column {col} has {df[col].isnull().sum()} missing values")
            df[col] = df[col].fillna(round(df[col].mean()))
            print(f"Column {col} has been filled with mean value: {round(df[col].mean())}")

Column Age has 3 missing values
Column Age has been filled with mean value: 39
Column EstimatedSalary has 3 missing values
Column EstimatedSalary has been filled with mean value: 100092


In [6]:
numerical

['CreditScore',
 'Age',
 'Tenure',
 'EstimatedSalary',
 'Balance',
 'NumOfProducts',
 'Exited']

In [7]:
def outliers(df):
    for col in df.columns:
        if col in numerical[:-1]:
            if col != 'Age':
                q1 = df[col].quantile(0.25)
                q3 = df[col].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
                print(f"Column {col} has {len(outliers)} outliers")
                if outliers.shape[0] > 0:
                    df[col] = df[col].clip(lower_bound, upper_bound)
                    print(f"Outliers in column {col} have been clipped to the lower and upper bounds")
    return df

df_cleaned = outliers(df)

Column CreditScore has 15 outliers
Outliers in column CreditScore have been clipped to the lower and upper bounds
Column Tenure has 0 outliers
Column EstimatedSalary has 0 outliers
Column Balance has 0 outliers
Column NumOfProducts has 60 outliers
Outliers in column NumOfProducts have been clipped to the lower and upper bounds


In [8]:
X, y = df_cleaned.drop(columns=['Exited']), df_cleaned['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
categorical

['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

In [10]:
import joblib

X_train_dummies = pd.get_dummies(X_train, columns=categorical, drop_first=True)
X_test_dummies = pd.get_dummies(X_test, columns=categorical, drop_first=True)

X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

joblib.dump(X_train_dummies.columns.tolist(), '/home/user/churniq/artifacts/encoded_columns.pkl')

['/home/user/churniq/artifacts/encoded_columns.pkl']

In [11]:
from sklearn.preprocessing import StandardScaler

X_train_std = X_train_dummies.copy()
X_test_std = X_test_dummies.copy()

std = StandardScaler()

X_train_std = std.fit_transform(X_train_std)
X_test_std = std.transform(X_test_std)

joblib.dump(std, '/home/user/churniq/artifacts/scaler_std.pkl')

['/home/user/churniq/artifacts/scaler_std.pkl']

In [18]:
a = (df['Exited'].value_counts()*100/df.shape[0])[1]

In [25]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

if (df['Exited'].value_counts()*100/df.shape[0])[1] != (df['Exited'].value_counts()*100/df.shape[0])[0]:
    smote = SMOTETomek(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)

In [26]:
def data_preprocessing(df):
    numerical = []
    categorical = []
    for col in df.columns:
        if df[col].dtype == 'object':
            categorical.append(col)
        else:
            numerical.append(col)
    print("Checking for duplicates...")
    if df.duplicated().sum() > 0:
        print(f"DataFrame has {df.duplicated().sum()} duplicate rows")
        df = df.drop_duplicates()
        print(f"DataFrame has {df.duplicated().sum()} duplicate rows after dropping duplicates")

    print('Checking for missing values')
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if df[col].dtype == 'object':
                print(f"Column {col} has {df[col].isnull().sum()} missing values")
                df[col] = df[col].fillna(df[col].mode()[0])
            else:
                print(f"Column {col} has {df[col].isnull().sum()} missing values")
                df[col] = df[col].fillna(round(df[col].mean()))
                print(f"Column {col} has been filled with mean value: {round(df[col].mean())}")

    def outliers(df):
        for col in df.columns:
            if col in numerical[:-1]:
                if col != 'Age':
                    q1 = df[col].quantile(0.25)
                    q3 = df[col].quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
                    print(f"Column {col} has {len(outliers)} outliers")
                    if outliers.shape[0] > 0:
                        df[col] = df[col].clip(lower_bound, upper_bound)
                        print(f"Outliers in column {col} have been clipped to the lower and upper bounds")
        return df

    df_cleaned = outliers(df)
    
    X, y = df_cleaned.drop(columns=['Exited']), df_cleaned['Exited']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_dummies = pd.get_dummies(X_train, columns=categorical, drop_first=True)
    X_test_dummies = pd.get_dummies(X_test, columns=categorical, drop_first=True)

    X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

    joblib.dump(X_train_dummies.columns.tolist(), '/home/user/churniq/artifacts/encoded_columns.pkl')

    X_train_std = X_train_dummies.copy()
    X_test_std = X_test_dummies.copy()

    std = StandardScaler()

    X_train_std = std.fit_transform(X_train_std)
    X_test_std = std.transform(X_test_std)

    joblib.dump(std, '/home/user/churniq/artifacts/scaler_std.pkl')
    
    if (df['Exited'].value_counts()*100/df.shape[0])[1] != (df['Exited'].value_counts()*100/df.shape[0])[0]:
        smote = SMOTETomek(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_std, y_train)
    
    return X_train_resampled, X_test_std, y_train_resampled, y_test

In [27]:
X_train,X_test,y_train,y_test = data_preprocessing(df)

Checking for duplicates...
Checking for missing values
Column CreditScore has 0 outliers
Column Tenure has 0 outliers
Column EstimatedSalary has 0 outliers
Column Balance has 0 outliers
Column NumOfProducts has 0 outliers
