#Imports

In [92]:
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import math

In [93]:
data = pd.read_csv('/content/Churn_Modelling.csv')

#Classes

##PreprocessingEvaluator

In [114]:
import numpy as np

class PreprocessingEvaluator:
    
    def __init__(self, data):
        self.data = data
        
    def check_for_nan(self):
        has_nan = self.data.isna().any().any()
        if has_nan:
            print("Dataset contains NaN values")
        else:
            print("Dataset does not contain NaN values")
            
    def check_for_infinity(self):
        has_inf = self.data.replace([np.inf, -np.inf], np.nan).isna().any().any()
        if has_inf:
            print("Dataset contains infinite values")
        else:
            print("Dataset does not contain infinite values")

    def check_for_outliers(self, threshold):
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            has_outliers = (np.abs(self.data[col]) > threshold).any()
            if has_outliers:
                print(f"{col} column contains outliers above the threshold of {threshold}.")

    def check_for_skewness(self):
        skewness = self.data.skew()
        if (skewness > 0.5).any():
            skewed_cols = skewness[skewness > 0.5].index.tolist()
            print("The following columns are right-skewed:", skewed_cols)
        elif (skewness < -0.5).any():
            skewed_cols = skewness[skewness < -0.5].index.tolist()
            print("The following columns are left-skewed:", skewed_cols)
        else:
            print("No skewness found in dataset.")

#Implementation

##Instantiate objects

In [134]:
# Initialize PreprocessingEvaluator with the dataset
evaluator = PreprocessingEvaluator(data)

##Evaluate Preprocessing

In [135]:
# check for NaN values
evaluator.check_for_nan()

Dataset does not contain NaN values


In [136]:
# check for infinite values
evaluator.check_for_infinity()

Dataset does not contain infinite values


In [137]:
data.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [138]:
evaluator.check_for_outliers(threshold=6)

RowNumber column contains outliers above the threshold of 6.
CustomerId column contains outliers above the threshold of 6.
CreditScore column contains outliers above the threshold of 6.
Age column contains outliers above the threshold of 6.
Tenure column contains outliers above the threshold of 6.
Balance column contains outliers above the threshold of 6.
EstimatedSalary column contains outliers above the threshold of 6.


In [139]:
evaluator.check_for_skewness()

The following columns are right-skewed: ['Age', 'NumOfProducts', 'Exited']


  skewness = self.data.skew()
