# Understanding the Dataset

In [1]:
import pandas as pd

# Load the raw dataset; the relative path is resolved against the current working directory.
dataframe = pd.read_csv("data.csv")

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dataframe.shape

In [None]:
# Summary statistics as reported by DataFrame.describe().
dataframe.describe()

In [None]:
# Per-column count of missing values.
dataframe.isna().sum()

In [None]:
# Number of rows that pandas flags as duplicates.
dataframe.duplicated().sum()

# Data Cleanup

In [2]:
# dataframe.dropna(subset=['Attrition'], inplace=True)

In [None]:
# dataframe.fillna(dataframe.mean(numeric_only=True), inplace=True)

In [None]:
# dataframe.drop_duplicates(inplace=True)

# Creation of Input and Output Features

In [3]:
# 'Attrition' is the prediction target; every other column is an input feature.
Labels = dataframe['Attrition']
Features = dataframe.drop(columns='Attrition')

# Conversion of Categorical Features (One-Hot Encoding)

In [4]:
# Record which columns are numeric BEFORE one-hot encoding, so that only these
# (and not the 0/1 dummy columns created below) are rescaled in the scaling step.
# include='number' also picks up float and non-64-bit integer columns, which an
# include='int64' filter would leave unscaled.
# NOTE: this cell reassigns `Features`; re-running it on an already encoded frame
# will also list the dummy columns here. Re-run from the feature/label split.
numericColumns = Features.select_dtypes(include='number').columns

# One-hot encode the non-numeric columns. dtype=int makes the new dummy columns
# 0/1 integers without casting the whole frame to int, which would silently
# truncate any float-valued feature.
Features = pd.get_dummies(Features, dtype=int)

Features

# Scaling

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scalerMinMax = MinMaxScaler()
scalerStandard = StandardScaler()

scaler = 0  # 0 for MinMax, 1 for Standard

# Work on a copy: a plain `ScaledFeatures = Features` only creates an alias, so
# the column assignment below would overwrite the unscaled `Features` frame too
# and make this cell (and the encoding cell) unsafe to re-run.
ScaledFeatures = Features.copy()

# Any value other than 0 selects the standard scaler.
activeScaler = scalerMinMax if scaler == 0 else scalerStandard

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so held-out rows influence the scaling.
ScaledFeatures[numericColumns] = activeScaler.fit_transform(Features[numericColumns])

ScaledFeatures

In [6]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target column, then replace the labels with their
# integer codes. `encoder` is kept so the mapping can be inspected or inverted.
encoder = LabelEncoder().fit(Labels)
Labels = encoder.transform(Labels)

# Correlation

In [None]:
# Feature frame used by the correlation analysis below.
FeaturesDF = pd.DataFrame(ScaledFeatures, columns=Features.columns)

# Give the labels the same index as the features. Without an explicit index the
# label frame gets a fresh 0..n-1 index, which no longer lines up with the
# features once rows have been dropped (e.g. by the dropna / drop_duplicates
# cleanup cells), breaking corrwith() and label-based boolean masks.
LabelsDF = pd.DataFrame(Labels, columns=['Attrition'], index=FeaturesDF.index)


In [None]:
# Pull the encoded target out as a Series and correlate every feature column with it.
LabelsSeries = LabelsDF.loc[:, 'Attrition']
correlations = FeaturesDF.corrwith(other=LabelsSeries)
correlations

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all feature columns.
correlationMatrix = FeaturesDF.corr()

# Annotated heatmap: every cell shows its coefficient with two decimals.
heatmapOptions = dict(annot=True, fmt=".2f", cmap='cool')

plt.figure(figsize=(40, 32))
sns.heatmap(correlationMatrix, **heatmapOptions)
plt.title('Correlation Matrix')
plt.show()

In [None]:
import numpy as np
# Rank features by the absolute value of their correlation with the label and
# keep the names of the 20 strongest ones.
top20Features = correlations.abs().sort_values(ascending=False).head(20).index

# The per-class row split does not depend on the feature being plotted, so it
# is computed once instead of on every loop iteration.
# NOTE(review): the 'No'/'Yes' legend labels assume the encoder mapped
# 'No' -> 0 and 'Yes' -> 1 — confirm via encoder.classes_.
class_0 = FeaturesDF.loc[LabelsSeries == 0]
class_1 = FeaturesDF.loc[LabelsSeries == 1]

for feature in top20Features:
    # All points are drawn at y = 0; only the x position (the feature value)
    # carries information.
    plt.plot(class_0[feature], np.zeros_like(class_0[feature]), 'o', label='No')
    plt.plot(class_1[feature], np.zeros_like(class_1[feature]), 'o', label='Yes')

    plt.legend()
    plt.xlabel(feature)
    # f-string so the feature name is interpolated; without the prefix every
    # plot is titled with the literal text "{feature}".
    plt.title(f'1D Scatter Plot of {feature} by Numeric Classes')
    plt.show()

# Validation

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Model inputs are limited to the 20 selected features; the target is the encoded label.
# NOTE(review): both the scaling and the correlation-based feature ranking were
# computed on all rows before this split, so the test rows are not fully unseen.
X, y = ScaledFeatures[top20Features], Labels

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so `clf` is the trained model.
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Logistic Regression classifier: {accuracy:.2f}")

Accuracy of Logistic Regression classifier: 0.87
