# Loading the datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Directory that holds the three raw CSV inputs. Change this single constant
# when the files live somewhere else instead of editing every path below.
DATA_DIR = '/content'

# Small healthcare attrition dataset (split into train/test further down).
small_df = pd.read_csv(f'{DATA_DIR}/watson_healthcare_modified.csv')
# Large attrition dataset, which already ships as separate train and test files.
large_train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
large_test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Keep only the rows of the large dataset whose 'Job Role' is 'Healthcare'.
# `.copy()` makes each result an independent frame: later cells modify these
# frames, and doing that on a boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
large_train_df = large_train_df.loc[large_train_df['Job Role'] == 'Healthcare'].copy()
large_test_df = large_test_df.loc[large_test_df['Job Role'] == 'Healthcare'].copy()

In [None]:
# Show the first row to inspect the column layout of the small dataset.
small_df.head(n=1)

Unnamed: 0,EmployeeID,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1313919,41,No,Travel_Rarely,1102,Cardiology,1,2,Life Sciences,1,...,1,80,0,8,0,1,6,4,0,5


In [None]:
# (rows, columns) of the filtered large train and test frames, in that order.
tuple(frame.shape for frame in (large_train_df, large_test_df))

((13642, 24), (3432, 24))

In [None]:
# (rows, columns) of the small dataset before any column is dropped.
small_df.shape

(1676, 35)

In [None]:
# Remove the employee ID column from every frame. Note the two datasets spell
# it differently ('EmployeeID' vs 'Employee ID').
# The drop is done by reassignment rather than `inplace=True` so it never
# mutates a filtered slice, and `errors='ignore'` lets this cell be re-run
# after the column is already gone without raising KeyError.
small_df = small_df.drop(columns='EmployeeID', errors='ignore')
large_train_df = large_train_df.drop(columns='Employee ID', errors='ignore')
large_test_df = large_test_df.drop(columns='Employee ID', errors='ignore')

This step performs a stratified split of the small dataset: 70% of the rows with Attrition 'Yes' go to the training set and the remaining 30% go to the test set. The same 70/30 split is applied to the rows with Attrition 'No'.

In [None]:
# Partition the small dataset by its target label so each class can be
# split into train/test separately.
attrition_yes_df = small_df.loc[small_df['Attrition'].eq('Yes')]
attrition_no_df = small_df.loc[small_df['Attrition'].eq('No')]

In [None]:
# Share of each Attrition class that goes to the training split, and the seed
# used for the post-split shuffles. Named once here instead of being repeated.
TRAIN_FRACTION = 0.7
SHUFFLE_SEED = 42

# Number of rows of each class assigned to training (int() truncates).
train_yes_rows = int(len(attrition_yes_df) * TRAIN_FRACTION)
train_no_rows = int(len(attrition_no_df) * TRAIN_FRACTION)

# NOTE(review): the split is positional - the first TRAIN_FRACTION of each
# class, in the current row order, goes to train and the rest to test. Rows
# are only shuffled after the split, so confirm the row order carries no
# signal.
small_train_df = pd.concat([attrition_yes_df.iloc[:train_yes_rows], attrition_no_df.iloc[:train_no_rows]])
small_test_df = pd.concat([attrition_yes_df.iloc[train_yes_rows:], attrition_no_df.iloc[train_no_rows:]])

# Every row must land in exactly one of the two splits.
assert len(small_train_df) + len(small_test_df) == len(attrition_yes_df) + len(attrition_no_df)

# Shuffle each split so the 'Yes' rows are no longer grouped ahead of the 'No' rows.
small_train_df = small_train_df.sample(frac=1, random_state=SHUFFLE_SEED)
small_test_df = small_test_df.sample(frac=1, random_state=SHUFFLE_SEED)

# Report the resulting sizes.
print("Small train df shape:", small_train_df.shape)
print("Small test df shape:", small_test_df.shape)

Small train df shape: (1172, 34)
Small test df shape: (504, 34)


In [None]:
# Absolute class counts in the small training split.
small_train_df['Attrition'].value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,1033
Yes,139


In [None]:
# Absolute class counts in the small test split.
small_test_df['Attrition'].value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,444
Yes,60


In [None]:
# Class proportions in the small training split (should match the test split).
small_train_df['Attrition'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Attrition,Unnamed: 1_level_1
No,0.881399
Yes,0.118601


In [None]:
# Class proportions in the small test split (should match the training split).
small_test_df['Attrition'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Attrition,Unnamed: 1_level_1
No,0.880952
Yes,0.119048


In [None]:
# Relabel the large training frame's columns whose names differ from the
# small dataset's spelling, so shared columns can be selected by one label.
train_label_map = {
    'Education Level': 'Education',
    'Distance from Home': 'DistanceFromHome',
    'Overtime': 'OverTime',
    'Work-Life Balance': 'WorkLifeBalance',
    'Years at Company': 'YearsAtCompany',
}
large_train_df = large_train_df.rename(columns=train_label_map)

In [None]:
# Apply the same relabelling to the large test frame so that it keeps the
# same column names as the large training frame.
test_label_map = {
    'Education Level': 'Education',
    'Distance from Home': 'DistanceFromHome',
    'Overtime': 'OverTime',
    'Work-Life Balance': 'WorkLifeBalance',
    'Years at Company': 'YearsAtCompany',
}
large_test_df = large_test_df.rename(test_label_map, axis='columns')

In [None]:
# Remove every space from the remaining column labels of the large frames
# (e.g. 'Job Role' becomes 'JobRole'). The mapping is rebuilt per frame from
# that frame's own labels.
new_column_names = {label: ''.join(label.split(' ')) for label in large_train_df.columns}
large_train_df = large_train_df.rename(columns=new_column_names)

new_column_names = {label: ''.join(label.split(' ')) for label in large_test_df.columns}
large_test_df = large_test_df.rename(columns=new_column_names)

In [None]:
# Inspect the large training frame's column labels after both renaming steps.
large_train_df.columns

Index(['Age', 'Gender', 'YearsAtCompany', 'JobRole', 'MonthlyIncome',
       'WorkLifeBalance', 'JobSatisfaction', 'PerformanceRating',
       'NumberofPromotions', 'OverTime', 'DistanceFromHome', 'Education',
       'MaritalStatus', 'NumberofDependents', 'JobLevel', 'CompanySize',
       'CompanyTenure', 'RemoteWork', 'LeadershipOpportunities',
       'InnovationOpportunities', 'CompanyReputation', 'EmployeeRecognition',
       'Attrition'],
      dtype='object')

In [None]:
# Columns selected from both the small and the large datasets (target
# 'Attrition' included). The order here is the column order of the *_common frames.
common_cols = [
    'Age',
    'Attrition',
    'Gender',
    'JobLevel',
    'MaritalStatus',
    'Education',
    'DistanceFromHome',
    'OverTime',
    'WorkLifeBalance',
    'MonthlyIncome',
    'JobRole',
    'JobSatisfaction',
    'PerformanceRating',
    'YearsAtCompany',
]
len(common_cols)

14

In [None]:
# Restrict all four frames to the shared columns, in one consistent order.
# The selection list is materialised once and reused for every frame.
selected_columns = list(common_cols)
small_train_df_common = small_train_df[selected_columns]
small_test_df_common = small_test_df[selected_columns]
large_train_df_common = large_train_df[selected_columns]
large_test_df_common = large_test_df[selected_columns]

In [None]:
# Map the large dataset's target labels ('Stayed'/'Left') onto the 'No'/'Yes'
# labels used by the small dataset.
attrition_labels = {'Stayed': 'No', 'Left': 'Yes'}
large_train_df = large_train_df.assign(Attrition=large_train_df["Attrition"].replace(attrition_labels))
large_test_df = large_test_df.assign(Attrition=large_test_df["Attrition"].replace(attrition_labels))

# The *_common frames were selected out of the large frames before this cell
# runs and are separate objects, so the relabelling above does not reach them.
# Relabel them too; otherwise the large *_common CSVs would be written with
# 'Stayed'/'Left' while the small ones use 'No'/'Yes'.
# `assign` returns a new frame, which avoids setting a column on a selection.
large_train_df_common = large_train_df_common.assign(Attrition=large_train_df_common["Attrition"].replace(attrition_labels))
large_test_df_common = large_test_df_common.assign(Attrition=large_test_df_common["Attrition"].replace(attrition_labels))

In [None]:
# Write the full-width splits to CSV in the working directory, without the
# pandas index column.
for frame, csv_name in (
    (small_train_df, 'small_train_df.csv'),
    (small_test_df, 'small_test_df.csv'),
    (large_train_df, 'large_train_df.csv'),
    (large_test_df, 'large_test_df.csv'),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Sanity check: the columns present in both small splits and in both large
# *_common frames. Note that this rebinds `common_cols` from a list to a set.
common_cols = set(small_train_df.columns) & set(small_test_df.columns) & set(large_train_df_common.columns) & set(large_test_df_common.columns)
# A set of strings has no stable iteration order across kernel restarts
# (string hashing is randomised per process), so print it sorted to keep the
# cell output reproducible.
print(sorted(common_cols))
print(len(common_cols))

['Gender', 'DistanceFromHome', 'JobSatisfaction', 'OverTime', 'WorkLifeBalance', 'Age', 'Education', 'PerformanceRating', 'Attrition', 'YearsAtCompany', 'JobRole', 'JobLevel', 'MaritalStatus', 'MonthlyIncome']
14


In [None]:
# Write the common-column versions of the four splits to CSV in the working
# directory, without the pandas index column.
for frame, csv_name in (
    (small_train_df_common, 'small_train_df_common.csv'),
    (small_test_df_common, 'small_test_df_common.csv'),
    (large_train_df_common, 'large_train_df_common.csv'),
    (large_test_df_common, 'large_test_df_common.csv'),
):
    frame.to_csv(csv_name, index=False)