In [None]:
import pandas as pd
import plotly.express as px

# EDA Small

In [None]:
# Read the small dataset's train and test splits from /content.
small_train_df, small_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/small_train_df.csv', '/content/small_test_df.csv')
)

In [None]:
# Stack the train and test rows into one frame for EDA.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both splits are kept, so the same label appears more than once.
small_combined_df = pd.concat([small_train_df, small_test_df], axis=0, ignore_index=True)

In [None]:
import plotly.express as px

# One bar chart per feature: row counts for every (feature value, Attrition) pair.
attrition_palette = {'No': 'skyblue', 'Yes': 'Purple'}
feature_names = [name for name in small_combined_df.columns if name != 'Attrition']

for feature in feature_names:
  pair_counts = (
      small_combined_df
      .groupby([feature, 'Attrition'])
      .size()
      .reset_index(name='Count')
  )
  fig = px.bar(pair_counts, x=feature, y='Count', color='Attrition', title=feature,
               width=500, height=400, color_discrete_map=attrition_palette)
  fig.show()

In [None]:
# Box plot of every numeric feature, split by Attrition.
numeric_features = [
    name for name in small_combined_df.columns
    if name != 'Attrition' and pd.api.types.is_numeric_dtype(small_combined_df[name])
]

for feature in numeric_features:
  fig = px.box(small_combined_df, x='Attrition', y=feature, color='Attrition',
               height=300, width=400,
               color_discrete_map={'No': 'skyblue', 'Yes': 'Purple'})
  fig.show()

# Preprocessing Small

In [None]:
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df, reference_columns=None):
  """One-hot encode the object columns of `df` and return an all-float copy.

  Args:
    df: Input frame; it is not modified. An 'Attrition' column, if present,
      is expected to hold 'No'/'Yes' labels.
    reference_columns: Optional sequence of column names, e.g. the columns of
      an already-encoded training frame. When given, the result is reindexed
      to exactly these columns in this order: listed columns that are missing
      are added and filled with 0.0, and columns not listed are dropped.
      With the default (None) the columns are returned as produced.

  Returns:
    A new DataFrame in which every object column except 'Attrition' is
    replaced by one indicator column per category, 'Attrition' (when present)
    is mapped to No -> 0 / Yes -> 1 (any other label becomes NaN), and all
    columns are cast to float. When at least one column was encoded, the row
    index is reset to 0..n-1.

  Note:
    A new encoder is fitted on every call, so the indicator columns depend on
    the categories present in `df`. Pass `reference_columns` to give a second
    frame (e.g. a test split) the same columns as the first.
  """
  # Work on a copy so the caller's frame is left untouched.
  df_encoded = df.copy()

  # Object-dtype columns are treated as categorical; the target is handled
  # separately below and must not be one-hot encoded.
  categorical_columns = df.select_dtypes(include=['object']).columns
  if 'Attrition' in categorical_columns:
    categorical_columns = categorical_columns.drop('Attrition')

  if len(categorical_columns) > 0:
    enc = OneHotEncoder(handle_unknown='ignore')
    encoded_features = enc.fit_transform(df[categorical_columns]).toarray()
    feature_names = enc.get_feature_names_out(categorical_columns)
    encoded_df = pd.DataFrame(encoded_features, columns=feature_names)

    # encoded_df carries a fresh 0..n-1 index, so give df_encoded the same one
    # before the column-wise concat; otherwise rows would be matched by label.
    df_encoded = df_encoded.reset_index(drop=True).drop(categorical_columns, axis=1)
    df_encoded = pd.concat([df_encoded, encoded_df], axis=1)

  # Frames without the target (e.g. unlabeled data) are passed through as-is.
  if 'Attrition' in df_encoded.columns:
    df_encoded['Attrition'] = df_encoded['Attrition'].map({'No': 0, 'Yes': 1})
  df_encoded = df_encoded.astype(float)

  if reference_columns is not None:
    df_encoded = df_encoded.reindex(columns=list(reference_columns), fill_value=0.0)
  return df_encoded

In [None]:
# Encode both small splits.
# NOTE(review): each call fits its own one-hot encoder, so the two results can
# end up with different indicator columns when a category occurs in only one split.
small_train_df_encoded, small_test_df_encoded = (
    preprocess_data(split) for split in (small_train_df, small_test_df)
)

In [None]:
# prompt: what are the different columns for small_train_df_encoded and small_test_df_encoded

# List the column names of both encoded frames so they can be compared by eye.
small_train_columns = small_train_df_encoded.columns.tolist()
small_test_columns = small_test_df_encoded.columns.tolist()

print("Columns in small_train_df_encoded:", small_train_columns, sep="\n")
print("\nColumns in small_test_df_encoded:", small_test_columns, sep="\n")


Columns in small_train_df_encoded:
['Age', 'Attrition', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'Shift', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 'Department_Cardiology', 'Department_Maternity', 'Department_Neurology', 'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male', 'JobRole_Administrative', 'JobRole_Nurse', 'JobRole_Other', 'JobRole_Therapist', 'MaritalStatus_Divorced

In [None]:
# prompt: add a column in small_train_df_encoded named 'JobRole_Admin' beside 'Gender_Male' and put the values as 0.0

# Add an all-zero 'JobRole_Admin' indicator directly after 'Gender_Male'.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if 'JobRole_Admin' not in small_train_df_encoded.columns:
  small_train_df_encoded.insert(
      small_train_df_encoded.columns.get_loc('Gender_Male') + 1, 'JobRole_Admin', 0.0)


In [None]:
# Write both encoded small splits to CSV without the row index.
for _encoded_frame, _csv_name in (
    (small_train_df_encoded, 'small_train_df_encoded.csv'),
    (small_test_df_encoded, 'small_test_df_encoded.csv'),
):
  _encoded_frame.to_csv(_csv_name, index=False)

# EDA Large

In [None]:
# Read the large dataset's train and test splits from /content.
large_train_df, large_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/large_train_df.csv', '/content/large_test_df.csv')
)

In [None]:
# Stack the train and test rows into one frame for EDA.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both splits are kept, so the same label appears more than once.
large_combined_df = pd.concat([large_train_df, large_test_df], axis=0, ignore_index=True)

In [None]:
# One bar chart per feature: row counts for every (feature value, Attrition) pair.
for feature in large_combined_df.columns:
  if feature == 'Attrition':
    continue  # the target is the colour dimension, not a feature to plot
  pair_counts = (
      large_combined_df
      .groupby([feature, 'Attrition'])
      .size()
      .reset_index(name='Count')
  )
  fig = px.bar(pair_counts, x=feature, y='Count', color='Attrition',
               title=feature, width=500, height=400)
  fig.show()

In [None]:
# Box plot of every numeric feature, split by Attrition.
for feature in large_combined_df.columns:
  if feature == 'Attrition':
    continue
  if not pd.api.types.is_numeric_dtype(large_combined_df[feature]):
    continue  # box plots only make sense for numeric columns
  fig = px.box(large_combined_df, x='Attrition', y=feature, color='Attrition',
               height=300, width=400)
  fig.show()

# Preprocessing Large

## Work-Life Balance

In [None]:
# Distinct WorkLifeBalance labels in the large training split.
pd.unique(large_train_df['WorkLifeBalance'])

array(['Good', 'Poor', 'Fair', 'Excellent'], dtype=object)

In [None]:
# Distinct WorkLifeBalance codes in the small training split, for comparison.
pd.unique(small_train_df['WorkLifeBalance'])

array([1, 2, 3, 4])

In [None]:
# Encode the WorkLifeBalance labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
work_life_balance_codes = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Excellent': 4}
large_train_df['WorkLifeBalance'] = large_train_df['WorkLifeBalance'].map(work_life_balance_codes)

In [None]:
# Encode the test split's WorkLifeBalance labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
work_life_balance_codes = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Excellent': 4}
large_test_df['WorkLifeBalance'] = large_test_df['WorkLifeBalance'].map(work_life_balance_codes)

## Job Satisfaction

In [None]:
# Distinct JobSatisfaction labels in the large training split.
pd.unique(large_train_df['JobSatisfaction'])

array(['High', 'Medium', 'Very High', 'Low'], dtype=object)

In [None]:
# Distinct JobSatisfaction codes in the small training split, for comparison.
pd.unique(small_train_df['JobSatisfaction'])

array([1, 4, 2, 3])

In [None]:
# Encode the JobSatisfaction labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
job_satisfaction_codes = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
large_train_df['JobSatisfaction'] = large_train_df['JobSatisfaction'].map(job_satisfaction_codes)

In [None]:
# Encode the test split's JobSatisfaction labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
job_satisfaction_codes = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
large_test_df['JobSatisfaction'] = large_test_df['JobSatisfaction'].map(job_satisfaction_codes)

## Performance Rating

In [None]:
# Distinct PerformanceRating labels in the large training split.
pd.unique(large_train_df['PerformanceRating'])

array(['Low', 'Average', 'High', 'Below Average'], dtype=object)

In [None]:
# Distinct PerformanceRating codes in the small training split, for comparison.
pd.unique(small_train_df['PerformanceRating'])

array([3, 4])

In [None]:
# Map the four PerformanceRating labels onto the codes 3 and 4.
# NOTE(review): 'Low'/'Below Average' share code 3 and 'Average'/'High' share
# code 4, so the distinction inside each pair is lost - confirm this is intended.
# Not re-runnable: a second pass would map the numeric codes to NaN.
performance_rating_codes = {'Low': 3, 'Below Average': 3, 'Average': 4, 'High': 4}
large_train_df['PerformanceRating'] = large_train_df['PerformanceRating'].map(performance_rating_codes)

In [None]:
# Map the test split's four PerformanceRating labels onto the codes 3 and 4.
# NOTE(review): 'Low'/'Below Average' share code 3 and 'Average'/'High' share
# code 4, so the distinction inside each pair is lost - confirm this is intended.
# Not re-runnable: a second pass would map the numeric codes to NaN.
performance_rating_codes = {'Low': 3, 'Below Average': 3, 'Average': 4, 'High': 4}
large_test_df['PerformanceRating'] = large_test_df['PerformanceRating'].map(performance_rating_codes)

## Education Level

In [None]:
# Distinct Education labels in the large training split.
pd.unique(large_train_df['Education'])

array(['Bachelor’s Degree', 'PhD', 'Master’s Degree', 'Associate Degree',
       'High School'], dtype=object)

In [None]:
# Distinct Education codes in the small training split, for comparison.
pd.unique(small_train_df['Education'])

array([3, 4, 2, 1, 5])

In [None]:
# Encode the Education labels as ordinal codes 1-5.
# The degree labels use a typographic apostrophe (’), matching the raw data.
# Not re-runnable: a second pass would map the numeric codes to NaN.
education_codes = {
    'High School': 1,
    'Associate Degree': 2,
    'Bachelor’s Degree': 3,
    'Master’s Degree': 4,
    'PhD': 5,
}
large_train_df['Education'] = large_train_df['Education'].map(education_codes)

In [None]:
# Encode the test split's Education labels as ordinal codes 1-5.
# The degree labels use a typographic apostrophe (’), matching the raw data.
# Not re-runnable: a second pass would map the numeric codes to NaN.
education_codes = {
    'High School': 1,
    'Associate Degree': 2,
    'Bachelor’s Degree': 3,
    'Master’s Degree': 4,
    'PhD': 5,
}
large_test_df['Education'] = large_test_df['Education'].map(education_codes)

## Job Level

In [None]:
# Distinct JobLevel labels in the large training split.
pd.unique(large_train_df['JobLevel'])

array(['Mid', 'Entry', 'Senior'], dtype=object)

In [None]:
# Distinct JobLevel codes in the small training split, for comparison.
pd.unique(small_train_df['JobLevel'])

array([2, 1, 5, 3, 4])

In [None]:
# Encode the large dataset's three job-level labels as 1-3.
# Not re-runnable: a second pass would map the numeric codes to NaN.
large_job_level_codes = {'Entry': 1, 'Mid': 2, 'Senior': 3}
large_train_df['JobLevel'] = large_train_df['JobLevel'].map(large_job_level_codes)

# Collapse the small dataset's five levels onto the same 1-3 scale.
# NOTE(review): small_train_df was already encoded earlier in the notebook, so
# small_train_df_encoded still carries the original levels. This also
# overwrites the column in place, so running the cell a second time remaps the
# already-collapsed values (e.g. 3 -> 2).
small_job_level_codes = {1: 1, 2: 1, 3: 2, 4: 3, 5: 3}
small_train_df['JobLevel'] = small_train_df['JobLevel'].map(small_job_level_codes)

In [None]:
# Encode the large test split's three job-level labels as 1-3.
# Not re-runnable: a second pass would map the numeric codes to NaN.
large_job_level_codes = {'Entry': 1, 'Mid': 2, 'Senior': 3}
large_test_df['JobLevel'] = large_test_df['JobLevel'].map(large_job_level_codes)

# Collapse the small test split's five levels onto the same 1-3 scale.
# NOTE(review): small_test_df was already encoded earlier in the notebook, so
# small_test_df_encoded still carries the original levels. This also
# overwrites the column in place, so running the cell a second time remaps the
# already-collapsed values (e.g. 3 -> 2).
small_job_level_codes = {1: 1, 2: 1, 3: 2, 4: 3, 5: 3}
small_test_df['JobLevel'] = small_test_df['JobLevel'].map(small_job_level_codes)

## Company Reputation

In [None]:
# Distinct CompanyReputation labels in the large training split.
pd.unique(large_train_df['CompanyReputation'])

array(['Poor', 'Good', 'Fair', 'Excellent'], dtype=object)

In [None]:
# Encode the CompanyReputation labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
company_reputation_codes = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Excellent': 4}
large_train_df['CompanyReputation'] = large_train_df['CompanyReputation'].map(company_reputation_codes)

In [None]:
# Encode the test split's CompanyReputation labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
company_reputation_codes = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Excellent': 4}
large_test_df['CompanyReputation'] = large_test_df['CompanyReputation'].map(company_reputation_codes)

## Employee Recognition

In [None]:
# Distinct EmployeeRecognition labels in the large training split.
pd.unique(large_train_df['EmployeeRecognition'])

array(['Low', 'Medium', 'High', 'Very High'], dtype=object)

In [None]:
# Encode the EmployeeRecognition labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
employee_recognition_codes = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
large_train_df['EmployeeRecognition'] = large_train_df['EmployeeRecognition'].map(employee_recognition_codes)

In [None]:
# Encode the test split's EmployeeRecognition labels as ordinal codes 1-4.
# Not re-runnable: a second pass would map the numeric codes to NaN.
employee_recognition_codes = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
large_test_df['EmployeeRecognition'] = large_test_df['EmployeeRecognition'].map(employee_recognition_codes)

In [None]:
# Encode both large splits.
# NOTE(review): each call fits its own one-hot encoder, so the two results can
# end up with different indicator columns when a category occurs in only one split.
large_train_df_encoded, large_test_df_encoded = (
    preprocess_data(split) for split in (large_train_df, large_test_df)
)

In [None]:
# List the column names of both encoded frames so they can be compared by eye.
large_train_columns = large_train_df_encoded.columns.tolist()
large_test_columns = large_test_df_encoded.columns.tolist()

print("Columns in large_train_df_encoded:", large_train_columns, sep="\n")
print("\nColumns in large_test_df_encoded:", large_test_columns, sep="\n")

Columns in large_train_df_encoded:
['Age', 'YearsAtCompany', 'MonthlyIncome', 'WorkLifeBalance', 'JobSatisfaction', 'PerformanceRating', 'NumberofPromotions', 'DistanceFromHome', 'Education', 'NumberofDependents', 'JobLevel', 'CompanyTenure', 'CompanyReputation', 'EmployeeRecognition', 'Attrition', 'Gender_Female', 'Gender_Male', 'JobRole_Healthcare', 'OverTime_No', 'OverTime_Yes', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'CompanySize_Large', 'CompanySize_Medium', 'CompanySize_Small', 'RemoteWork_No', 'RemoteWork_Yes', 'LeadershipOpportunities_No', 'LeadershipOpportunities_Yes', 'InnovationOpportunities_No', 'InnovationOpportunities_Yes']

Columns in large_test_df_encoded:
['Age', 'YearsAtCompany', 'MonthlyIncome', 'WorkLifeBalance', 'JobSatisfaction', 'PerformanceRating', 'NumberofPromotions', 'DistanceFromHome', 'Education', 'NumberofDependents', 'JobLevel', 'CompanyTenure', 'CompanyReputation', 'EmployeeRecognition', 'Attrition', 'Gender_Female', 'G

In [None]:
# Write both encoded large splits to CSV without the row index.
for _encoded_frame, _csv_name in (
    (large_train_df_encoded, 'large_train_df_encoded.csv'),
    (large_test_df_encoded, 'large_test_df_encoded.csv'),
):
  _encoded_frame.to_csv(_csv_name, index=False)