enrollee_id : Unique ID for candidate.
city: City code.
city_development_index : Development index of the city (scaled).
gender: Gender of candidate
relevent_experience: Relevant experience of candidate
enrolled_university: Type of University course enrolled if any
education_level: Education level of candidate
major_discipline : Education major discipline of candidate
experience: Candidate total experience in years
company_size: Number of employees in current employer's company
company_type : Type of current employer
last_new_job: Difference in years between previous job and current job
training_hours: training hours completed
target: 0 – Not looking for job change, 1 – Looking for a job change

# Reading Data:

In [92]:
import pandas as pd

def read_file(file_path: str) -> pd.DataFrame:
    """
    Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The parsed table, read with the default ``pd.read_csv`` options.
    """
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

# Load the dataset; the relative path is resolved against the current working directory.
job_change_df = read_file('job_change.csv')

# Handling Nulls:

In [93]:
def count_null_values(data_frame):
    """
    Tally the missing entries of every column in a DataFrame.

    Returns a Series indexed by column name whose values are the number of
    null cells found in that column.
    """
    null_mask = data_frame.isna()
    per_column_totals = null_mask.sum()
    return per_column_totals

# Baseline: per-column null counts before any cleaning is applied.
count_null_values(job_change_df)

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [94]:
def count_unique_values(dataframe: pd.DataFrame, column_name: str) -> dict:
    """
    Build a frequency table for one column of a DataFrame.

    Args:
        dataframe: Table holding the column of interest.
        column_name: Name of the column to summarise.

    Returns:
        A dict mapping each distinct value in the column to the number of
        rows in which it occurs.
    """
    selected_column = dataframe[column_name]
    frequencies = selected_column.value_counts()
    return frequencies.to_dict()

In [95]:
import numpy as np

def fill_null_values_with_ratio(df: pd.DataFrame, column: str, random_state=None) -> pd.DataFrame:
    """
    Fill the nulls of one column by sampling from its observed value distribution.

    Every missing entry is replaced by a value drawn at random from the
    column's non-null values, weighted by their relative frequencies, so the
    column's proportions stay approximately unchanged.

    Args:
        df: DataFrame to clean. It is modified in place.
        column: Name of the column whose nulls are filled.
        random_state: Optional seed or ``numpy.random.Generator`` for
            reproducible draws. With ``None`` (the default) NumPy's global
            random state is used, so ``np.random.seed`` still applies.

    Returns:
        The same ``df`` object that was passed in, with no nulls left in
        ``column``.

    Raises:
        ValueError: If ``column`` contains nulls but has no non-null value to
            sample from.
    """
    # A positional boolean mask keeps the fill independent of the index labels
    # (a label-aligned fill silently skips rows when the index is not 0..n-1).
    null_mask = df[column].isna().to_numpy()
    n_missing = int(null_mask.sum())
    if n_missing == 0:
        return df

    proportions = df[column].value_counts(normalize=True)
    if proportions.empty:
        raise ValueError(
            f"Column {column!r} has no non-null values to sample replacements from."
        )

    if random_state is None:
        choose = np.random.choice
    else:
        choose = np.random.default_rng(random_state).choice

    # Draw only as many replacements as there are missing cells.
    replacements = choose(
        proportions.index.to_numpy(),
        size=n_missing,
        p=proportions.to_numpy(),
    )

    # Assign through df.loc so the frame itself is updated; a chained
    # ``df[column].fillna(..., inplace=True)`` is not guaranteed to write back.
    df.loc[null_mask, column] = replacements
    return df

#### Gender null cleaning:

In [96]:
# Frequency of each observed gender value before filling (nulls are not counted).
gender_counts = count_unique_values(job_change_df, 'gender')
print(gender_counts)

{'Male': 13221, 'Female': 1238, 'Other': 191}


In [113]:
# Replace missing genders by sampling from the observed gender distribution.
# NOTE: the helper returns the very DataFrame it was given, so
# job_change_df_cleaned_gender is another name for job_change_df, not a copy.
job_change_df_cleaned_gender = fill_null_values_with_ratio(job_change_df, 'gender')
# Re-count to confirm the proportions are roughly preserved after filling.
print(count_unique_values(job_change_df_cleaned_gender, 'gender'))

{'Male': 17277, 'Female': 1619, 'Other': 262}


#### University null cleaning:

In [98]:
# Frequency of each observed enrolled_university value before filling.
uni_counts = count_unique_values(job_change_df_cleaned_gender, 'enrolled_university')
print(uni_counts)

{'no_enrollment': 13817, 'Full time course': 3757, 'Part time course': 1198}


In [114]:
# Fill missing enrolled_university values by sampling from the observed distribution.
job_change_df_cleaned_uni = fill_null_values_with_ratio(job_change_df_cleaned_gender, 'enrolled_university')

# Re-count after filling to compare against the pre-fill distribution.
uni_counts_cleaned = count_unique_values(job_change_df_cleaned_uni, 'enrolled_university')
print(uni_counts_cleaned)

{'no_enrollment': 14102, 'Full time course': 3835, 'Part time course': 1221}


#### Education level null cleaning:

In [100]:
# Frequency of each observed education_level value before filling.
ed_lvl_counts = count_unique_values(job_change_df_cleaned_uni, 'education_level')
print(ed_lvl_counts)

{'Graduate': 11598, 'Masters': 4361, 'High School': 2017, 'Phd': 414, 'Primary School': 308}


In [115]:
# Fill missing education_level values by sampling from the observed distribution.
job_change_df_cleaned_ed_lvl = fill_null_values_with_ratio(job_change_df_cleaned_uni, 'education_level')

# Re-count after filling to compare against the pre-fill distribution.
ed_lvl_counts_cleaned = count_unique_values(job_change_df_cleaned_ed_lvl, 'education_level')
print(ed_lvl_counts_cleaned)

{'Graduate': 11883, 'Masters': 4470, 'High School': 2060, 'Phd': 430, 'Primary School': 315}


#### Major null cleaning:

In [102]:
# Frequency of each observed major_discipline value before filling.
major_counts = count_unique_values(job_change_df_cleaned_ed_lvl, 'major_discipline')
print(major_counts)

{'STEM': 14492, 'Humanities': 669, 'Other': 381, 'Business Degree': 327, 'Arts': 253, 'No Major': 223}


In [116]:
# Fill missing major_discipline values by sampling from the observed distribution.
job_change_df_cleaned_major = fill_null_values_with_ratio(job_change_df_cleaned_ed_lvl, 'major_discipline')

# Re-count after filling to compare against the pre-fill distribution.
major_counts_cleaned = count_unique_values(job_change_df_cleaned_major, 'major_discipline')
print(major_counts_cleaned)

{'STEM': 17006, 'Humanities': 790, 'Other': 443, 'Business Degree': 372, 'Arts': 291, 'No Major': 256}


#### Experience null cleaning:

In [104]:
# Frequency of each observed experience value before filling.
# The printed keys are strings, including the open-ended buckets '>20' and '<1'.
exp_counts = count_unique_values(job_change_df_cleaned_major, 'experience')
print(exp_counts)

{'>20': 3286, '5': 1430, '4': 1403, '3': 1354, '6': 1216, '2': 1127, '7': 1028, '10': 985, '9': 980, '8': 802, '15': 686, '11': 664, '14': 586, '1': 549, '<1': 522, '16': 508, '12': 494, '13': 399, '17': 342, '19': 304, '18': 280, '20': 148}


In [117]:
# Fill missing experience values by sampling from the observed distribution.
job_change_df_cleaned_exp = fill_null_values_with_ratio(job_change_df_cleaned_major, 'experience')

# Re-count after filling to compare against the pre-fill distribution.
exp_counts_cleaned = count_unique_values(job_change_df_cleaned_exp, 'experience')
print(exp_counts_cleaned)

{'>20': 3295, '5': 1436, '4': 1410, '3': 1358, '6': 1219, '2': 1133, '7': 1030, '10': 988, '9': 981, '8': 804, '15': 688, '11': 668, '14': 589, '1': 552, '<1': 523, '16': 509, '12': 496, '13': 402, '17': 345, '19': 304, '18': 280, '20': 148}


#### Company size null cleaning:

In [106]:
# Frequency of each observed company_size bucket before filling.
# NOTE(review): the '10/49' key looks like a mis-encoded '10-49' range in the raw data — confirm.
compsize_counts = count_unique_values(job_change_df_cleaned_exp, 'company_size')
print(compsize_counts)

{'50-99': 3083, '100-500': 2571, '10000+': 2019, '10/49': 1471, '1000-4999': 1328, '<10': 1308, '500-999': 877, '5000-9999': 563}


In [118]:
# Fill missing company_size values by sampling from the observed distribution.
job_change_df_cleaned_compsize = fill_null_values_with_ratio(job_change_df_cleaned_exp, 'company_size')

# Re-count after filling to compare against the pre-fill distribution.
compsize_counts_cleaned = count_unique_values(job_change_df_cleaned_compsize, 'company_size')
print(compsize_counts_cleaned)

{'50-99': 4514, '100-500': 3681, '10000+': 2935, '10/49': 2130, '1000-4999': 1947, '<10': 1874, '500-999': 1252, '5000-9999': 825}


#### Company type null cleaning:

In [108]:
# Frequency of each observed company_type value before filling.
comptype_counts = count_unique_values(job_change_df_cleaned_compsize, 'company_type')
print(comptype_counts)

{'Pvt Ltd': 9817, 'Funded Startup': 1001, 'Public Sector': 955, 'Early Stage Startup': 603, 'NGO': 521, 'Other': 121}


In [119]:
# Fill missing company_type values by sampling from the observed distribution.
job_change_df_cleaned_comptype = fill_null_values_with_ratio(job_change_df_cleaned_compsize, 'company_type')

# Re-count after filling to compare against the pre-fill distribution.
comptype_counts_cleaned = count_unique_values(job_change_df_cleaned_comptype, 'company_type')
print(comptype_counts_cleaned)

{'Pvt Ltd': 14444, 'Funded Startup': 1497, 'Public Sector': 1427, 'Early Stage Startup': 851, 'NGO': 765, 'Other': 174}


#### Last new job null cleaning:

In [110]:
# Frequency of each observed last_new_job value before filling.
last_job_counts = count_unique_values(job_change_df_cleaned_comptype, 'last_new_job')
print(last_job_counts)

{'1': 8040, '>4': 3290, '2': 2900, 'never': 2452, '4': 1029, '3': 1024}


In [120]:
# Fill missing last_new_job values by sampling from the observed distribution.
# Chain from the company_type stage (the immediately preceding step) rather
# than the older company_size stage, so the pipeline stays correct even if the
# fill helper ever returns a new DataFrame instead of the one it was given.
job_change_df_cleaned_nulls = fill_null_values_with_ratio(job_change_df_cleaned_comptype, 'last_new_job')

# Re-count after filling to compare against the pre-fill distribution.
nulls_counts_cleaned = count_unique_values(job_change_df_cleaned_nulls, 'last_new_job')
print(nulls_counts_cleaned)

{'1': 8204, '>4': 3368, '2': 2970, 'never': 2514, '3': 1053, '4': 1049}


In [121]:
# Final check: every column should now report zero nulls.
count_null_values(job_change_df_cleaned_nulls)

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

# Encoding: