# Reading Data:

In [1]:
import pandas as pd

def read_file(file_path: str) -> pd.DataFrame:
    """
    Reads a CSV file through a given file path.
    """
    return pd.read_csv(file_path)

job_change_df = read_file('job_change.csv')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Handling Nulls:

In [2]:
def count_null_values(df: pd.DataFrame) -> pd.Series:
    """
    Counts the number of nulls in each column of a given DataFrame.
    """
    return df.isnull().sum()

count_null_values(job_change_df)

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [3]:
def count_unique_values(df: pd.DataFrame, column_name: str) -> dict:
    """
    Counts the number of occurrences of each, unique value in a specified column of a given DataFrame.
    """
    unique_counts = df[column_name].value_counts().to_dict()
    return unique_counts

In [4]:
import numpy as np

def fill_null_values_with_ratio(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Fills the null values in a specified column of a given DataFrame based on the proportions of non-nulls.
    """
    proportions = df[column].value_counts(normalize = True)
    df[column].fillna(pd.Series(np.random.choice(proportions.index,
                                                 p = proportions.values,
                                                 size = len(df))),
                                                 inplace = True)
    return df

#### Gender null cleaning:

In [5]:
gender_counts = count_unique_values(job_change_df, 'gender')
print(gender_counts)

{'Male': 13221, 'Female': 1238, 'Other': 191}


In [6]:
job_change_df_cleaned_gender = fill_null_values_with_ratio(job_change_df, 'gender')
print(count_unique_values(job_change_df_cleaned_gender, 'gender'))

{'Male': 17258, 'Female': 1655, 'Other': 245}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### University null cleaning:

In [7]:
uni_counts = count_unique_values(job_change_df_cleaned_gender, 'enrolled_university')
print(uni_counts)

{'no_enrollment': 13817, 'Full time course': 3757, 'Part time course': 1198}


In [8]:
job_change_df_cleaned_uni = fill_null_values_with_ratio(job_change_df_cleaned_gender, 'enrolled_university')

uni_counts_cleaned = count_unique_values(job_change_df_cleaned_uni, 'enrolled_university')
print(uni_counts_cleaned)

{'no_enrollment': 14099, 'Full time course': 3831, 'Part time course': 1228}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### Education level null cleaning:

In [9]:
ed_lvl_counts = count_unique_values(job_change_df_cleaned_uni, 'education_level')
print(ed_lvl_counts)

{'Graduate': 11598, 'Masters': 4361, 'High School': 2017, 'Phd': 414, 'Primary School': 308}


In [10]:
job_change_df_cleaned_ed_lvl = fill_null_values_with_ratio(job_change_df_cleaned_uni, 'education_level')

ed_lvl_counts_cleaned = count_unique_values(job_change_df_cleaned_ed_lvl, 'education_level')
print(ed_lvl_counts_cleaned)

{'Graduate': 11868, 'Masters': 4473, 'High School': 2067, 'Phd': 431, 'Primary School': 319}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### Major null cleaning:

In [11]:
major_counts = count_unique_values(job_change_df_cleaned_ed_lvl, 'major_discipline')
print(major_counts)

{'STEM': 14492, 'Humanities': 669, 'Other': 381, 'Business Degree': 327, 'Arts': 253, 'No Major': 223}


In [12]:
job_change_df_cleaned_major = fill_null_values_with_ratio(job_change_df_cleaned_ed_lvl, 'major_discipline')

major_counts_cleaned = count_unique_values(job_change_df_cleaned_major, 'major_discipline')
print(major_counts_cleaned)

{'STEM': 16998, 'Humanities': 774, 'Other': 449, 'Business Degree': 385, 'Arts': 295, 'No Major': 257}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### Experience null cleaning:

In [13]:
exp_counts = count_unique_values(job_change_df_cleaned_major, 'experience')
print(exp_counts)

{'>20': 3286, '5': 1430, '4': 1403, '3': 1354, '6': 1216, '2': 1127, '7': 1028, '10': 985, '9': 980, '8': 802, '15': 686, '11': 664, '14': 586, '1': 549, '<1': 522, '16': 508, '12': 494, '13': 399, '17': 342, '19': 304, '18': 280, '20': 148}


In [14]:
job_change_df_cleaned_exp = fill_null_values_with_ratio(job_change_df_cleaned_major, 'experience')

exp_counts_cleaned = count_unique_values(job_change_df_cleaned_exp, 'experience')
print(exp_counts_cleaned)

{'>20': 3300, '5': 1434, '4': 1405, '3': 1363, '6': 1217, '2': 1133, '7': 1030, '10': 989, '9': 981, '8': 805, '15': 687, '11': 665, '14': 588, '1': 552, '<1': 522, '16': 511, '12': 496, '13': 401, '17': 345, '19': 304, '18': 282, '20': 148}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### Company size null cleaning:

In [15]:
compsize_counts = count_unique_values(job_change_df_cleaned_exp, 'company_size')
print(compsize_counts)

{'50-99': 3083, '100-500': 2571, '10000+': 2019, '10/49': 1471, '1000-4999': 1328, '<10': 1308, '500-999': 877, '5000-9999': 563}


In [16]:
job_change_df_cleaned_compsize = fill_null_values_with_ratio(job_change_df_cleaned_exp, 'company_size')

compsize_counts_cleaned = count_unique_values(job_change_df_cleaned_compsize, 'company_size')
print(compsize_counts_cleaned)

{'50-99': 4450, '100-500': 3723, '10000+': 2921, '10/49': 2103, '1000-4999': 1951, '<10': 1939, '500-999': 1243, '5000-9999': 828}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### Company type null cleaning:

In [17]:
comptype_counts = count_unique_values(job_change_df_cleaned_compsize, 'company_type')
print(comptype_counts)

{'Pvt Ltd': 9817, 'Funded Startup': 1001, 'Public Sector': 955, 'Early Stage Startup': 603, 'NGO': 521, 'Other': 121}


In [18]:
job_change_df_cleaned_comptype = fill_null_values_with_ratio(job_change_df_cleaned_compsize, 'company_type')

comptype_counts_cleaned = count_unique_values(job_change_df_cleaned_comptype, 'company_type')
print(comptype_counts_cleaned)

{'Pvt Ltd': 14485, 'Funded Startup': 1464, 'Public Sector': 1404, 'Early Stage Startup': 879, 'NGO': 752, 'Other': 174}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


#### Last new job null cleaning:

In [19]:
last_job_counts = count_unique_values(job_change_df_cleaned_comptype, 'last_new_job')
print(last_job_counts)

{'1': 8040, '>4': 3290, '2': 2900, 'never': 2452, '4': 1029, '3': 1024}


In [20]:
job_change_df_cleaned_nulls = fill_null_values_with_ratio(job_change_df_cleaned_compsize, 'last_new_job')

nulls_counts_cleaned = count_unique_values(job_change_df_cleaned_nulls, 'last_new_job')
print(nulls_counts_cleaned)

{'1': 8214, '>4': 3367, '2': 2955, 'never': 2511, '4': 1062, '3': 1049}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(pd.Series(np.random.choice(proportions.index,


In [21]:
count_null_values(job_change_df_cleaned_nulls)

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

# Encoding:

In [22]:
from sklearn.preprocessing import LabelEncoder

def label_encode(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Label encodes specified column of a given DataFrame.
    """
    encoder = LabelEncoder()
    encoded_column = encoder.fit_transform(df[column_name])
    df[column_name] = encoded_column
    return df

In [23]:
def ordinal_map_encode(df: pd.DataFrame, column_name: str, ordinal_mapping: dict[str, int]) -> pd.DataFrame:
    """
    Ordinally maps specified column of a given DataFrame based on the provided mapping.
    """
    df[column_name] = df[column_name].map(ordinal_mapping)
    return df

"enrolee_id" (or the unique "tag" of each data row) will not be encoded, given it is, at best, a confounding, feature for my models, and at worst a perfect predictor of the outcome.

"city" will be label encoded, as it is categorical—neccesitating conversion to integer values—but not ordinal.

In [24]:
job_change_df_city_encoded = label_encode(job_change_df_cleaned_nulls, 'city')

"city_development_index" is already integered AND normalized between 0 and 1, hooray!

gender will be label encoded, as it is categorical—neccesitating conversion to integer values—but not ordinal.

In [25]:
job_change_df_gender_encoded = label_encode(job_change_df_city_encoded, 'gender')

"relevant_experience" will be ordinally encoded, with "No relevent experience" mapped to 0 and "Has relevent experience" mapped to 1, given the latter has a higher status than the former.

In [26]:
relexp_mapping = {'No relevent experience': 0, 'Has relevent experience': 1}

job_change_df_relexp_encoded = ordinal_map_encode(job_change_df_gender_encoded, 'relevent_experience', relexp_mapping)

"enrolled_university" will be label encoded, because even if an order pops out between "no enrollment", "part time course", and "full time course" at face value, all these options are equally valid living situations; there is no universal status of one over any other.

In [27]:
job_change_df_uni_encoded = label_encode(job_change_df_relexp_encoded, 'enrolled_university')

"education_level", on the other hand, will be ordinally mapped, given a "PhD" represents more time and bears more weight than graduation from "primary school", and the preceding grade levels.

In [28]:
degree_mapping = {'Primary School': 0, 'High School': 1, 'Graduate': 2, 'Masters': 3, 'Phd': 4}

job_change_df_degree_encoded = ordinal_map_encode(job_change_df_uni_encoded, 'education_level', degree_mapping)

"major_discipline" will be label encoded, as it is categorical—neccesitating conversion to integer values—but not ordinal.

In [29]:
job_change_df_major_encoded = label_encode(job_change_df_degree_encoded, 'major_discipline')

"experience" (measured in years worked at the job) will be ordinally mapped, given the natural order of time places the more experienced above the less experienced, on a scale.

In [30]:
timeexp_mapping = {'<1': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10, '11': 1,
                   '12': 12, '13': 13, '14': 14, '15': 15,'16': 16,'17': 17, '18': 18, '19': 19, '20': 20, '>20': 21}

job_change_df_timeexp_encoded = ordinal_map_encode(job_change_df_major_encoded, 'experience', timeexp_mapping)

"company_size" will be ordinally mapped for a very similar reason, because the natural order of population places a given, larger group above a smaller group, on a scale.

In [31]:
compsize_mapping = {'<10': 0, '10/49': 1, '50-99': 2, '100-500': 3, '500-999': 4, '1000-4999': 5, '5000-9999': 6, '10000+': 7}

job_change_df_compsize_encoded = ordinal_map_encode(job_change_df_timeexp_encoded, 'company_size', compsize_mapping)

"company_type" will be label encoded, as it is categorical—neccesitating conversion to integer values—but not ordinal.

In [32]:
job_change_df_comptype_encoded = label_encode(job_change_df_compsize_encoded, 'company_type')

"last_new_job" (measured in years since leaving most previous job and entering the current role) will be ordinally mapped, under the same logic as "experience".

In [33]:
lastjob_mapping = {'never': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5}

job_change_df_encoded = ordinal_map_encode(job_change_df_comptype_encoded, 'last_new_job', lastjob_mapping)
job_change_df_encoded

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,5,0.920,1,1,2,2,5,21,3,5,1,36,1.0
1,29725,77,0.776,1,0,2,2,5,15,2,5,5,47,0.0
2,11561,64,0.624,1,0,0,2,5,5,7,5,0,83,0.0
3,33241,14,0.789,1,0,2,2,1,0,3,5,0,52,1.0
4,666,50,0.767,1,1,2,3,5,21,2,1,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,55,0.878,1,0,2,2,2,14,7,5,1,42,1.0
19154,31398,5,0.920,1,1,2,2,5,14,0,5,4,52,1.0
19155,24576,5,0.920,1,1,2,2,5,21,2,5,4,44,0.0
19156,5756,94,0.802,1,1,2,1,5,0,4,5,2,97,0.0


# Standardization:

Given we're predicting a binary outcome, the below ordinal columns will now need to be normalized:
1. 'relevent_experience'
2. 'education_level'
3. 'experience'
4. 'company_size'
5. 'last_new_job'
6. 'training_hours'

In [34]:
from sklearn.preprocessing import StandardScaler

def standardize_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Standardizes specified columns of a given DataFrame with StandardScaler.
    """
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

standardization_cols = ['relevent_experience', 'education_level', 'experience', 'company_size', 'last_new_job', 'training_hours']
job_change_df_standardized = standardize_columns(job_change_df_encoded, standardization_cols)
job_change_df_standardized

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,5,0.920,1,0.623752,2,-0.198328,5,1.611741,-0.115083,5,-0.597600,-0.488985,1.0
1,29725,77,0.776,1,-1.603202,2,-0.198328,5,0.751757,-0.570610,5,1.788069,-0.305825,0.0
2,11561,64,0.624,1,-1.603202,0,-0.198328,5,-0.681550,1.707028,5,-1.194018,0.293607,0.0
3,33241,14,0.789,1,-1.603202,2,-0.198328,1,-1.398204,-0.115083,5,-1.194018,-0.222571,1.0
4,666,50,0.767,1,0.623752,2,1.246375,5,1.611741,-0.570610,1,1.191652,-0.955209,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,55,0.878,1,-1.603202,2,-0.198328,2,0.608426,1.707028,5,-0.597600,-0.389079,1.0
19154,31398,5,0.920,1,0.623752,2,-0.198328,5,0.608426,-1.481666,5,1.191652,-0.222571,1.0
19155,24576,5,0.920,1,0.623752,2,-0.198328,5,1.611741,-0.570610,5,1.191652,-0.355778,0.0
19156,5756,94,0.802,1,0.623752,2,-1.643031,5,-1.398204,0.340445,5,-0.001183,0.526719,0.0


# Typographical Corrections:

Finally, let's fix the columns' grammatical errors, and make their titles both simpler and more representative of their data:

In [35]:
import pandas as pd

def rename_column(df: pd.DataFrame, old_name: str, new_name: str):
    """
    Rename a specified column in a DataFrame from an old name to a new name.
    """
    df.rename(columns = {old_name: new_name}, inplace = True)
    return df

job_change_df_id_change = rename_column(job_change_df_standardized, 'enrollee_id', 'id')
job_change_df_citydev_change = rename_column(job_change_df_id_change, 'city_development_index', 'city_dev_index')
job_change_df_relexp_change = rename_column(job_change_df_citydev_change, 'relevent_experience', 'has_relevant_exp')
job_change_school_change = rename_column(job_change_df_relexp_change, 'enrolled_university', 'in_school')
job_change_df_degree_change = rename_column(job_change_school_change, 'education_level', 'degree_lvl')
job_change_df_exp_change = rename_column(job_change_df_degree_change, 'experience', 'yrs_exp')
job_change_df_cleaned = rename_column(job_change_df_exp_change, 'last_new_job', 'yrs_since_last_job')
job_change_df_cleaned

Unnamed: 0,id,city,city_dev_index,gender,has_relevant_exp,in_school,degree_lvl,major_discipline,yrs_exp,company_size,company_type,yrs_since_last_job,training_hours,target
0,8949,5,0.920,1,0.623752,2,-0.198328,5,1.611741,-0.115083,5,-0.597600,-0.488985,1.0
1,29725,77,0.776,1,-1.603202,2,-0.198328,5,0.751757,-0.570610,5,1.788069,-0.305825,0.0
2,11561,64,0.624,1,-1.603202,0,-0.198328,5,-0.681550,1.707028,5,-1.194018,0.293607,0.0
3,33241,14,0.789,1,-1.603202,2,-0.198328,1,-1.398204,-0.115083,5,-1.194018,-0.222571,1.0
4,666,50,0.767,1,0.623752,2,1.246375,5,1.611741,-0.570610,1,1.191652,-0.955209,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,55,0.878,1,-1.603202,2,-0.198328,2,0.608426,1.707028,5,-0.597600,-0.389079,1.0
19154,31398,5,0.920,1,0.623752,2,-0.198328,5,0.608426,-1.481666,5,1.191652,-0.222571,1.0
19155,24576,5,0.920,1,0.623752,2,-0.198328,5,1.611741,-0.570610,5,1.191652,-0.355778,0.0
19156,5756,94,0.802,1,0.623752,2,-1.643031,5,-1.398204,0.340445,5,-0.001183,0.526719,0.0


### Saving DataFrame to CSV:

In [36]:
def df_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """
    Saves a DataFrame to a CSV file.
    """
    df.to_csv(file_path, index = False)

df_to_csv(job_change_df_cleaned, 'job_change_df_cleaned.csv')