In [1]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import TargetEncoder
from sklearn.feature_selection import f_classif, mutual_info_classif, RFE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import classification_report

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('data_science_job.csv')
df.drop('enrollee_id', axis=1, inplace=True)

In [3]:
df.isna().sum()

city                         0
city_development_index     479
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
training_hours             766
target                       0
dtype: int64

In [4]:
# Fill nan values
df['city_development_index'] = df['city_development_index'].fillna(df['city_development_index'].median())
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['enrolled_university'] = df['enrolled_university'].fillna(df['enrolled_university'].mode()[0])
df['education_level'] = df['education_level'].fillna(df['education_level'].mode()[0])
df['major_discipline'] = df['major_discipline'].fillna(df['major_discipline'].mode()[0])
df['experience'] = df['experience'].fillna(df['experience'].median())
df['company_size'] = df['company_size'].fillna(df['company_size'].mode()[0])
df['company_type'] = df['company_type'].fillna(df['company_type'].mode()[0])
df['training_hours'] = df['training_hours'].fillna(df['training_hours'].median())

In [5]:
df.isna().sum()

city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
training_hours            0
target                    0
dtype: int64

In [6]:
df['target'] = df['target'].astype('boolean')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  object 
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  object 
 3   relevent_experience     19158 non-null  object 
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  float64
 8   company_size            19158 non-null  object 
 9   company_type            19158 non-null  object 
 10  training_hours          19158 non-null  float64
 11  target                  19158 non-null  boolean
dtypes: boolean(1), float64(3), object(8)
memory usage: 1.6+ MB


In [8]:
tg = TargetEncoder(smooth='auto', target_type='binary')
y = df['target']
x = df.drop(list(df.select_dtypes(exclude='object').columns), axis=1)

x_trans = tg.fit_transform(x, y)
df_encoded = pd.DataFrame(x_trans, columns=x.columns)

In [9]:
df_encoded['target'] = y
df_encoded

Unnamed: 0,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,company_size,company_type,target
0,0.211707,0.248080,0.216106,0.213735,0.278731,0.250740,0.330122,0.261464,True
1,0.170925,0.248217,0.332470,0.215169,0.276710,0.250398,0.326318,0.259696,False
2,0.587675,0.248080,0.335021,0.382131,0.278731,0.250740,0.330122,0.261464,False
3,0.345299,0.248080,0.335021,0.213735,0.278731,0.265090,0.330122,0.261464,True
4,0.319188,0.248080,0.216106,0.213735,0.213457,0.250740,0.330122,0.129746,False
...,...,...,...,...,...,...,...,...,...
19153,0.124565,0.248342,0.340873,0.216304,0.274444,0.202280,0.324627,0.260566,True
19154,0.213385,0.248855,0.212402,0.211689,0.278982,0.250018,0.329488,0.260766,True
19155,0.213319,0.247694,0.214752,0.214710,0.279810,0.251587,0.327235,0.260695,False
19156,0.130955,0.248342,0.213972,0.216304,0.201393,0.251227,0.163150,0.260566,False


In [10]:
fig = px.imshow(
    df_encoded.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [11]:
final_df = df
final_df[df_encoded.columns] = df_encoded
final_df

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,0.211707,0.920,0.248080,0.216106,0.213735,0.278731,0.250740,20.0,0.330122,0.261464,36.0,True
1,0.170925,0.776,0.248217,0.332470,0.215169,0.276710,0.250398,15.0,0.326318,0.259696,47.0,False
2,0.587675,0.624,0.248080,0.335021,0.382131,0.278731,0.250740,5.0,0.330122,0.261464,83.0,False
3,0.345299,0.789,0.248080,0.335021,0.213735,0.278731,0.265090,0.0,0.330122,0.261464,52.0,True
4,0.319188,0.767,0.248080,0.216106,0.213735,0.213457,0.250740,20.0,0.330122,0.129746,8.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
19153,0.124565,0.878,0.248342,0.340873,0.216304,0.274444,0.202280,14.0,0.324627,0.260566,42.0,True
19154,0.213385,0.920,0.248855,0.212402,0.211689,0.278982,0.250018,14.0,0.329488,0.260766,52.0,True
19155,0.213319,0.920,0.247694,0.214752,0.214710,0.279810,0.251587,20.0,0.327235,0.260695,44.0,False
19156,0.130955,0.802,0.248342,0.213972,0.216304,0.201393,0.251227,0.0,0.163150,0.260566,97.0,False


In [12]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    19158 non-null  float64
 1   city_development_index  19158 non-null  float64
 2   gender                  19158 non-null  float64
 3   relevent_experience     19158 non-null  float64
 4   enrolled_university     19158 non-null  float64
 5   education_level         19158 non-null  float64
 6   major_discipline        19158 non-null  float64
 7   experience              19158 non-null  float64
 8   company_size            19158 non-null  float64
 9   company_type            19158 non-null  float64
 10  training_hours          19158 non-null  float64
 11  target                  19158 non-null  boolean
dtypes: boolean(1), float64(11)
memory usage: 1.6 MB


In [13]:
# Part 1 Task 1
fig = px.imshow(
    final_df.corr(), text_auto=True, aspect='auto'
)
fig.show()

In [14]:
# Part 1 Task 2
def rforest_feature_selection(threshold = 0.1):
    model = xgb.XGBRFClassifier()
    x = final_df.drop('target', axis=1)
    y = final_df['target']
    model.fit(x, y)
    features_importances = dict(zip(x.columns.to_list(), list(model.feature_importances_)))
    return dict(filter(lambda pair: pair[1] >= threshold, features_importances.items()))
rforest_feature_selection(0.02)

{'city': 0.45746398,
 'city_development_index': 0.12793338,
 'relevent_experience': 0.056673724,
 'enrolled_university': 0.04256852,
 'education_level': 0.049435545,
 'company_size': 0.1736853,
 'company_type': 0.04254239}

In [15]:
# Part 1 Task 3
def p_value_feature_selection():
    x = final_df.drop('target', axis=1)
    y = final_df['target']
    f_test = f_classif(x, y)
    p_values = f_test[1]
    features_p_values = dict(zip(x.columns.to_list(), list(p_values)))
    return dict(filter(lambda pair: pair[1] < 0.05, features_p_values.items()))
p_value_feature_selection()

{'city': 0.0,
 'city_development_index': 0.0,
 'relevent_experience': 5.162093013693399e-70,
 'enrolled_university': 8.830957032209763e-98,
 'education_level': 2.069559403348117e-34,
 'experience': 4.372211179054333e-136,
 'company_size': 2.8218687949395375e-129,
 'company_type': 5.429812083876602e-19,
 'training_hours': 0.0034057479488734804}

In [16]:
# Part 1 Task 4
def mutual_info_feature_selection(percentile=0.5):
    x = final_df.drop('target', axis=1)
    y = final_df['target']
    features_importances = mutual_info_classif(x, y)    
    features_importances = dict(zip(x.columns.to_list(), list(features_importances)))
    features_importances = dict(sorted(features_importances.items(),
                                       key=lambda pair: pair[0])[:round(len(features_importances)*percentile)])
    return features_importances    
mutual_info_feature_selection()

{'city': 0.07180882649862652,
 'city_development_index': 0.06445761146440487,
 'company_size': 0.0157881752944804,
 'company_type': 0.0021791471068941526,
 'education_level': 0.004823256632876749,
 'enrolled_university': 0.010657292227569393}

In [18]:
# Part 1 Task 5
def rfe_feature_selection(percentile=0.5):
    x = final_df.drop('target', axis=1)
    y = final_df['target']    
    model = RFE(estimator=SVC(kernel='linear'), n_features_to_select=percentile)
    model.fit(x, y)
    return {feature: True for feature in model.get_feature_names_out()}
rfe_feature_selection()


{'city': True,
 'city_development_index': True,
 'enrolled_university': True,
 'company_size': True,
 'company_type': True}

In [17]:
rfe_selected = {'city': True,
                'city_development_index': True,
                'enrolled_university': True,
                'company_size': True,
                'company_type': True}

In [20]:
# Part 1 Task 6
# Random Forest
model = xgb.XGBClassifier()
x = final_df[list(rforest_feature_selection().keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.90      0.86      2877
         1.0       0.56      0.41      0.47       955

    accuracy                           0.77      3832
   macro avg       0.69      0.65      0.66      3832
weighted avg       0.76      0.77      0.76      3832



In [21]:
# Part 1 Task 6
# P-Value
model = xgb.XGBClassifier()
x = final_df[list(p_value_feature_selection().keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.89      0.86      2877
         1.0       0.56      0.42      0.48       955

    accuracy                           0.77      3832
   macro avg       0.69      0.65      0.67      3832
weighted avg       0.76      0.77      0.76      3832



In [22]:
# Part 1 Task 6
# Mutual Info
model = xgb.XGBClassifier()
x = final_df[list(mutual_info_feature_selection().keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.89      0.86      2877
         1.0       0.56      0.42      0.48       955

    accuracy                           0.77      3832
   macro avg       0.69      0.66      0.67      3832
weighted avg       0.76      0.77      0.76      3832



In [23]:
# Part 1 Task 6
# RFE
model = xgb.XGBClassifier()
x = final_df[list(rfe_selected.keys())]
y = final_df['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.82      0.89      0.85      2877
         1.0       0.55      0.41      0.47       955

    accuracy                           0.77      3832
   macro avg       0.69      0.65      0.66      3832
weighted avg       0.75      0.77      0.76      3832



In [None]:
# Mutual info is the best fow now