In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt

# Import target data

In [18]:
# Extra tokens that should be parsed as NaN, on top of pandas' built-in defaults.
missing_values = ["n/a", "na", "--", "NONE", "None", "none", "NA", "N/A",'inf','-inf', '?', 'Null', 'NULL']
train_data = pd.read_csv('aug_train.csv', na_values = missing_values)
# Drop the identifier columns. `columns=` is used instead of a positional axis
# argument because DataFrame.drop only accepts `axis` as a keyword in pandas >= 2.0.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
train_data = train_data.drop(columns=['enrollee_id', 'city'])
train_data.head()

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [19]:
# Row and column counts of the frame after the identifier columns were dropped.
train_data.shape

(19158, 12)

In [20]:
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_development_index  19158 non-null  float64
 1   gender                  14650 non-null  object 
 2   relevent_experience     19158 non-null  object 
 3   enrolled_university     18772 non-null  object 
 4   education_level         18698 non-null  object 
 5   major_discipline        16345 non-null  object 
 6   experience              19093 non-null  object 
 7   company_size            13220 non-null  object 
 8   company_type            13018 non-null  object 
 9   last_new_job            18735 non-null  object 
 10  training_hours          19158 non-null  int64  
 11  target                  19158 non-null  float64
dtypes: float64(2), int64(1), object(9)
memory usage: 1.8+ MB


The `company_size` column contains a data-entry error: one value appears as `Oct-49` in the dataset and is printed by pandas as `10/49`. We convert that value to `np.nan` (NaN) so that it is treated as missing.

In [21]:
# Distribution of company_size before the cleanup.
print(train_data['company_size'].value_counts())

# Treat the malformed '10/49' entry as missing.
# NOTE(review): '10/49' looks like a mangled '10-49' size bucket (1471 rows in the
# output of this cell); mapping it to '10-49' instead of NaN would keep that
# information -- confirm against the data source before changing.
train_data['company_size'] = train_data['company_size'].replace({'10/49': np.nan})

print("==============================")
# Distribution after the cleanup: the '10/49' row is gone.
print(train_data['company_size'].value_counts())

50-99        3083
100-500      2571
10000+       2019
10/49        1471
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
Name: company_size, dtype: int64
50-99        3083
100-500      2571
10000+       2019
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
Name: company_size, dtype: int64


This cell checks how many unique values are present in each column. Note that a missing value (NaN) is counted as one of the unique values.

In [22]:
# Report the number of distinct values per column. NaN is counted as its own
# value (dropna=False), so columns with missing data show one extra "category".
reported_dtypes = ('int64', 'float64', 'object')
for col_name, column in train_data.items():
  if not any(column.dtypes == kind for kind in reported_dtypes):
    continue
  unique_cat = column.nunique(dropna=False)
  print("Feature '{col_name}' has '{unique_cat}' unique categories".format(col_name = col_name, unique_cat = unique_cat))

Feature 'city_development_index' has '93' unique categories
Feature 'gender' has '4' unique categories
Feature 'relevent_experience' has '2' unique categories
Feature 'enrolled_university' has '4' unique categories
Feature 'education_level' has '6' unique categories
Feature 'major_discipline' has '7' unique categories
Feature 'experience' has '23' unique categories
Feature 'company_size' has '8' unique categories
Feature 'company_type' has '7' unique categories
Feature 'last_new_job' has '7' unique categories
Feature 'training_hours' has '241' unique categories
Feature 'target' has '2' unique categories


Checking the null values, we see that roughly 39% of `company_size` (7409 of 19158 rows) and roughly 32% of `company_type` (6140 of 19158 rows) are missing.

In [23]:
# Number of missing entries in each column.
train_data.isna().sum()

city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              7409
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

# Label encoder

In [24]:
# Categorical (string) columns that are converted to integer codes.
categorical_cols = [
    'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline', 'experience',
    'company_size', 'company_type', 'last_new_job',
]
to_LabelEncode = train_data[categorical_cols]

# Remember which cells hold real values before anything is stringified.
observed = to_LabelEncode.notna()

# The same encoder object is re-fitted on every column, so after this cell `le`
# only holds the classes of the last column. Casting to str turns NaN into the
# literal "nan", which therefore receives a code of its own in each column.
le = LabelEncoder()
train_temp = to_LabelEncode.astype("str").apply(le.fit_transform)

# Keep the integer codes where a value was observed and put the original NaN
# back everywhere else, so missing entries stay missing.
train_Label_encode = train_temp.where(observed, to_LabelEncode)

In [25]:
# Remove the raw string columns; their encoded versions live in train_Label_encode.
# `columns=` replaces the positional axis argument, which DataFrame.drop no longer
# accepts in pandas >= 2.0 (it raises TypeError).
train_data = train_data.drop(columns=['gender', 'relevent_experience','enrolled_university', 'education_level', 'major_discipline',
                                      'experience', 'company_size', 'company_type', 'last_new_job'])

In [26]:
# Display what is left after the categorical columns were dropped.
train_data

Unnamed: 0,city_development_index,training_hours,target
0,0.920,36,1.0
1,0.776,47,0.0
2,0.624,83,0.0
3,0.789,52,1.0
4,0.767,8,0.0
...,...,...,...
19153,0.878,42,1.0
19154,0.920,52,1.0
19155,0.920,44,0.0
19156,0.802,97,0.0


In [27]:
# Index-aligned left join: encoded categorical columns first, followed by the
# columns that were left in train_data.
train_data = train_Label_encode.join(other=train_data, how='left')
train_data

Unnamed: 0,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,city_development_index,training_hours,target
0,1,0,3,0,5,21,,,0,0.920,36,1.0
1,1,1,3,0,5,6,3,5,4,0.776,47,0.0
2,,1,0,0,5,15,,,6,0.624,83,0.0
3,,1,,0,1,20,,5,6,0.789,52,1.0
4,1,0,3,2,5,21,3,1,3,0.767,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19153,1,1,3,0,2,5,,,0,0.878,42,1.0
19154,1,0,3,0,5,5,,,3,0.920,52,1.0
19155,1,0,3,0,5,21,3,5,3,0.920,44,0.0
19156,1,0,3,1,,20,4,5,1,0.802,97,0.0


# Applying the MICE imputer for missing-value imputation

In [28]:
# Iterative (MICE-style) imputation: each column with missing values is
# regressed on other columns with a linear model and the gaps are filled with
# the predictions.
# NOTE(review): `target` is part of the matrix passed to the imputer, so it can
# be used as a predictor for the imputed features -- confirm this leakage is
# acceptable for the downstream model.
# NOTE(review): the imputed values for label-encoded columns are real-valued
# (e.g. 2.463052 for company_size in the output of this cell), not valid
# category codes -- confirm whether they should be rounded or clipped.
lr = LinearRegression()
mice_imputer = IterativeImputer(random_state=42, estimator=lr, max_iter=10, n_nearest_features=2, imputation_order = 'roman')

# fit_transform returns a bare ndarray. The column labels and index are taken
# from the input frame instead of a hand-copied list, so they cannot drift out
# of sync with the actual column order.
cleaned_train_data = pd.DataFrame(
    mice_imputer.fit_transform(train_data),
    columns=train_data.columns,
    index=train_data.index,
)

cleaned_train_data



Unnamed: 0,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,city_development_index,training_hours,target
0,1.000000,0.0,3.000000,0.0,5.000000,21.0,2.463052,4.274976,0.0,0.920,36.0,1.0
1,1.000000,1.0,3.000000,0.0,5.000000,6.0,3.000000,5.000000,4.0,0.776,47.0,0.0
2,0.939181,1.0,0.000000,0.0,5.000000,15.0,2.462749,4.129187,6.0,0.624,83.0,0.0
3,0.835441,1.0,1.318954,0.0,1.000000,20.0,2.302724,5.000000,6.0,0.789,52.0,1.0
4,1.000000,0.0,3.000000,2.0,5.000000,21.0,3.000000,1.000000,3.0,0.767,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19153,1.000000,1.0,3.000000,0.0,2.000000,5.0,2.485616,4.271062,0.0,0.878,42.0,1.0
19154,1.000000,0.0,3.000000,0.0,5.000000,5.0,2.382644,4.288925,3.0,0.920,52.0,1.0
19155,1.000000,0.0,3.000000,0.0,5.000000,21.0,3.000000,5.000000,3.0,0.920,44.0,0.0
19156,1.000000,0.0,3.000000,1.0,4.713418,20.0,4.000000,5.000000,1.0,0.802,97.0,0.0


Now we don't have any missing values

In [29]:
# Per-column count of missing entries in the imputed frame.
cleaned_train_data.isna().sum()

gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
city_development_index    0
training_hours            0
target                    0
dtype: int64

# The target classes are imbalanced (about 75% class 0 vs. 25% class 1)

In [30]:
# Number of rows per target class.
cleaned_train_data['target'].value_counts()

0.0    14381
1.0     4777
Name: target, dtype: int64

In [31]:
# Write the imputed frame to disk; index=False leaves the row index out of the file.
cleaned_train_data.to_csv('cleaned_train_data.csv', index=False)