Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
warnings.filterwarnings('ignore')

Importing the dataset

In [3]:
# Load the raw data: every column except the last one is treated as a feature
# and the last column is the prediction target.
dataset = pd.read_csv('Engineering_graduate_salary.csv')
# Take an explicit copy of the feature columns. The cells below edit X in
# place (column assignment, rename, .loc writes); doing that on a slice of
# `dataset` triggers pandas' SettingWithCopyWarning and may write through to
# the parent frame.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]


Converting the date of birth into an age (2020 minus the birth year); the month and day are dropped because they add little beyond the year

In [4]:
# Turn the date of birth into an age in whole calendar years, measured
# against the year 2020, then rename the column to reflect its new meaning.
reference_year = 2020
birth_year = pd.to_datetime(X['DOB']).dt.year
X['DOB'] = reference_year - birth_year
X.rename(columns={'DOB': 'Age'}, inplace=True)
# Last expression of the cell: display the resulting column.
X['Age']

0       30
1       30
2       31
3       29
4       30
        ..
2993    31
2994    29
2995    32
2996    32
2997    28
Name: Age, Length: 2998, dtype: int64

Reducing the cardinality of the categorical features by grouping infrequent categories into a single "other" label

In [5]:
# Share of all rows carried by each distinct '10board' label.
ten_board_freq = X['10board'].value_counts().div(X.shape[0])
# Cut-off share: the 0.95 quantile of the per-label shares (despite the
# variable's name this is not a bottom decile).
botton_decile = ten_board_freq.quantile(0.95)
# Labels whose share does not exceed the cut-off count as infrequent ...
less_freq = ten_board_freq.loc[ten_board_freq.le(botton_decile)]
# ... and are collapsed into one "other" bucket.
X.loc[X['10board'].isin(list(less_freq.index)), '10board'] = "other"

In [6]:
# Inspect the '10board' label counts after the infrequent labels were grouped.
X['10board'].value_counts()

cbse                            1026
state board                      881
other                            363
0                                256
icse                             213
ssc                               96
up board                          72
matriculation                     29
rbse                              20
wbbse                             14
up                                14
board of secondary education      14
Name: 10board, dtype: int64

In [7]:
# Share of all rows carried by each distinct '12board' label.
twelve_board_freq = X['12board'].value_counts().div(X.shape[0])
# Cut-off share: the 0.95 quantile of the per-label shares.
botton_decile = twelve_board_freq.quantile(0.95)
# Keep only the labels at or below the cut-off; these are the infrequent ones.
less_freq = twelve_board_freq.loc[twelve_board_freq.le(botton_decile)]
# Replace every infrequent label with the shared "other" bucket.
X.loc[X['12board'].isin(list(less_freq.index)), '12board'] = "other"

In [8]:
# Inspect the '12board' label counts after the infrequent labels were grouped.
X['12board'].value_counts()

cbse                               1039
state board                         948
other                               443
0                                   264
icse                                101
up board                             74
isc                                  29
board of intermediate                27
board of intermediate education      21
rbse                                 16
up                                   14
chse                                 11
mp board                             11
Name: 12board, dtype: int64

In [9]:
# Share of all rows carried by each distinct 'Degree' label.
degree_freq = X['Degree'].value_counts().div(X.shape[0])
# Cut-off share: the median (0.5 quantile) of the per-label shares.
botton_decile = degree_freq.quantile(0.5)
# Labels at or below the cut-off are the infrequent ones.
less_freq = degree_freq.loc[degree_freq.le(botton_decile)]
# Collapse the infrequent labels into the shared "other" bucket.
X.loc[X['Degree'].isin(list(less_freq.index)), 'Degree'] = "other"

In [10]:
# Inspect the 'Degree' label counts after the infrequent labels were grouped.
X['Degree'].value_counts()

B.Tech/B.E.    2757
MCA             200
other            41
Name: Degree, dtype: int64

In [11]:
# Share of all rows carried by each distinct 'Specialization' label.
Specialization_freq = X['Specialization'].value_counts().div(X.shape[0])
# Cut-off share: the 0.75 quantile of the per-label shares.
botton_decile = Specialization_freq.quantile(0.75)
# Labels at or below the cut-off are the infrequent ones.
less_freq = Specialization_freq.loc[Specialization_freq.le(botton_decile)]
# Collapse the infrequent labels into the shared "other" bucket.
X.loc[X['Specialization'].isin(list(less_freq.index)), 'Specialization'] = "other"

In [12]:
# Inspect the 'Specialization' label counts after the infrequent labels were
# grouped.
X['Specialization'].value_counts()

electronics and communication engineering    670
computer science & engineering               557
information technology                       506
computer engineering                         415
computer application                         201
other                                        170
mechanical engineering                       155
electronics and electrical engineering       148
electronics & telecommunications              89
electrical engineering                        63
electronics & instrumentation eng             24
Name: Specialization, dtype: int64

In [13]:
# Share of all rows carried by each distinct 'CollegeState' label.
CollegeState_freq = X['CollegeState'].value_counts().div(X.shape[0])
# Cut-off share: the 0.45 quantile of the per-label shares.
botton_decile = CollegeState_freq.quantile(0.45)
# Labels at or below the cut-off are the infrequent ones.
less_freq = CollegeState_freq.loc[CollegeState_freq.le(botton_decile)]
# Collapse the infrequent labels into the shared "other" bucket.
X.loc[X['CollegeState'].isin(list(less_freq.index)), 'CollegeState'] = "other"

In [14]:
# NOTE(review): the preceding cell groups infrequent 'CollegeState' labels,
# yet this cell displays '10board' again — possibly a copy-paste slip.
# Confirm whether X['CollegeState'].value_counts() was intended here.
X['10board'].value_counts()

cbse                            1026
state board                      881
other                            363
0                                256
icse                             213
ssc                               96
up board                          72
matriculation                     29
rbse                              20
wbbse                             14
up                                14
board of secondary education      14
Name: 10board, dtype: int64

Encoding categorical features

In [15]:
# One-hot encode every object-dtype column of X and leave the remaining
# columns unchanged.
categorical_feature_mask = X.dtypes == object
categorical_cols = X.columns[categorical_feature_mask].tolist()
# Use the encoder's default (sparse) output and densify it explicitly. The
# `sparse=` keyword was renamed to `sparse_output=` in newer scikit-learn
# releases, so passing neither keeps this cell working across versions.
ohe = OneHotEncoder()
array_hot_encoded = ohe.fit_transform(X[categorical_cols]).toarray()
# `get_feature_names` was superseded by `get_feature_names_out`; prefer the
# newer accessor when the installed version has it, else fall back.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_names = ohe.get_feature_names_out(categorical_cols)
else:
    encoded_names = ohe.get_feature_names(categorical_cols)
# Reuse X's index so the column-wise concat below lines up row for row even
# when X does not carry a default 0..n-1 index.
X_hot_encoded = pd.DataFrame(array_hot_encoded, columns=encoded_names, index=X.index)
X_other_cols = X.drop(columns=categorical_cols)
# Encoded columns first, followed by the untouched non-categorical columns.
X = pd.concat([X_hot_encoded, X_other_cols], axis=1)

Obtaining the final processed dataset for ML model training

In [16]:
# Put the target column back next to the encoded features, then renumber the
# rows from zero.
data_out = pd.concat([X, y], axis='columns')
data_out = data_out.reset_index(drop=True)

In [17]:
# Print the column names, non-null counts and dtypes of the assembled frame.
data_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 81 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Gender_f                                                  2998 non-null   float64
 1   Gender_m                                                  2998 non-null   float64
 2   10board_0                                                 2998 non-null   float64
 3   10board_board of secondary education                      2998 non-null   float64
 4   10board_cbse                                              2998 non-null   float64
 5   10board_icse                                              2998 non-null   float64
 6   10board_matriculation                                     2998 non-null   float64
 7   10board_other                                             2998 non-null   float64
 8   10board_rbse      

In [19]:
# Mean-impute the values stored as -1 in the columns at positions 67..74.
# NOTE(review): the positions are hard-coded and therefore depend on how many
# one-hot columns the encoding step produced — confirm they still point at the
# intended columns whenever the category grouping changes.
imputed_positions = slice(67, 75)
# Work on the frame's underlying NumPy array.
X0 = data_out.values
imputer = SimpleImputer(missing_values=-1, strategy='mean')
# Fit on, and transform, the same block of columns in one step.
X0[:, imputed_positions] = imputer.fit_transform(X0[:, imputed_positions])
# Rebuild the frame from the array, restoring the original column labels.
data_out = pd.DataFrame(X0, columns=data_out.columns.tolist())

In [20]:
# Display the frame after imputation (the notebook renders it truncated).
data_out

Unnamed: 0,Gender_f,Gender_m,10board_0,10board_board of secondary education,10board_cbse,10board_icse,10board_matriculation,10board_other,10board_rbse,10board_ssc,...,MechanicalEngg,ElectricalEngg,TelecomEngg,CivilEngg,conscientiousness,agreeableness,extraversion,nueroticism,openess_to_experience,Salary
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,-0.1590,0.3789,1.2396,0.14590,0.2889,445000.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,1.1336,0.0459,1.2396,0.52620,-0.2859,110000.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,402.026738,423.336066,260.000000,338.807692,0.5100,-0.1232,1.5428,-0.29020,-0.2875,255000.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,-0.4463,0.2124,0.3174,0.27270,0.4805,420000.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,-1.4992,-0.7473,-1.0697,0.06223,0.1864,200000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2993,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,-1.1901,0.9688,-1.0697,1.35490,0.0284,120000.0
2994,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,-0.1082,0.0328,-0.4891,-0.29020,0.5024,120000.0
2995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,-0.8810,0.1888,-0.3440,0.06230,0.6603,385000.0
2996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,402.026738,423.336066,349.879562,338.807692,1.4374,1.2808,-0.4891,-1.46537,0.5419,530000.0


In [21]:
# Persist the processed frame for the model-training step. No `index`
# argument is passed, so pandas' default applies and the row index is written
# as the first, unnamed CSV column.
processed_csv_path = 'Engineering_graduate_salary_processed.csv'
data_out.to_csv(processed_csv_path)