In [99]:
#import general-purpose libraries

import pandas as pd
import numpy as np


In [100]:
#import preprocessing libraries

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [101]:
#import ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [102]:
#import joblib (used to save the fitted pipeline to disk)
import joblib

In [103]:
# Load the three competition files into DataFrames:
#   train      - rows used for fitting (includes the is_promoted column)
#   test       - rows to predict on (no is_promoted column)
#   submission - template holding employee_id and is_promoted columns
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [104]:
#Initial observation of dataset

In [105]:
# to see no.of.cols & rows in the dataset
train.shape,test.shape,submission.shape

((54808, 14), (23490, 13), (23490, 2))

In [106]:
# to check nulls or missing values in train dataset
train.isna().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [107]:
train.head(2)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0


In [108]:
# Columns that are kept out of the feature set:
#   tgt_col  - the label column to predict
#   ign_cols - identifier column(s) dropped before modelling
tgt_col, ign_cols = ['is_promoted'], ['employee_id']

In [109]:
train.select_dtypes(include='object').columns

Index(['department', 'region', 'education', 'gender', 'recruitment_channel'], dtype='object')

In [110]:
train.select_dtypes(exclude='object').columns

Index(['employee_id', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')

In [147]:
# Partition the columns by dtype: object columns are treated as categorical,
# every other column as numerical (at this point that still includes the
# identifier and the target).
cat_cols, num_cols = (
    train.select_dtypes(**dtype_filter).columns
    for dtype_filter in ({'include': 'object'}, {'exclude': 'object'})
)

In [112]:
print(tgt_col,ign_cols,cat_cols,num_cols,sep='\n')

['is_promoted']
['employee_id']
Index(['department', 'region', 'education', 'gender', 'recruitment_channel'], dtype='object')
Index(['employee_id', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')


In [113]:
#Displaying cat & num cols

In [114]:
train[cat_cols].head(5)

Unnamed: 0,department,region,education,gender,recruitment_channel
0,Sales & Marketing,region_7,Master's & above,f,sourcing
1,Operations,region_22,Bachelor's,m,other
2,Sales & Marketing,region_19,Bachelor's,m,sourcing
3,Sales & Marketing,region_23,Bachelor's,m,other
4,Technology,region_26,Bachelor's,m,other


In [115]:
train[num_cols].head(5)

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,1,35,5.0,8,1,0,49,0
1,65141,1,30,5.0,4,0,0,60,0
2,7513,1,34,3.0,7,0,0,50,0
3,2542,2,39,1.0,10,0,0,50,0
4,48945,1,45,3.0,2,0,0,73,0


In [116]:
train[num_cols].drop(columns=tgt_col+ign_cols).head(5)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,1,35,5.0,8,1,0,49
1,1,30,5.0,4,0,0,60
2,1,34,3.0,7,0,0,50
3,2,39,1.0,10,0,0,50
4,1,45,3.0,2,0,0,73


In [149]:
# Numerical feature columns: every non-object column except the target and
# the ignored identifier.
# Derived directly from `train` rather than from the previous value of
# `num_cols`, so the cell is idempotent: re-running it no longer raises a
# KeyError from trying to drop columns that were already removed.
num_cols = train.drop(columns=tgt_col+ign_cols).select_dtypes(exclude='object').columns
num_cols

Index(['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')

In [118]:
train[num_cols].head(5)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,1,35,5.0,8,1,0,49
1,1,30,5.0,4,0,0,60
2,1,34,3.0,7,0,0,50
3,2,39,1.0,10,0,0,50
4,1,45,3.0,2,0,0,73


In [166]:
# Creating pipelines

In [168]:
# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' so categories not seen during
# fit do not raise at transform time).
cat_pipe_encode = Pipeline([
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [176]:
# Numerical pipeline: fill missing values with the column median,
# then standardise each column with StandardScaler.
num_pipe_encode = Pipeline([
    ('impute_num', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

In [180]:
# Route each group of columns through its own preprocessing pipeline.
# NOTE(review): columns not listed in either group (e.g. employee_id in the
# test frame) are presumably dropped by ColumnTransformer's default
# `remainder` setting - confirm against the scikit-learn docs.
preporcess = ColumnTransformer([
    ('cat_encode', cat_pipe_encode, cat_cols),   # object-dtype columns
    ('num_encode', num_pipe_encode, num_cols),   # numerical feature columns
])

In [248]:
# Assigning ML - Algorithm to a variable
mymodel = LogisticRegression()

In [244]:
# Chain preprocessing and the estimator into one pipeline so that a single
# fit/predict call applies both stages.
model_pipeline = Pipeline([
    ('preporcess', preporcess),   # column-wise imputing / encoding / scaling
    ('model', mymodel),           # estimator held in the variable mymodel
])

In [246]:
model_pipeline

In [196]:
# Feature matrix: every column except the target and the ignored identifier
X = train.drop(tgt_col + ign_cols, axis='columns')
X.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60


In [200]:
# Target vector, selected through tgt_col so the label name lives in one place
y = train[tgt_col[0]]
y.head(2)

0    0
1    0
Name: is_promoted, dtype: int64

In [208]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
tuple(split.shape for split in (train_X, val_X, train_y, val_y))

((49327, 12), (5481, 12), (49327,), (5481,))

In [226]:
# Mathematical cross-check of the 90/10 split against the total row count.
# int() truncates, so the 10% figure shown here can be one row lower than the
# validation size reported by train_test_split above.
train.shape, int(len(train)*.9), int(len(train)*.1)

((54808, 14), 49327, 5480)

In [257]:
# import warnings to ignore (or) suppress warnings
import warnings
warnings.filterwarnings('ignore')

#import metrics libraries to evaluate the model
from sklearn.metrics import f1_score

In [259]:
# fitting the model
model_pipeline.fit(train_X,train_y)

In [261]:
# predicting on the training data with the fitted model
model_pipeline.predict(train_X)

array([0, 0, 0, ..., 0, 0, 0])

In [267]:
# Reusable evaluation helper: reports the F1 score on two datasets
def model_train_val_eval (train_X,val_X,train_y,val_y,model_pipeline):
    """Print the F1 score of a fitted pipeline on the training and validation data.

    Parameters
    ----------
    train_X, val_X : feature sets passed to ``model_pipeline.predict``.
    train_y, val_y : true labels compared against the respective predictions.
    model_pipeline : object exposing ``predict``; it is not fitted here, so it
        must already be fitted by the caller.

    Returns
    -------
    None. The two scores are printed; the line labelled ``Test`` is the score
    for ``val_X`` / ``val_y``.
    """
    # Predict on both datasets first, then score and print them in order.
    predictions = [model_pipeline.predict(features) for features in (train_X, val_X)]
    labels = ('Train f1 score:', 'Test  f1 score:')
    for label, truth, predicted in zip(labels, (train_y, val_y), predictions):
        print (label, f1_score(truth, predicted))
    

In [269]:
model_train_val_eval(train_X,val_X,train_y,val_y,model_pipeline)

Train f1 score: 0.3970745629682483
Test  f1 score: 0.40418118466898956


In [271]:
# model prediction with actual test data.
model_pipeline.predict(test)

array([0, 0, 0, ..., 0, 0, 1])

In [275]:
submission.head(2)

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0


In [278]:
# Assign the predictions for the test rows to the submission frame.
# NOTE(review): this relies on the submission rows being in the same order as
# the test rows - confirm (e.g. by comparing the employee_id columns).
submission['is_promoted'] = model_pipeline.predict(test)

In [280]:
submission.head(2)

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0


In [284]:
# Writing predicted data to CSV 
submission.to_csv('zia1.csv',index=False)

In [308]:
#After submitting zia1.csv to Analytics Vidhya we received an f1_score of 0.38 and rank #3205

![image.png](attachment:a0633aa2-1f8c-48c4-8cc1-6ecb24c4be78.png)
![image.png](attachment:a14253a4-4ffa-443d-99dc-470da1b702f1.png)

In [310]:
train_X.columns

Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')

In [312]:
test.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score'],
      dtype='object')

In [314]:
# Returns an empty Index: every column of train_X also exists in test, so there is nothing in train_X that test lacks
train_X.columns.difference(test.columns)

Index([], dtype='object')

In [316]:
# Returns the columns present in test but absent from train_X (here only the employee_id identifier)
test.columns.difference(train_X.columns)

Index(['employee_id'], dtype='object')

In [337]:
train.dtypes

employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [339]:
#creating pickle file from model model_pipeline and save it into the pickle file
joblib.dump(model_pipeline,'promote_pipeline_model.pkl')

['promote_pipeline_model.pkl']