In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score


In [48]:
# Load data
train = pd.read_csv("train_wns.csv")

In [49]:
train.isna().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,2409
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,4124
length_of_service,0


In [50]:


# Basic cleaning
train["education"] = train["education"].fillna("Unknown")
train["previous_year_rating"] = train["previous_year_rating"].fillna(
    train["previous_year_rating"].median()
)


In [51]:

X = train.drop(columns=["employee_id", "is_promoted"])
y = train["is_promoted"]

train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.1, random_state=42)
train_X.shape, val_X.shape, train_y.shape, val_y.shape


train_y.value_counts(), val_y.value_counts()

categorical_features = [
    "department", "region", "education",
    "gender", "recruitment_channel",
    "KPIs_met >80%", "awards_won?"
]

numerical_features = [
    "no_of_trainings", "age",
    "length_of_service", "avg_training_score",
    "previous_year_rating"
]

num_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", num_tf, numerical_features),
    ("cat", cat_tf, categorical_features)
])

model_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=42
    ))
])

model_pipeline.fit(train_X, train_y)


In [86]:
X.columns

Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')

In [52]:
model_pipeline.predict(val_X)

model_pipeline.predict_proba(val_X)

array([[0.95837652, 0.04162348],
       [0.92351725, 0.07648275],
       [0.80624325, 0.19375675],
       ...,
       [0.50782552, 0.49217448],
       [0.7835442 , 0.2164558 ],
       [0.85739763, 0.14260237]])

In [57]:
def model_train_cal_eval(train_X, train_y, val_X, val_y, model_pipeline):
  predicted_train_tgt = model_pipeline.predict(train_X)
  predicted_val_tgt = model_pipeline.predict(val_X)

  print('Train Score:',roc_auc_score(train_y, predicted_train_tgt))
  print('Val Score:',roc_auc_score(val_y, predicted_val_tgt))


model_train_cal_eval(train_X, train_y, val_X, val_y, model_pipeline)

Train Score: 0.7909464657080824
Val Score: 0.7994436148951322


In [55]:
from sklearn.metrics import f1_score

In [60]:
def model_train_f1_cal_eval(train_X, train_y, val_X, val_y, model_pipeline):
  predicted_train_tgt_f1 = model_pipeline.predict(train_X)
  predicted_val_tgt_f1 = model_pipeline.predict(val_X)

  print('Train Score:',f1_score(train_y, predicted_train_tgt_f1))
  print('Val Score:',f1_score(val_y, predicted_val_tgt_f1))


model_train_f1_cal_eval(train_X, train_y, val_X, val_y, model_pipeline)

Train Score: 0.3786742569379824
Val Score: 0.3640973630831643


In [61]:
test=pd.read_csv('test_wns.csv')
submission = pd.read_csv('sample_submission_wns.csv')

In [62]:
test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [63]:
submission.head()

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [71]:
probs = model_pipeline.predict_proba(
    test.drop(columns=["employee_id"])
)[:, 1]

In [72]:
submission["is_promoted"] = (probs >= 0.5).astype(int)

In [73]:
submission.to_csv('sub1_logreg.csv',index=False)

In [74]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [75]:
param_1 = [
    {
        'model' : [LogisticRegression()],
        'model__penalty' : ['l2',None, 'l1','elasticnet'],
        'model__C' : [0.5, 3]
    },
    {
        'model' : [DecisionTreeClassifier()],
        'model__max_depth' : [3, 5]
    }
]

In [76]:
grid_1 = GridSearchCV(estimator=model_pipeline, param_grid=param_1, cv=2,scoring='roc_auc')

from imblearn.over_sampling import RandomOverSampler

over_sampling = RandomOverSampler()

train_X_os, train_y_os = over_sampling.fit_resample(train_X , train_y)

In [77]:
train_X.shape, train_y.shape

train_y.value_counts()

train_y_os.value_counts()

grid_1.fit(train_X_os, train_y_os)

grid_1.best_params_

grid_1.best_estimator_


In [78]:
res_df_1 = pd.DataFrame(grid_1.cv_results_)
#grid_1.cv_results_
pd.set_option('display.max_colwidth', 1000)
res_df_1[['param_model','params','mean_test_score','rank_test_score']]

Unnamed: 0,param_model,params,mean_test_score,rank_test_score
0,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 0.5, 'model__penalty': 'l2'}",0.878129,4
1,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 0.5, 'model__penalty': None}",0.878417,1
2,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 0.5, 'model__penalty': 'l1'}",,7
3,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 0.5, 'model__penalty': 'elasticnet'}",,7
4,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 3, 'model__penalty': 'l2'}",0.87838,3
5,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 3, 'model__penalty': None}",0.878417,1
6,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 3, 'model__penalty': 'l1'}",,7
7,LogisticRegression(),"{'model': LogisticRegression(), 'model__C': 3, 'model__penalty': 'elasticnet'}",,7
8,DecisionTreeClassifier(),"{'model': DecisionTreeClassifier(), 'model__max_depth': 3}",0.798652,6
9,DecisionTreeClassifier(),"{'model': DecisionTreeClassifier(), 'model__max_depth': 5}",0.848908,5


In [79]:
new_model = grid_1.best_estimator_

model_train_cal_eval(train_X_os, train_y_os, val_X, val_y, new_model)

Train Score: 0.7911177644710579
Val Score: 0.7998396544990926


In [80]:
probs2 = new_model.predict_proba(
    test.drop(columns=["employee_id"])
)[:, 1]

submission["is_promoted"] = (probs >= 0.5).astype(int)
submission.to_csv('sub2_grid_os.csv',index=False)


In [81]:
import joblib

In [82]:
joblib.dump(new_model, 'jobchg_pipeline_model.pkl')

['jobchg_pipeline_model.pkl']

In [83]:
!pip install streamlit



In [84]:
%%writefile webview.py
import streamlit as st
import pandas as pd
import joblib

st.title("HR Analytics")

df = pd.read_csv('train_wns.csv')

# input fields

#st.selectbox
#st.number_input

st.write("App is running successfully!")

categorical_features = [
    "department", "region", "education",
    "gender", "recruitment_channel",
    "KPIs_met >80%", "awards_won?"
]

numerical_features = [
    "no_of_trainings", "age",
    "length_of_service", "avg_training_score",
    "previous_year_rating"
]

department = st.selectbox("department", pd.unique(df["department"]))
region = st.selectbox("region", pd.unique(df["region"]))
education = st.selectbox("education", pd.unique(df["education"]))
gender = st.selectbox("gender", pd.unique(df["gender"]))
recruitment_channel = st.selectbox("recruitment_channel", pd.unique(df["recruitment_channel"]))
KPIs_met_80 = st.selectbox("KPIs_met >80%", pd.unique(df["KPIs_met >80%"]))
awards_won = st.selectbox("awards_won?", pd.unique(df["awards_won?"]))
no_of_trainings = st.number_input("no_of_trainings")
age = st.number_input("age")
length_of_service = st.number_input("length_of_service")
avg_training_score = st.number_input("avg_training_score")
previous_year_rating = st.number_input("previous_year_rating")

inputs = {
     "department": department,
     "region": region,
     "education": education,
     "gender": gender,
     "recruitment_channel": recruitment_channel,
     "KPIs_met >80%": KPIs_met_80,
      "awards_won?": awards_won,
      "no_of_trainings": no_of_trainings,
      "age": age,
      "length_of_service": length_of_service,
      "avg_training_score": avg_training_score,
      "previous_year_rating": previous_year_rating
}

if st.button("Predict"):
  model = joblib.load('jobchg_pipeline_model.pkl')
  X_input = pd.DataFrame([inputs])
  prediction = model.predict(X_input)
  st.write(prediction)

Overwriting webview.py


In [34]:
!curl ipecho.net/plain

34.72.237.94

In [35]:
!pip install pyngrok



In [85]:
!ngrok authtoken 378l5pU31Uxxdenmv3f1MlxqX7C_7YvAK9bLRfyy9S7LMfuRn

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
!streamlit run webview.py & ngrok http 8501 --log stdout

In [91]:
%%writefile app.py
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import pandas as pd

# Load model ONCE
model = joblib.load("jobchg_pipeline_model.pkl")

app = FastAPI(title="HR Analytics Promotion Predictor")

# ---------- Input Schema ----------
class Input(BaseModel):
    department: str
    region: str
    education: str
    gender: str
    recruitment_channel: str
    KPIs_met_80: int
    awards_won: int
    no_of_trainings: int
    age: int
    length_of_service: int
    avg_training_score: float
    previous_year_rating: float

# ---------- Output Schema ----------
class Output(BaseModel):
    is_promoted: int

# ---------- Routes ----------
@app.get("/")
def home():
    return {"message": "FastAPI is running"}

@app.post("/predict", response_model=Output)
def predict(data: Input):

    # Create input DataFrame with training column names
    X_input = pd.DataFrame([{
        "department": data.department,
        "region": data.region,
        "education": data.education,
        "gender": data.gender,
        "recruitment_channel": data.recruitment_channel,
        "KPIs_met >80%": data.KPIs_met_80,
        "awards_won?": data.awards_won,
        "no_of_trainings": data.no_of_trainings,
        "age": data.age,
        "length_of_service": data.length_of_service,
        "avg_training_score": data.avg_training_score,
        "previous_year_rating": data.previous_year_rating
    }])

    prediction = model.predict(X_input)[0]

    return Output(is_promoted=int(prediction))

Overwriting app.py


In [92]:
!uvicorn main:app --host 0.0.0.0 --port 8000


[31mERROR[0m:    Error loading ASGI app. Could not import module "main".
