In [3]:
import pandas as pd

df = pd.read_csv("../data/dataset.csv")

print(df.shape)
df.head()


(1000, 23)


Unnamed: 0,Age,Gender,Location,Lead_Source,Interest_Level,Follow_Ups,Lead_Converted,Course,Course_Level,Course_Duration_Weeks,...,Performance_Score,PIP_Flag,Top_Performer_Flag,Joined,Employment_Type,Basic_Salary,HRA,TA,PF,CTC
0,26,Male,Poland,LinkedIn,2,2,0,Data Science,Advanced,4,...,82.0,0,0,0,Contract,2356,567.58,191.9,269.01,3384.49
1,39,Male,Spain,Referral,3,7,1,ML Engineering,Beginner,11,...,74.2,0,0,1,Contract,2165,477.44,258.42,220.17,3121.02
2,34,Male,France,LinkedIn,5,4,1,Business Analytics,Intermediate,23,...,73.4,0,0,1,Contract,2398,503.8,235.47,275.37,3412.64
3,30,Male,Spain,Website,4,7,1,Python,Advanced,19,...,80.8,0,0,1,Permanent,2764,621.65,260.84,314.73,3961.22
4,27,Male,Germany,Referral,5,2,1,HR Analytics,Beginner,16,...,66.2,0,0,1,Contract,2488,590.34,228.17,261.28,3567.79


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df_ml = df.copy()

# Encode categorical columns
cat_cols = ["Gender", "Location", "Lead_Source", "Course", "Course_Level", "Employment_Type"]

le = LabelEncoder()
for col in cat_cols:
    df_ml[col] = le.fit_transform(df_ml[col])

print(df_ml.head())


   Age  Gender  Location  Lead_Source  Interest_Level  Follow_Ups  \
0   26       1         3            1               2           2   
1   39       1         4            2               3           7   
2   34       1         0            1               5           4   
3   30       1         4            4               4           7   
4   27       1         1            2               5           2   

   Lead_Converted  Course  Course_Level  Course_Duration_Weeks  ...  \
0               0       1             0                      4  ...   
1               1       3             1                     11  ...   
2               1       0             2                     23  ...   
3               1       4             0                     19  ...   
4               1       2             1                     16  ...   

   Performance_Score  PIP_Flag  Top_Performer_Flag  Joined  Employment_Type  \
0               82.0         0                   0       0                0   


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X = df_ml.drop("Lead_Converted", axis=1)
y = df_ml["Lead_Converted"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

lead_model = LogisticRegression(max_iter=1000)
lead_model.fit(X_train, y_train)

y_pred = lead_model.predict(X_test)

print("Lead Conversion Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Lead Conversion Accuracy: 0.972
              precision    recall  f1-score   support

           0       0.96      0.95      0.96        81
           1       0.98      0.98      0.98       169

    accuracy                           0.97       250
   macro avg       0.97      0.97      0.97       250
weighted avg       0.97      0.97      0.97       250



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
from sklearn.ensemble import RandomForestClassifier

X = df_ml.drop("PIP_Flag", axis=1)
y = df_ml["PIP_Flag"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

pip_model = RandomForestClassifier(n_estimators=150, random_state=42)
pip_model.fit(X_train, y_train)

y_pred = pip_model.predict(X_test)

print("PIP Prediction Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


PIP Prediction Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       224
           1       1.00      1.00      1.00        26

    accuracy                           1.00       250
   macro avg       1.00      1.00      1.00       250
weighted avg       1.00      1.00      1.00       250



In [7]:
X = df_ml.drop("Employment_Type", axis=1)
y = df_ml["Employment_Type"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

emp_model = LogisticRegression(max_iter=1000)
emp_model.fit(X_train, y_train)

y_pred = emp_model.predict(X_test)

print("Employment Type Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Employment Type Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       201
           1       0.85      0.47      0.61        49

    accuracy                           0.88       250
   macro avg       0.87      0.72      0.77       250
weighted avg       0.88      0.88      0.87       250



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

X = df_ml.drop("CTC", axis=1)
y = df_ml["CTC"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

salary_model = LinearRegression()
salary_model.fit(X_train, y_train)

y_pred = salary_model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


MAE: 0.0037776529913699053
RMSE: 0.005873386080336205


In [10]:
import joblib

joblib.dump(lead_model, "../models/lead_model.pkl")
joblib.dump(pip_model, "../models/pip_model.pkl")
joblib.dump(salary_model, "../models/salary_model.pkl")


['../models/salary_model.pkl']