In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read Datasets/processed_data.csv (path relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Datasets/processed_data.csv')

In [3]:
# Remove two columns before modelling.
# NOTE(review): 'Unnamed: 0' is presumably a saved index column and 'PROSPECTID' a row identifier — confirm.
# Re-running this cell on its own raises KeyError because the columns are already gone.
df = df.drop(['Unnamed: 0', 'PROSPECTID'], axis=1)

In [4]:
# Separate the feature columns (X) from the target column 'Approved_Flag' (y).
X = df.drop('Approved_Flag', axis=1)
y = df['Approved_Flag']

In [5]:
# Distinct target labels in ascending order.
classes = np.sort(y.unique())

In [6]:
# Names of the nominal categorical feature columns.
nominal_cols = [
    'MARITALSTATUS',
    'GENDER',
    'last_prod_enq2',
    'first_prod_enq2',
]

In [7]:
# Name of the ordinal feature column, and the rank (1 through 7) given to each of its levels.
ordinal_cols = ['EDUCATION']
order = {
    level: rank
    for rank, level in enumerate(
        ['OTHERS', 'SSC', '12TH', 'UNDER GRADUATE', 'GRADUATE', 'POST-GRADUATE', 'PROFESSIONAL'],
        start=1,
    )
}

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [9]:
# Feature column names as a plain list, in the same order as the columns of X.
cols = X.columns.to_list()
# nominal_cols is intentionally not re-declared here: it is already defined in an earlier
# cell, and keeping a single definition prevents the two copies from drifting apart.

In [10]:
# Print the positional index (within cols) of every nominal column, one per line.
for position, column_name in enumerate(cols):
    if column_name not in nominal_cols:
        continue
    print(position)

46
49
63
64


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Preprocessing step: encode the categorical columns and pass every other column through unchanged.
# Columns are selected by NAME (ordinal_cols / nominal_cols) instead of hard-coded positions,
# so the transformer keeps selecting the right features if the column order of X ever changes.
# NOTE(review): assumes the column previously addressed as position 47 is 'EDUCATION', and that
# nominal_cols lists its columns in the same relative order as they appear in X — confirm.
trf1 = ColumnTransformer([
    # Education levels become integer codes that follow the ranking stored in `order`
    # (lowest-ranked level first).
    ('or_education', OrdinalEncoder(categories=[sorted(order, key=order.get)]), ordinal_cols),
    # NOTE(review): with drop='first' together with handle_unknown='ignore', a category unseen
    # during fit is encoded as all zeros, i.e. indistinguishable from the dropped first category.
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), nominal_cols)
], remainder='passthrough')

In [14]:
from sklearn.preprocessing import LabelEncoder
# Encode the target labels as integers. The encoder is fitted on the training labels only
# and the same fitted mapping is then applied to the test labels.
# Note: this cell rebinds y_train / y_test, so it should be run exactly once per split.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [16]:
# Classifier used as the final pipeline step: a multi-class XGBoost model
# configured for 4 classes.
trf2 = XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.3,
    alpha=1,
)

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
# Chain the column preprocessing (trf1) and the classifier (trf2) into one estimator,
# so both steps are fitted and applied together.
pipe = Pipeline(steps=[('trf1', trf1), ('trf2', trf2)])

In [19]:
# Fit the whole pipeline (preprocessing, then classifier) on the training split.
pipe.fit(X=X_train, y=y_train)

In [20]:
# Predict (integer-encoded) class labels for the held-out test rows.
y_pred_test = pipe.predict(X=X_test)

In [21]:
# Fraction of test rows whose predicted class matches the true class.
# NOTE(review): a score this close to 1.0 is unusual — worth checking the features for target leakage.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.9957428693060877

In [23]:
# Per-class precision / recall / F1 on the test split.
# `labels` is passed explicitly so that position i of every metric array always describes
# encoded class i. Without it, scikit-learn only reports labels that actually occur in
# y_test or the predictions, so the arrays could be shorter than (or misaligned with) the
# list of class names.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test, y_pred_test, labels=np.arange(len(le.classes_))
)
# le.classes_ holds the original label for each encoded integer, in encoding order, so it is
# the authoritative source for the class names printed below.
for i, v in enumerate(le.classes_):
    print(f"Class: {v}")
    print(f"Precision: {precision[i]}")
    print(f"recall: {recall[i]}")
    print(f"f1_score: {f1_score[i]}")
    print()

Class: P1
Precision: 0.9933460076045627
recall: 0.9693877551020408
f1_score: 0.9812206572769953

Class: P2
Precision: 1.0
recall: 1.0
f1_score: 1.0

Class: P3
Precision: 0.9762589928057553
recall: 0.9948680351906158
f1_score: 0.9854756717501816

Class: P4
Precision: 1.0
recall: 1.0
f1_score: 1.0



In [24]:
import pickle

In [25]:
# Persist the fitted pipeline (preprocessing + classifier) to model.pkl.
# The context manager closes the file handle as soon as the dump finishes, so the data is
# flushed to disk and no open handle is left behind in the kernel.
# NOTE(review): the LabelEncoder `le` is not saved, so consumers of model.pkl receive
# integer-encoded predictions — confirm that is intended.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [27]:
# Reload the pipeline from model.pkl to verify that the saved artifact round-trips.
# The context manager guarantees the file handle is closed after reading.
# Security: pickle.load can execute arbitrary code — only load files you created or trust.
with open('model.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)

In [28]:
# Predict again with the current `pipe` object on the same test rows.
y_pred_test = pipe.predict(X=X_test)

In [29]:
# Accuracy of the predictions just recomputed; compare with the earlier accuracy cell.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.9957428693060877