In [1]:
import configparser
import os

import numpy as np
import pandas as pd

In [2]:
from Adjuct_Memory import memoryManager
from Scripts.Final_script import *

In [5]:
# Load the project settings. ConfigParser.read() silently skips files it cannot
# open and returns only the names it actually parsed, so check that list here
# instead of failing later with an opaque KeyError on a missing section.
config = configparser.ConfigParser()
loaded_config_files = config.read(r'Config/config.ini')
if not loaded_config_files:
    raise FileNotFoundError("Could not read configuration file 'Config/config.ini'")
loaded_config_files

['Config/config.ini']

In [6]:
def _dataset_path(section):
    """Return the CSV path built from a config section's ``file_path`` and ``file_name`` keys.

    os.path.join inserts the separator of the current platform, so the notebook
    no longer depends on a hard-coded Windows backslash.
    """
    return os.path.join(config[section]['file_path'], config[section]['file_name'])


# Load the training and testing datasets from the locations named in the config file.
data = pd.read_csv(_dataset_path('Training_Data'))
data_t = pd.read_csv(_dataset_path('Testing_Data'))
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# Dropping unwanted data from both train and test dataset

In [7]:
# 'Unnamed: 0' is only a running row number carried over from the CSV files,
# not a feature, so remove it from both frames. Rebinding the names (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed column.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data_t = data_t.drop(columns=['Unnamed: 0'], errors='ignore')
data.shape,data_t.shape

((150000, 11), (101503, 11))

# Reducing the memory footprint of the datasets

In [8]:
# Pass both frames through the project's memory helper: the training frame
# first, then the testing frame, matching the order of the printed reports.
train_data, test_data = [
    memoryManager.reduce_mem_usage(frame) for frame in (data, data_t)
]

Memory usage of dataframe is 12.59 MB
Memory usage after optimization is: 2.72 MB
Decreased by 78.4%
Memory usage of dataframe is 8.52 MB
Memory usage after optimization is: 2.52 MB
Decreased by 70.5%


In [13]:
# Run the shared preparation step on the training frame first, then pick the
# target column from the frame and glue the two back together column-wise.
prepared_train_features = dataPreparation(train_data)
train_target = train_data[['SeriousDlqin2yrs']]
train_data = pd.concat([prepared_train_features, train_target], axis=1)

In [66]:
# Inspect the column names of the prepared training frame.
train_data.columns

Index(['RevolvingUtilizationOfUnsecuredLines', 'Weighted_Delay_sum', 'age_cat',
       'MonthlyIncome', 'DebtRatio', 'NumberOfDependentsTR',
       'NumberRealEstateLoansOrLinesTR', 'NumberOfOpenCreditLinesAndLoansTR',
       'SeriousDlqin2yrs'],
      dtype='object')

In [18]:
# Display the prepared training frame (the rendered output is truncated for long frames).
train_data

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,Weighted_Delay_sum,age_cat,MonthlyIncome,DebtRatio,NumberOfDependentsTR,NumberRealEstateLoansOrLinesTR,NumberOfOpenCreditLinesAndLoansTR,SeriousDlqin2yrs
0,0.766113,0.4,3,9120.0,0.802982,2.0,6,13,1
1,0.957031,0.0,3,2600.0,0.121876,1.0,0,4,0
2,0.658203,0.7,3,3042.0,0.085113,0.0,0,2,0
3,0.233765,0.0,2,3300.0,0.036050,0.0,0,5,0
4,0.907227,0.2,3,63588.0,0.024926,0.0,1,7,0
...,...,...,...,...,...,...,...,...,...
149995,0.040680,0.0,4,2100.0,0.225131,0.0,1,4,0
149996,0.299805,0.0,3,5584.0,0.716562,2.0,1,4,0
149997,0.246094,0.0,3,,3870.000000,0.0,1,18,0
149998,0.000000,0.0,2,5716.0,0.000000,0.0,0,4,0


In [14]:
# Apply the same dataPreparation step to the testing frame. The result is bound
# back to `test_data`, so re-running this cell passes the already-prepared
# frame into dataPreparation again.
test_data = dataPreparation(test_data)

# Modelling

In [16]:
import pickle
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

In [20]:
# Separate the label column from the predictors of the training frame.
y = train_data['SeriousDlqin2yrs']
X = train_data.drop(columns=['SeriousDlqin2yrs'])

# The prepared testing frame is used as-is.
Xtest = test_data

In [21]:
# Learn the scaling statistics from the training predictors only, then apply
# the fitted scaler to both the training and the testing predictors.
sc = StandardScaler().fit(X)
X_transformed = sc.transform(X)
Xtest_transformed = sc.transform(Xtest)

In [22]:
# Sanity check: rows and columns of the scaled training predictors.
X_transformed.shape

(150000, 8)

In [23]:
# Sanity check: rows and columns of the scaled testing predictors.
Xtest_transformed.shape

(101503, 8)

In [24]:
# LightGBM classifier with a fixed seed; class 1 is weighted 0.55 and class 0 is weighted 0.45.
model_lgbm = LGBMClassifier(
    random_state=0,
    class_weight={1: 0.55, 0: 0.45},
)

In [25]:
# Train on the scaled training predictors; the value returned by fit() is bound back to `model_lgbm`.
model_lgbm = model_lgbm.fit(X_transformed,y)

In [26]:
# Probability estimates for the scaled testing predictors (one column per class in the output).
model_lgbm_pred = model_lgbm.predict_proba(Xtest_transformed)
model_lgbm_pred

array([[0.9278189 , 0.0721811 ],
       [0.93678992, 0.06321008],
       [0.98168607, 0.01831393],
       ...,
       [0.99446509, 0.00553491],
       [0.89371484, 0.10628516],
       [0.9513848 , 0.0486152 ]])

# Modelling using Pipeline

In [30]:
# Bundle scaling and the LightGBM classifier into a single estimator so the
# unscaled frames can be passed in directly.
pipe = make_pipeline(
    StandardScaler(),
    LGBMClassifier(random_state=0, class_weight={1: 0.55, 0: 0.45}),
)

In [31]:
# Fit the whole pipeline on the unscaled training predictors; the value returned by fit() is kept as `pipe_model`.
pipe_model = pipe.fit(X,y)

In [32]:
# Probability estimates from the pipeline for the unscaled testing predictors.
# Note that the call goes through `pipe`, not `pipe_model`.
pipe_model_pred = pipe.predict_proba(Xtest)
pipe_model_pred

array([[0.9278189 , 0.0721811 ],
       [0.93678992, 0.06321008],
       [0.98168607, 0.01831393],
       ...,
       [0.99446509, 0.00553491],
       [0.89371484, 0.10628516],
       [0.9513848 , 0.0486152 ]])

In [33]:
# Persist the fitted pipeline to disk.
# - os.path.join replaces the hard-coded backslash, which on non-Windows systems
#   would create a file literally named 'Model\finalized_model.sav'.
# - The target directory is created if it does not exist yet.
# - The context manager closes the file handle even if pickling fails; the
#   original left the handle returned by open() unclosed.
filename = os.path.join('Model', 'finalized_model.sav')
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as model_file:
    pickle.dump(pipe_model, model_file)

# Modelling with MLflow (MLOps)

In [39]:
import mlflow

In [40]:
# Point MLflow at the tracking server on localhost port 5000, then echo the URI now in effect.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [42]:
# List the experiments known to the tracking server.
# NOTE(review): list_experiments() appears to have been removed in MLflow 2.0 in
# favour of search_experiments() — confirm against the MLflow version in use.
mlflow.list_experiments()

[<Experiment: artifact_location='artifact-root/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>]

In [44]:
from sklearn.metrics import accuracy_score,f1_score

# Runs started below are recorded under this experiment.
mlflow.set_experiment("Project_CreditScoring")

with mlflow.start_run():
    # Same scaler + LightGBM pipeline as before, refit inside the active run.
    pipe = make_pipeline(
        StandardScaler(),
        LGBMClassifier(random_state=0, class_weight={1: 0.55, 0: 0.45}),
    )
    pipe_model = pipe.fit(X, y)

    # Hard class labels for the testing predictors.
    y_pred = pipe_model.predict(Xtest)

    # Store the fitted pipeline as a run artifact under "models" and report
    # where the run keeps its artifacts.
    mlflow.sklearn.log_model(pipe_model, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



default artifacts URI: 'artifact-root/1/e8c1f5a3a43e4310884d0591daae4f57/artifacts'


### Interacting with the model registry

In [45]:
from mlflow.tracking import MlflowClient

# Build the client from the tracking URI configured earlier in the notebook
# instead of repeating the address, so the two can never drift apart.
client = MlflowClient(mlflow.get_tracking_uri())

In [46]:
# Show the models currently in the registry. search_registered_models() without
# a filter returns the registered models and replaces the deprecated
# list_registered_models() call.
client.search_registered_models()

[]

In [51]:
# Show the metadata of the most recent run of the project experiment.
# The experiment is looked up by name rather than by a hard-coded id, and
# search_runs() (newest run first by default) replaces the deprecated
# list_run_infos() call.
experiment = client.get_experiment_by_name("Project_CreditScoring")
client.search_runs(experiment_ids=[experiment.experiment_id], max_results=1)[0].info

<RunInfo: artifact_uri='artifact-root/1/e8c1f5a3a43e4310884d0591daae4f57/artifacts', end_time=1670764605032, experiment_id='1', lifecycle_stage='active', run_id='e8c1f5a3a43e4310884d0591daae4f57', run_uuid='e8c1f5a3a43e4310884d0591daae4f57', start_time=1670764582311, status='FINISHED', user_id='meet9'>

In [48]:
# Id of the most recent run of the project experiment.
# The experiment is resolved by name instead of the hard-coded id '1', which is
# only valid on a tracking server where this happened to be the first
# experiment created. search_runs() returns the newest run first by default and
# replaces the deprecated list_run_infos() call.
experiment = client.get_experiment_by_name("Project_CreditScoring")
latest_runs = client.search_runs(experiment_ids=[experiment.experiment_id], max_results=1)
run_id = latest_runs[0].info.run_id
run_id

'e8c1f5a3a43e4310884d0591daae4f57'

In [49]:
# Register the "models" artifact of the selected run in the model registry
# under the name 'PredictProbofDefaulters'.
mlflow.register_model(f"runs:/{run_id}/models", 'PredictProbofDefaulters')

Successfully registered model 'PredictProbofDefaulters'.
2022/12/11 18:48:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: PredictProbofDefaulters, version 1
Created version '1' of model 'PredictProbofDefaulters'.


<ModelVersion: creation_timestamp=1670764717650, current_stage='None', description='', last_updated_timestamp=1670764717650, name='PredictProbofDefaulters', run_id='e8c1f5a3a43e4310884d0591daae4f57', run_link='', source='artifact-root/1/e8c1f5a3a43e4310884d0591daae4f57/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [59]:
# Build the run URI from `run_id` (set in an earlier cell) instead of pasting
# the id in as a literal: after a re-run the literal would silently point at a
# stale run, or at nothing on another tracking server.
logged_model = f'runs:/{run_id}/models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame and count how often each class is predicted.
np.unique(loaded_model.predict(pd.DataFrame(Xtest)),return_counts=True)

(array([0, 1], dtype=int8), array([98624,  2879], dtype=int64))

In [58]:
# Predicted-class counts from the in-memory pipeline, for comparison with the counts from the reloaded model.
np.unique(pipe.predict(Xtest),return_counts=True)

(array([0, 1], dtype=int8), array([98624,  2879], dtype=int64))

# Alternative: loading the model directly from its artifact path

In [62]:
# Derive the artifact location of the logged model from the run's metadata
# instead of hard-coding a path that embeds one specific experiment id and run id.
model_path = f"{client.get_run(run_id).info.artifact_uri}/models"

In [65]:
# Reload the model from its artifact location, predict on the testing frame,
# and tally how often each class is predicted.
loaded_model = mlflow.pyfunc.load_model(model_path)
predicted_labels = loaded_model.predict(pd.DataFrame(Xtest))
np.unique(predicted_labels, return_counts=True)

(array([0, 1], dtype=int8), array([98624,  2879], dtype=int64))