In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.3->mlflow)
  Downloading databricks_sdk-0.49.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 k

In [2]:
!mlflow

Usage: mlflow [OPTIONS] COMMAND [ARGS]...

Options:
  --version  Show the version and exit.
  --help     Show this message and exit.

Commands:
  artifacts    Upload, list, and download artifacts from an MLflow...
  db           Commands for managing an MLflow tracking database.
  deployments  Deploy MLflow models to custom targets.
  doctor       Prints out useful information for debugging issues with MLflow.
  experiments  Manage experiments.
  gc           Permanently delete runs in the `deleted` lifecycle stage.
  models       Deploy MLflow models locally.
  recipes      MLflow Recipes is deprecated and will be removed in MLflow...
  run          Run an MLflow project from the given URI.
  runs         Manage runs.
  sagemaker    Serve models on SageMaker.
  server       Run the MLflow tracking server.


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import mlflow
import mlflow.sklearn

In [4]:
data=pd.read_csv(r'/content/train.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
data=data.drop(['Loan_ID'],axis=1)
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
data['Dependents'].unique()
data['Credit_History'].unique()

array([ 1.,  0., nan])

In [8]:
data['Dependents']=data['Dependents'].replace('3+','3')
data['Dependents'].unique()

array(['0', '1', '2', '3', nan], dtype=object)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [10]:
#changing the data type of some columns
data['Loan_Amount_Term']=data['Loan_Amount_Term'].astype(str)
data['Credit_History']=data['Credit_History'].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   614 non-null    object 
 9   Credit_History     614 non-null    object 
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(2), int64(1), object(9)
memory usage: 57.7+ KB


In [11]:
data.isnull().sum()

Unnamed: 0,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,0
Credit_History,0


In [12]:
# filling the null values
from sklearn.impute import SimpleImputer
for i in data.columns:
  if data[i].dtypes=='object':
    imputer=SimpleImputer(strategy='most_frequent')
    data[i]=imputer.fit_transform(data[[i]]).ravel()
  else:
    imputer=SimpleImputer(strategy='mean')
    data[i]=imputer.fit_transform(data[[i]]).ravel()

# data[[i]] gives a 2D DataFrame (required by SimpleImputer)

# .ravel() converts the output back to a 1D array to fit into the column properly

# imputer=SimpleImputer(strategy='most_frequent')
# data['Gender']=imputer.fit_transform(data[['Gender']])
# data['Married']=imputer.fit_transform(data[['Married']])
data.isnull().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0


In [13]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban,Y


In [14]:
# Encoding and normalization
cat=data.select_dtypes(include='object').columns
num=data.select_dtypes(exclude='object').columns
for i in data.columns:
  if i in cat:
    unique=data[i].unique()
    k=0
    d={}
    for j in unique:
      d[j]=k
      k+=1
    data[i]=data[i].map(d)
  elif i in num:
    mean=data[i].mean()
    sd=data[i].std()
    x=[]
    for j in data[i]:
      z=(j-mean)/sd
      x.append(z)
    data[i]=x

data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,0,0,0.072931,-0.554036,0.0,0,0,0,0
1,0,1,1,0,0,-0.134302,-0.0387,-0.219095,0,0,1,1
2,0,1,0,0,1,-0.393427,-0.554036,-0.956861,0,0,0,0
3,0,1,0,1,0,-0.461686,0.251774,-0.31429,0,0,0,0
4,0,0,0,0,0,0.097649,-0.554036,-0.064402,0,0,0,0


##**Start MLflow**

In [15]:
mlflow.set_experiment('/mlops/Loan_status_dt')

2025/04/08 09:14:20 INFO mlflow.tracking.fluent: Experiment with name '/mlops/Loan_status_dt' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/317353410587823838', creation_time=1744103660291, experiment_id='317353410587823838', last_update_time=1744103660291, lifecycle_stage='active', name='/mlops/Loan_status_dt', tags={}>

In [16]:
print(mlflow.set_experiment('/mlops/Loan_status_dt').experiment_id)
print(mlflow.set_experiment('/mlops/Loan_status_dt').lifecycle_stage)
print(mlflow.set_experiment('/mlops/Loan_status_dt').name)

317353410587823838
active
/mlops/Loan_status_dt


In [19]:
def train_tree(criterion, max_depth, min_samples_split):
  #splitting the train test split
  X=data.drop(['Loan_Status'],axis=1)
  y=data['Loan_Status']
  X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
  X_train.shape,y_train.shape,X_test.shape,y_test.shape


  crt={0:'gini',1:'entropy'}
   # initiate mlflow
  with mlflow.start_run(experiment_id=317353410587823838,run_name='Classification',description='Decision Tree Classifier'):
    # Calling the model
    dt=DecisionTreeClassifier(criterion=crt[criterion],max_depth=max_depth,min_samples_split=min_samples_split)
    dt.fit(X_train,y_train)

    # prediction
    y_pred=dt.predict(X_test)

    # Evaluation
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred)
    recall=recall_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)

    print('accuracy',accuracy)
    print('precision',precision)
    print('recall',recall)
    print('f1',f1)

    #log metrices ,parameters
    mlflow.log_param('criterion',crt[criterion])
    mlflow.log_param('max_depth',max_depth)
    mlflow.log_param('min_samples_split',min_samples_split)

    mlflow.log_metric('accuracy',accuracy)
    mlflow.log_metric('precision',precision)
    mlflow.log_metric('recall',recall)
    mlflow.log_metric('f1',f1)

    mlflow.sklearn.log_model(dt,'model',registered_model_name='DecisionTreeClassifier')



In [20]:
#Version 1
criterion=eval(input('enter 0 for gini,1 for entropy:'))
max_depth=eval(input('enter the maximum depth of the tree:'))
min_samples_split=eval(input('enter the minimum number of samples required to split an internal node:'))
train_tree(criterion,max_depth,min_samples_split)

enter 0 for gini,1 for entropy:0
enter the maximum depth of the tree:3
enter the minimum number of samples required to split an internal node:2
accuracy 0.7642276422764228
precision 0.8181818181818182
recall 0.4186046511627907
f1 0.5538461538461539


Successfully registered model 'DecisionTreeClassifier'.
Created version '1' of model 'DecisionTreeClassifier'.


In [21]:
#Version 2
criterion=eval(input('enter 0 for gini,1 for entropy:'))
max_depth=eval(input('enter the maximum depth of the tree:'))
min_samples_split=eval(input('enter the minimum number of samples required to split an internal node:'))
train_tree(criterion,max_depth,min_samples_split)

enter 0 for gini,1 for entropy:0
enter the maximum depth of the tree:5
enter the minimum number of samples required to split an internal node:3
accuracy 0.7560975609756098
precision 0.8421052631578947
recall 0.37209302325581395
f1 0.5161290322580645


Registered model 'DecisionTreeClassifier' already exists. Creating a new version of this model...
Created version '2' of model 'DecisionTreeClassifier'.


**Creating a Tunnel**

In [22]:
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [23]:
from pyngrok import ngrok
# if any open tunnels are there then close that tunnel
ngrok.kill()

auth_token='2qVdKwzryIkN214CK81f6q1byfO_7bN1ouSyzNqxQf2tKM2Fi'
ngrok.set_auth_token(auth_token)
ngrok_tunnel=ngrok.connect(addr='5000',proto='http')
print('Tracking uri',ngrok_tunnel.public_url)

Tracking uri https://83dc-34-106-47-159.ngrok-free.app


In [25]:
!mlflow ui

[2025-04-08 09:20:33 +0000] [2849] [INFO] Starting gunicorn 23.0.0
[2025-04-08 09:20:33 +0000] [2849] [INFO] Listening at: http://127.0.0.1:5000 (2849)
[2025-04-08 09:20:33 +0000] [2849] [INFO] Using worker: sync
[2025-04-08 09:20:33 +0000] [2850] [INFO] Booting worker with pid: 2850
[2025-04-08 09:20:33 +0000] [2851] [INFO] Booting worker with pid: 2851
[2025-04-08 09:20:33 +0000] [2852] [INFO] Booting worker with pid: 2852
[2025-04-08 09:20:33 +0000] [2853] [INFO] Booting worker with pid: 2853
[2025-04-08 09:23:25 +0000] [2849] [INFO] Handling signal: int

Aborted!
[2025-04-08 09:23:25 +0000] [2853] [INFO] Worker exiting (pid: 2853)
[2025-04-08 09:23:25 +0000] [2851] [INFO] Worker exiting (pid: 2851)
[2025-04-08 09:23:25 +0000] [2852] [INFO] Worker exiting (pid: 2852)
[2025-04-08 09:23:25 +0000] [2850] [INFO] Worker exiting (pid: 2850)
[2025-04-08 09:23:27 +0000] [2849] [INFO] Shutting down: Master


#**Prediction on test dataset**

In [41]:
#Encoding the test datasets
data=pd.read_csv(r'/content/test.csv')
id=data['Loan_ID']
data=data.drop(['Loan_ID'],axis=1)
cat=data.select_dtypes(include='object').columns
num=data.select_dtypes(exclude='object').columns
for i in data.columns:
  if i in cat:
    unique=data[i].unique()
    k=0
    d={}
    for j in unique:
      d[j]=k
      k+=1
    data[i]=data[i].map(d)
  elif i in num:
    mean=data[i].mean()
    sd=data[i].std()
    x=[]
    for j in data[i]:
      z=(j-mean)/sd
      x.append(z)
    data[i]=x

data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0,0,0,0,0,0.186206,-0.672417,-0.425844,0.26801,0.459177,0
1,0,0,1,0,0,-0.352211,-0.029808,-0.165116,0.26801,0.459177,0
2,0,0,2,0,0,0.039587,0.098714,1.171115,0.26801,0.459177,0
3,0,0,2,0,0,-0.502089,0.418306,-0.588799,0.26801,,0
4,0,1,0,1,0,-0.311484,-0.672417,-0.947299,0.26801,0.459177,0


In [45]:
import mlflow
logged_model = 'runs:/e322709e01a74307a449ec76e4c5cf92/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
l=loaded_model.predict(pd.DataFrame(data))
l

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [59]:
li=['Y' if l[i]==0 else 'N' for i in range(len(l))]
status=pd.DataFrame(li,columns=['Loan_Status'])
result = pd.concat([id,status], axis=1)
result

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y


# Developing a UI for loan Status

In [64]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [74]:
%%writefile app.py
import streamlit as st
import mlflow
import pandas as pd
st.title('Loan_prediction using MLflow')
file=st.file_uploader('Upload a CSV or excel file',type=['csv','xlsx'])

if file is not None:
  #Encoding the test datasets
  try:
        if file.name.endswith('.csv'):
            data = pd.read_csv(file)
        else:
            data = pd.read_excel(file)
        id=data['Loan_ID']
        data=data.drop(['Loan_ID'],axis=1)
        cat=data.select_dtypes(include='object').columns
        num=data.select_dtypes(exclude='object').columns
        for i in data.columns:
          if i in cat:
            unique=data[i].unique()
            k=0
            d={}
            for j in unique:
              d[j]=k
              k+=1
            data[i]=data[i].map(d)
          elif i in num:
            mean=data[i].mean()
            sd=data[i].std()
            x=[]
            for j in data[i]:
              z=(j-mean)/sd
              x.append(z)
            data[i]=x


        logged_model = 'runs:/e322709e01a74307a449ec76e4c5cf92/model'

        # Load model as a PyFuncModel.
        loaded_model = mlflow.pyfunc.load_model(logged_model)

        # Predict on a Pandas DataFrame.
        l=loaded_model.predict(data)
        li=['Y' if l[i]==0 else 'N' for i in range(len(l))]
        status=pd.DataFrame(li,columns=['Loan_Status'])
        result = pd.concat([id,status], axis=1)
        st.subheader("Prediction Results")
        st.dataframe(result)

  except Exception as e:
        st.error(f"An error occurred: {e}")

Overwriting app.py


In [75]:
from pyngrok import ngrok
outh_token='2qVdKwzryIkN214CK81f6q1byfO_7bN1ouSyzNqxQf2tKM2Fi' #we will not provide any token
ngrok.set_auth_token(outh_token)

#create the tunnel
ngrok_tunnel=ngrok.connect(addr='5000',proto='http')
print('Tracking uri:',ngrok_tunnel.public_url)

!streamlit run --server.port 5000 app.py

Tracking uri: https://bf7a-34-106-47-159.ngrok-free.app

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:5000[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:5000[0m
[34m  External URL: [0m[1mhttp://34.106.47.159:5000[0m
[0m




[34m  Stopping...[0m
[34m  Stopping...[0m
