In [5]:
# Print the path of the Python interpreter that is executing this notebook.
import sys
print(sys.executable)

/usr/bin/python3


In [6]:
# Version of whatever `python` the shell resolves.
# NOTE(review): this may not be the same interpreter as sys.executable - confirm.
!python --version

Python 3.9.12


### Create functions for every step of the complete model-training lifecycle

In [7]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [8]:
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [9]:
# Load banking.csv through load_data() and preview the first rows.
data = load_data("banking.csv")
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


In [10]:
def data_cleaning(data):
    """Drop every row that contains a missing value.

    The per-column NaN counts are printed before and after the drop so the
    effect of the cleaning step is visible in the cell output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; ``dropna`` returns a new frame.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the rows that had at least one NaN.
    """
    def _report_missing(header, frame):
        # Header line first, then the NaN count of every column.
        print(header)
        print(frame.isna().sum())

    _report_missing("na values available in data \n", data)
    without_na = data.dropna()
    _report_missing("after droping na values \n", without_na)
    return without_na

In [11]:
def preprocessing(data):
    """Prepare the cleaned banking frame for modelling.

    Steps:
      1. Merge the ``basic.4y`` / ``basic.6y`` / ``basic.9y`` education levels
         into a single ``Basic`` level.
      2. One-hot encode the categorical columns; the original categorical
         columns are dropped and the indicator columns are appended in the
         order of ``cat_vars``.
      3. Replace ``.`` and spaces in column names with ``_``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``cat_vars`` below.
        The caller's frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame with the non-categorical columns followed by the indicator
        columns.

    Raises
    ------
    KeyError
        If one of the expected categorical columns is missing.
    """
    cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                'contact', 'month', 'day_of_week', 'poutcome']
    basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']

    # Work on a copy: assigning to data['education'] directly would silently
    # rewrite the frame the caller passed in.
    data = data.copy()
    data['education'] = data['education'].replace(basic_levels, 'Basic')

    # One call encodes all categorical columns and drops the originals; it does
    # not depend on index alignment the way a per-column join does.
    final_data = pd.get_dummies(data, columns=cat_vars)

    # regex=False: '.' must be treated as a literal dot on every pandas version.
    final_data.columns = (
        final_data.columns
        .str.replace('.', '_', regex=False)
        .str.replace(' ', '_', regex=False)
    )
    return final_data

In [12]:
def train_test_split(final_data):
    """Split the preprocessed frame into stratified train and test parts.

    Every column except ``y`` is a feature; the ``y`` column is kept as a
    one-column DataFrame and is also used for stratification. 30% of the rows
    go to the test part and the split is seeded with ``random_state=47``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Aliased because this wrapper has the same name as scikit-learn's helper.
    from sklearn.model_selection import train_test_split as _sk_split

    is_target = final_data.columns == 'y'
    features = final_data.loc[:, ~is_target]
    target = final_data.loc[:, is_target]

    X_train, X_test, y_train, y_test = _sk_split(
        features,
        target,
        test_size=0.3,
        stratify=target,
        random_state=47,
    )
    return X_train, X_test, y_train, y_test

In [13]:
def over_sampling_target_class(X_train, y_train):
    """Oversample the training data with SMOTE and print the class balance.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; its column labels are reused for the result.
    y_train
        Training target passed to ``SMOTE.fit_resample`` together with
        ``X_train``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` where ``X_resampled`` is a DataFrame
        and ``y_resampled`` is the ``'y'`` column of the resampled target.
    """
    from imblearn.over_sampling import SMOTE

    feature_names = X_train.columns
    sampler = SMOTE(random_state=0)
    resampled_X, resampled_y = sampler.fit_resample(X_train, y_train)

    # Re-wrap both parts as DataFrames with explicit column labels.
    resampled_X = pd.DataFrame(data=resampled_X, columns=feature_names)
    resampled_y = pd.DataFrame(data=resampled_y, columns=['y'])

    # Class counts after resampling.
    total = len(resampled_X)
    n_negative = len(resampled_y[resampled_y['y'] == 0])
    n_positive = len(resampled_y[resampled_y['y'] == 1])

    print("length of oversampled data is ", total)
    print("Number of no subscription in oversampled data", n_negative)
    print("Number of subscription", n_positive)
    print("Proportion of no subscription data in oversampled data is ", n_negative / total)
    print("Proportion of subscription data in oversampled data is ", n_positive / total)

    return resampled_X, resampled_y['y']

In [14]:
def training_basic_classifier(X_train, y_train, n_estimators=101, random_state=None):
    """Fit a random-forest classifier on the training data.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed straight to ``fit``.
    n_estimators : int, default 101
        Number of trees. The default is the value that used to be hard-coded.
    random_state : int or None, default None
        Seed forwarded to the forest. ``None`` keeps the previous unseeded
        behaviour; pass an integer to make the fitted model reproducible.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
    model.fit(X_train, y_train)

    return model

In [15]:
def predict_on_test_data(model, X_test):
    """Return ``model.predict(X_test)``: the model's predictions for ``X_test``."""
    return model.predict(X_test)

In [16]:
def get_metrics(y_true, y_pred, y_pred_prob):
    """Compute classification metrics, each rounded to two decimals.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, used for accuracy, precision and recall.
    y_pred_prob
        Predicted probabilities, used for the log loss.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'entropy'``
        (the log loss), in that order.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

    raw_scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'entropy': log_loss(y_true, y_pred_prob),
    }
    return {name: round(value, 2) for name, value in raw_scores.items()}

In [17]:
# Read banking.csv again; `data` is rebound to the freshly loaded frame.
data = load_data('banking.csv')

In [18]:
# Print per-column NaN counts and drop every row that contains a NaN.
cleaned_data = data_cleaning(data)

na values available in data 

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64
after droping na values 

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64


In [19]:
# Merge the basic.* education levels, one-hot encode the categorical columns
# and replace '.' / ' ' in column names with '_'.
final_data = preprocessing(cleaned_data)

In [20]:
# Stratified 70/30 split. This calls the notebook's own train_test_split()
# wrapper defined above, which in turn calls scikit-learn's function.
X_train, X_test, y_train, y_test = train_test_split(final_data)

In [21]:
# SMOTE-oversample the training split only; X_train / y_train are rebound to
# the resampled data, X_test / y_test are left as they are.
X_train, y_train = over_sampling_target_class(X_train, y_train)

length of oversampled data is  51166
Number of no subscription in oversampled data 25583
Number of subscription 25583
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


In [22]:
# Fit the baseline random forest on the oversampled training data.
model = training_basic_classifier(X_train,y_train)

In [23]:
# Class predictions of the baseline model for the test split.
y_pred = predict_on_test_data(model,X_test)

In [24]:
# Display the predicted labels.
y_pred

array([0, 0, 0, ..., 0, 0, 0])

### MLflow work starts here

In [None]:
# import mlflow
# NOTE: this SQLite-backed tracking setup did not work here, so it is left commented out.

# mlflow.set_tracking_uri("sqlite:///mlflow.db")
# mlflow.set_experiment("practice")

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,log_loss

In [28]:
import mlflow

# Point the MLflow client at the tracking server on this machine (port 1234).
# 0.0.0.0 is a listen/bind address rather than a destination: connecting to it
# only works on some platforms, so the loopback address is used instead.
mlflow.set_tracking_uri("http://127.0.0.1:1234")
# Select the "practice" experiment for the runs below.
mlflow.set_experiment("practice")

<Experiment: artifact_location='/home/programmer/artifacts/4', creation_time=1685768079068, experiment_id='4', last_update_time=1685768079068, lifecycle_stage='active', name='practice', tags={}>

In [None]:
def run1(n_est=80):
    '''Basic MLflow practice: train a random forest and log one param and one metric.

    Uses the notebook-level X_train / y_train / X_test / y_test splits.

    Parameters
    ----------
    n_est : int, default 80
        Number of trees; logged as the "n_estimators" param. The default is
        the value that used to be hard-coded.
    '''
    with mlflow.start_run():
        model = RandomForestClassifier(n_estimators=n_est)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acu = accuracy_score(y_test, y_pred)

        mlflow.log_param("n_estimators", n_est)
        # "acu" is the test-set accuracy.
        mlflow.log_metric("acu", acu)

In [39]:
#run1()

In [None]:
def run2(n_est=100, run_name="Random Forest Classifier"):
    '''Same as the basic run, but the MLflow run gets an explicit name.

    Uses the notebook-level X_train / y_train / X_test / y_test splits.

    Parameters
    ----------
    n_est : int, default 100
        Number of trees; logged as the "n_estimators" param.
    run_name : str, default "Random Forest Classifier"
        Name given to the MLflow run.
    '''
    # The run object was never used, so it is no longer bound to a name.
    with mlflow.start_run(run_name=run_name):
        model = RandomForestClassifier(n_estimators=n_est)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acu = accuracy_score(y_test, y_pred)

        mlflow.log_param("n_estimators", n_est)
        # "acu" is the test-set accuracy.
        mlflow.log_metric("acu", acu)

In [None]:
#run2()

**Hyperparameter tuning**

In [52]:
from sklearn.model_selection import RandomizedSearchCV

**To customize logging, use `mlflow.autolog()`. This function provides configuration parameters to enable model logging (`log_models`), collect input examples (`log_input_examples`), configure warnings (`silent`), and more.**

In [58]:
def run3(n_iter=2, random_state=None):
    """Tune a random forest with a randomized search and log the result to MLflow.

    Uses the notebook-level X_train / y_train / X_test / y_test splits. The
    best hyperparameters are logged as run params and the test-set accuracy of
    the best model is logged as the "acu" metric.

    Parameters
    ----------
    n_iter : int, default 2
        Number of hyperparameter combinations sampled by the search.
    random_state : int or None, default None
        Seed forwarded to both the search and the forest. ``None`` keeps the
        previous unseeded behaviour.
    """
    # Candidate hyperparameter values.
    # NOTE(review): the 'log_loss' criterion is only accepted by newer
    # scikit-learn releases - confirm the installed version supports it.
    grid = {"n_estimators": [10, 100, 200, 500],
            'criterion': ['gini', 'entropy', 'log_loss'],
            "max_depth": [None, 5, 10, 20, 30],
            "min_samples_split": [2, 4, 6],
            "min_samples_leaf": [1, 2, 4]
           }

    with mlflow.start_run(run_name="RF-Classifier-Params"):
        rs_RFC = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=random_state),
                                    param_distributions=grid,
                                    n_iter=n_iter,
                                    verbose=True,
                                    random_state=random_state)

        rs_RFC.fit(X_train, y_train)

        # Log every selected hyperparameter in one call.
        mlflow.log_params(rs_RFC.best_params_)

        # refit defaults to True, so the search has already retrained the best
        # configuration on all of X_train; training a second forest with the
        # same parameters would only repeat that work.
        model = rs_RFC.best_estimator_
        y_pred = model.predict(X_test)
        acu = accuracy_score(y_test, y_pred)

        mlflow.log_metric("acu", acu)

In [59]:
# Execute the hyperparameter-tuning run defined above.
run3()

Fitting 5 folds for each of 2 candidates, totalling 10 fits


