In [1]:
import os 
import warnings
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns




In [2]:
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.model_selection import train_test_split 

from sklearn.linear_model import LinearRegression

In [3]:
# We are using the housing data.
# The training and testing splits were already saved to CSV in a previous exercise.

train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [4]:
# Preview the first two rows of the training frame.
train_data.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,5.485836,0.205784,3.168555,True,False,False,False
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,6.927083,0.160714,2.623698,False,False,False,True


In [6]:
import mlflow 
import mlflow.sklearn

In [187]:
# Point the MLflow client at the tracking server running locally on port 5000.

remote_server_uri  = 'http://127.0.0.1:5000'

mlflow.set_tracking_uri(remote_server_uri)

In [189]:
# Check whether the tracking URI has been set.

mlflow.tracking.get_tracking_uri()

'http://127.0.0.1:5000'

In [191]:
# Set the active MLflow experiment by name.
exp_name  = "MLflow-walkthrough"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='mlflow-artifacts:/899675162326840953', creation_time=1722543858183, experiment_id='899675162326840953', last_update_time=1722543858183, lifecycle_stage='active', name='MLflow-walkthrough', tags={}>

In [28]:
def load_data(train_data_path , test_data_path):
    """Read the train and test CSVs and split each into features and target.

    Parameters
    ----------
    train_data_path : str
        Path to the training CSV. Must contain a 'median_house_value' column.
    test_data_path : str
        Path to the test CSV. Must contain a 'median_house_value' column.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where each ``*_x`` is the frame
        with the target column dropped and each ``*_y`` is the target column.
    """
    target_column = 'median_house_value'

    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)

    train_x = train_data.drop(target_column , axis = 1)
    train_y = train_data[target_column]

    # The test split must come from test_data; it was previously derived from
    # train_data, which silently returned the training rows twice.
    test_x = test_data.drop(target_column , axis = 1)
    test_y = test_data[target_column]

    return train_x, train_y , test_x , test_y



def eval_metric(actual , pred):
    """Score predictions against the true values.

    Returns a ``(rmse, r2)`` pair: the square root of ``mean_squared_error``
    and the ``r2_score`` of ``pred`` with respect to ``actual``.
    """
    mse = mean_squared_error(actual , pred)
    root_mse = np.sqrt(mse)
    r_squared = r2_score(actual , pred)
    return root_mse , r_squared



    

In [67]:
def train_model():
    """Fit a LinearRegression on the housing data and record the run in MLflow.

    Within a single MLflow run named "Linear_Regression_Run" this logs:

    * params "Coeficient" and "Intercept" (the fitted ``coef_`` / ``intercept_``),
    * metrics "RMSe_loss" and "R2_score" computed on the training split
      (keys unchanged so earlier runs remain comparable),
    * metrics "test_RMSe_loss" and "test_R2_score" computed on the test split
      returned by ``load_data``,
    * the fitted model under the artifact path "model".
    """
    train_data_path = 'data/train.csv'
    test_data_path = 'data/test.csv'

    train_x , train_y , test_x , test_y = load_data(train_data_path  , test_data_path)

    run_name = "Linear_Regression_Run"

    with mlflow.start_run(run_name = run_name):

        lr = LinearRegression()
        lr.fit(train_x, train_y)

        # Training-split score: kept under the original metric names.
        train_rmse , train_r2 = eval_metric(train_y , lr.predict(train_x))
        # Test-split score: the test split was loaded but never evaluated before.
        test_rmse , test_r2 = eval_metric(test_y , lr.predict(test_x))

        mlflow.log_param("Coeficient" , lr.coef_)
        mlflow.log_param("Intercept" , lr.intercept_)

        mlflow.log_metric("RMSe_loss" , train_rmse)
        mlflow.log_metric("R2_score" , train_r2)
        mlflow.log_metric("test_RMSe_loss" , test_rmse)
        mlflow.log_metric("test_R2_score" , test_r2)

        mlflow.sklearn.log_model(lr, "model")
        
        

    

In [69]:
# Train the linear regression and log the run to MLflow.
train_model()

**Decision Tree**

In [81]:
from sklearn.tree import DecisionTreeRegressor


In [139]:
def train_tree_model(max_depth = None ):
    """Fit a DecisionTreeRegressor on the housing data and record the run in MLflow.

    Parameters
    ----------
    max_depth : int or None, default None
        Passed straight through to ``DecisionTreeRegressor(max_depth=...)`` and
        also embedded in the run name and the feature-importance file name.

    Within a single MLflow run this logs:

    * param "max_depth",
    * metrics "RMSe_loss" and "R2_score" computed on the training split
      (keys unchanged so earlier runs remain comparable),
    * metrics "test_RMSe_loss" and "test_R2_score" computed on the test split
      returned by ``load_data``,
    * the fitted model under the artifact path "model",
    * a CSV of feature importances, sorted in descending order, as an artifact.
    """
    train_data_path = 'data/train.csv'
    test_data_path = 'data/test.csv'

    train_x , train_y , test_x , test_y = load_data(train_data_path  , test_data_path)

    tree_model  = DecisionTreeRegressor(max_depth= max_depth )
    tree_model.fit(train_x , train_y)

    # Training-split score: kept under the original metric names.
    train_rmse , train_r2 = eval_metric(train_y , tree_model.predict(train_x))
    # Test-split score: the test split was loaded but never evaluated before.
    test_rmse , test_r2 = eval_metric(test_y , tree_model.predict(test_x))

    run_name  = "Decision_Tree_depth:{}".format(max_depth)

    feature_importance_df = pd.DataFrame(
        {"feature" : train_x.columns , "Importance" : tree_model.feature_importances_}
    ).sort_values(by="Importance", ascending=False)

    with mlflow.start_run(run_name = run_name):

        mlflow.log_param("max_depth" , max_depth)

        mlflow.log_metric("RMSe_loss" , train_rmse)
        mlflow.log_metric("R2_score" , train_r2)
        mlflow.log_metric("test_RMSe_loss" , test_rmse)
        mlflow.log_metric("test_R2_score" , test_r2)

        mlflow.sklearn.log_model(tree_model, "model")

        # NOTE(review): the ':' in this file name is not a valid character on
        # Windows filesystems; consider another separator if portability matters.
        feature_importance_file_path =  os.path.join(os.getcwd() , "feature_importance_depth:{}.csv".format(max_depth))
        feature_importance_df.to_csv(feature_importance_file_path, index =False)
        # Log the file that was just written. The previous code passed an
        # undefined name (feature_importance_csv_path), which raises NameError
        # on a fresh kernel.
        mlflow.log_artifact(feature_importance_file_path)
    
    

    

In [141]:
# Baseline run: default max_depth=None.
train_tree_model()

In [143]:
# Run with max_depth=20.
train_tree_model(20)

In [145]:
# Run with max_depth=30.
train_tree_model(30)

In [193]:
# Run with max_depth=32.
train_tree_model(32)