# IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn

import os
import tarfile
import urllib

import time
import warnings

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

In [3]:
# Keep cell output readable: silence every warning raised from here on.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(42)

# Pandas display limits applied to every frame rendered in this notebook.
for _option_name, _option_value in (
    ("display.max_columns", 30),
    ("display.max_rows", 100),
    ("display.max_colwidth", 70),
):
    pd.set_option(_option_name, _option_value)

%matplotlib inline

In [4]:
# Start a tracking server before running this cell, e.g.:
#   mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0 --port 5000
#
# 0.0.0.0 in that command is the address the *server* binds to (all interfaces).
# It is not a portable address for a client to connect to, so the client side
# defaults to the loopback interface and can be overridden through the
# MLFLOW_TRACKING_URI environment variable.
remote_server_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
mlflow.set_tracking_uri(remote_server_uri)

# All runs started below are grouped under this experiment.
exp_name = "ML_Housing"
mlflow.set_experiment(exp_name)

2022/01/31 10:14:26 INFO mlflow.tracking.fluent: Experiment with name 'ML_Housing' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlruns/6', experiment_id='6', lifecycle_stage='active', name='ML_Housing', tags={}>

# DATA

## LOADING DATA

In [5]:
def load_raw_data( 
    housing_url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz", 
    housing_path = os.path.join("datasets", "housing")
):
    """Download the housing archive, extract it and load ``housing.csv``.

    The extracted CSV is also logged as an artifact of the active MLflow run.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives the archive and its extracted content; it is
        created when missing.

    Returns
    -------
    pandas.DataFrame
        Content of ``housing.csv`` found in ``housing_path`` after extraction.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # Fetch the archive.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape ``housing_path`` or carry unsafe metadata.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters without extraction filters.
            housing_tgz.extractall(path=housing_path)

    # Load the data as a DataFrame.
    csv_path = os.path.join(housing_path, "housing.csv")
    df = pd.read_csv(csv_path)

    mlflow.log_artifact(csv_path)

    return df

## TRAIN TEST SPLIT

In [6]:
def train_test(df):
    """Split ``df`` into stratified training and test sets (80/20).

    Rows are stratified on an income category obtained by binning
    ``median_income`` into five buckets. The sizes of both sets are logged as
    metrics on the active MLflow run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``median_income`` column. It is left unmodified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(strat_train_set, strat_test_set)``, each an independent copy with
        the same columns as ``df``.
    """
    # The stratification key lives in a standalone Series so the caller's
    # frame is not polluted with a temporary column.
    income_cat = pd.cut(
        df["median_income"], 
        bins=[0., 1.5, 3.0, 4.5, 6., np.inf], 
        labels=[1, 2, 3, 4, 5]
    )

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1: the splitter yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(df, income_cat))

    # The splitter returns positions, not labels, so select with .iloc; this
    # stays correct when ``df`` does not carry a default 0..n-1 index.
    strat_train_set = df.iloc[train_index].copy()
    strat_test_set = df.iloc[test_index].copy()

    mlflow.log_metric("training_nrows", len(strat_train_set))
    mlflow.log_metric("test_nrows", len(strat_test_set))

    return strat_train_set, strat_test_set

## DATA DESCRIPTION

In [7]:
def data_description(df, train):
    """Write a plain-text summary of the data and log it as an MLflow artifact.

    The summary is written to ``data_description.txt`` in the current working
    directory and contains the ``ocean_proximity`` value counts of ``df`` plus
    the columns, shape, ``describe()`` output and the correlation of every
    numeric column with ``median_house_value`` for ``train``.

    Parameters
    ----------
    df : pandas.DataFrame
        Original frame; must contain an ``ocean_proximity`` column.
    train : pandas.DataFrame
        Training frame; must contain a numeric ``median_house_value`` column.
    """
    # Correlation is only defined for numeric data. Selecting the numeric
    # columns explicitly avoids the error recent pandas versions raise when
    # ``corr()`` meets a string column, and works on older versions as well.
    price_correlation = (
        train.select_dtypes(include="number").corr()["median_house_value"].round(2)
    )

    #creating a file containing data description
    with open("data_description.txt", 'w') as f:
        f.write(f"""
        'Original DataFrame'
        Value count of ocean proximity:
        {df["ocean_proximity"].value_counts()}
        
        'Training data'
        Feature: {list(train.columns)}
        Shape: {train.shape}
        
        Data description:
        {train.describe()}
        
        Correlation:
        {price_correlation}
        """)
        
    mlflow.log_artifact("data_description.txt")

## FINAL FUNCTION

In [8]:
def data_loading():
    """Load the raw data, split it and describe it inside one nested MLflow run.

    Every artifact and metric produced by the three steps is recorded under
    the nested run named ``Data_loading_and_split``.

    Returns
    -------
    tuple
        ``(train, test)`` as produced by ``train_test``.
    """
    with mlflow.start_run(run_name= "Data_loading_and_split", nested=True):
        raw_frame = load_raw_data()
        train_part, test_part = train_test(raw_frame)
        data_description(raw_frame, train_part)
    return train_part, test_part

# MODELLING

## BASIC MODEL

In [9]:
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the column order of the numeric housing
# features handed to the transformer - confirm against the dataset.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    Two columns are always appended: rooms per household and population per
    household. When ``add_bedrooms_per_room`` is true, a third column,
    bedrooms per room, is appended after them.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        extra_columns = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        return np.c_[(X, *extra_columns)]

In [10]:
def eval_matrics(model, train_x, train_y):
    """Return the mean R2 score of ``model`` over a 10-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    train_x, train_y
        Features and targets used for the cross-validation.
    """
    fold_scores = cross_val_score(
        model,
        train_x,
        train_y,
        cv=10,
        scoring="r2",
    )
    return fold_scores.mean()

def basic_modeling(train):
    """Prepare the training data and log baseline R2 scores for three models.

    The numeric columns are imputed (median), extended by
    ``CombinedAttributesAdder`` and standardised; ``ocean_proximity`` is
    one-hot encoded. A linear regression, a decision tree and a random forest
    are then scored with ``eval_matrics`` and each score is logged as
    ``<name>_R2_Score`` in a nested MLflow run named ``Basic_model``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing ``median_house_value`` and
        ``ocean_proximity`` columns.

    Returns
    -------
    tuple
        ``(housing_prepared, housing_labels, full_pipeline)``: the transformed
        features, the target column and the fitted column transformer.
    """
    housing_labels = train["median_house_value"].copy()
    housing = train.drop(columns="median_house_value")

    # Every remaining column except the categorical one goes through the
    # numeric branch.
    numeric_columns = housing.drop(columns="ocean_proximity").columns.tolist()

    numeric_steps = [
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]
    num_pipeline = Pipeline(numeric_steps)

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, numeric_columns),
        ("cat", OneHotEncoder(), ["ocean_proximity"]),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)

    models = {
        "Linear_reg": LinearRegression(),
        "Decision_tree": DecisionTreeRegressor(),
        "Random_forest": RandomForestRegressor(),
    }

    with mlflow.start_run(run_name= "Basic_model", nested=True):
        for label, estimator in models.items():
            mean_r2 = eval_matrics(estimator, housing_prepared, housing_labels)
            mlflow.log_metric(f"{label}_R2_Score", mean_r2)

    return housing_prepared, housing_labels, full_pipeline

## FINE TUNING

In [11]:
def model_search(housing_prepared, housing_labels):
    """Grid-search a random forest and log the best candidate to MLflow.

    A 5-fold grid search scored on R2 is run over two parameter grids (with
    and without bootstrapping). The best parameters, best cross-validated
    score and best estimator are logged in a nested MLflow run named
    ``Best_model``.

    Parameters
    ----------
    housing_prepared
        Prepared feature matrix used to fit the search.
    housing_labels
        Target values aligned with ``housing_prepared``.

    Returns
    -------
    tuple
        ``(best_estimator, best_score)`` taken from the fitted grid search.
    """
    params_grid = [
        {"n_estimators" : [3, 10 ,30, 100, 300], "max_features" : [2, 4, 6, 8, 10]},
        # ``bootstrap`` has to be a real boolean: an integer 0 is rejected by
        # the parameter validation of recent scikit-learn releases, and with
        # warnings silenced those candidates would fail quietly.
        {"bootstrap" : [False], "n_estimators" : [3, 10, 30, 100], "max_features" : [2, 3, 4, 6]}
    ]
    
    forest_reg = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(forest_reg, params_grid, cv=5, 
                               scoring = "r2",
                               return_train_score=True, 
                               verbose= 3)
    grid_search.fit(housing_prepared, housing_labels)
    
    with mlflow.start_run(run_name= "Best_model", nested=True):
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("Best_score", grid_search.best_score_)
        mlflow.sklearn.log_model(grid_search.best_estimator_, "model")
        
    return grid_search.best_estimator_, grid_search.best_score_

## TEST SET

In [12]:
def test_set(test, full_pipeline, model):
    x_test = test.drop("median_house_value", axis = 1)
    y_test = test["median_house_value"].copy()

    x_test_prepared = full_pipeline.transform(x_test)
    y_final_pred = model.predict(x_test_prepared)

    #r2 error
    r2 = r2_score(y_test, y_final_pred)
    print(f"Final R2 score of the model is : {r2.round(3)}")

    with mlflow.start_run(run_name= "Test set", nested= True) as child_run_test:
        mlflow.log_metric("R2_score", r2)
        
    return r2

# MAIN

In [18]:
def main():
    """Run the whole workflow inside one parent MLflow run, ``ML_LIFECYCLE``.

    Steps, in order: load and split the data, score the baseline models,
    grid-search the final model, evaluate it on the test set, then log the
    final model on the parent run.
    """
    with mlflow.start_run(run_name= "ML_LIFECYCLE"):
        train_part, test_part = data_loading()
        prepared_features, labels, fitted_pipeline = basic_modeling(train_part)
        # The cross-validated score is returned by the search but not used here.
        best_model, _cv_score = model_search(prepared_features, labels)
        test_set(test_part, fitted_pipeline, best_model)
        mlflow.sklearn.log_model(best_model, "model")

In [19]:
# Execute the full workflow defined in main(); the grid search inside it runs
# with verbose output, so this cell prints one line per cross-validation fit.
main()

Fitting 5 folds for each of 41 candidates, totalling 205 fits
[CV 1/5] END max_features=2, n_estimators=3;, score=(train=0.914, test=0.689) total time=   0.2s
[CV 2/5] END max_features=2, n_estimators=3;, score=(train=0.919, test=0.718) total time=   0.2s
[CV 3/5] END max_features=2, n_estimators=3;, score=(train=0.914, test=0.684) total time=   0.2s
[CV 4/5] END max_features=2, n_estimators=3;, score=(train=0.916, test=0.694) total time=   0.2s
[CV 5/5] END max_features=2, n_estimators=3;, score=(train=0.918, test=0.689) total time=   0.2s
[CV 1/5] END max_features=2, n_estimators=10;, score=(train=0.955, test=0.776) total time=   0.6s
[CV 2/5] END max_features=2, n_estimators=10;, score=(train=0.956, test=0.788) total time=   0.6s
[CV 3/5] END max_features=2, n_estimators=10;, score=(train=0.954, test=0.770) total time=   0.6s
[CV 4/5] END max_features=2, n_estimators=10;, score=(train=0.957, test=0.768) total time=   0.6s
[CV 5/5] END max_features=2, n_estimators=10;, score=(train=0

[CV 4/5] END max_features=8, n_estimators=10;, score=(train=0.962, test=0.788) total time=   1.1s
[CV 5/5] END max_features=8, n_estimators=10;, score=(train=0.960, test=0.793) total time=   1.1s
[CV 1/5] END max_features=8, n_estimators=30;, score=(train=0.972, test=0.812) total time=   3.4s
[CV 2/5] END max_features=8, n_estimators=30;, score=(train=0.971, test=0.815) total time=   3.3s
[CV 3/5] END max_features=8, n_estimators=30;, score=(train=0.971, test=0.823) total time=   3.3s
[CV 4/5] END max_features=8, n_estimators=30;, score=(train=0.972, test=0.809) total time=   3.3s
[CV 5/5] END max_features=8, n_estimators=30;, score=(train=0.971, test=0.811) total time=   3.4s
[CV 1/5] END max_features=8, n_estimators=100;, score=(train=0.974, test=0.815) total time=  11.2s
[CV 2/5] END max_features=8, n_estimators=100;, score=(train=0.974, test=0.823) total time=  11.2s
[CV 3/5] END max_features=8, n_estimators=100;, score=(train=0.974, test=0.826) total time=  11.4s
[CV 4/5] END max_

[CV 3/5] END bootstrap=0, max_features=3, n_estimators=100;, score=(train=1.000, test=0.825) total time=   8.2s
[CV 4/5] END bootstrap=0, max_features=3, n_estimators=100;, score=(train=1.000, test=0.813) total time=   8.4s
[CV 5/5] END bootstrap=0, max_features=3, n_estimators=100;, score=(train=1.000, test=0.815) total time=   7.8s
[CV 1/5] END bootstrap=0, max_features=4, n_estimators=3;, score=(train=1.000, test=0.732) total time=   0.3s
[CV 2/5] END bootstrap=0, max_features=4, n_estimators=3;, score=(train=1.000, test=0.749) total time=   0.3s
[CV 3/5] END bootstrap=0, max_features=4, n_estimators=3;, score=(train=1.000, test=0.760) total time=   0.3s
[CV 4/5] END bootstrap=0, max_features=4, n_estimators=3;, score=(train=1.000, test=0.752) total time=   0.3s
[CV 5/5] END bootstrap=0, max_features=4, n_estimators=3;, score=(train=1.000, test=0.756) total time=   0.3s
[CV 1/5] END bootstrap=0, max_features=4, n_estimators=10;, score=(train=1.000, test=0.797) total time=   1.0s
[CV

In [21]:
# Prints a plain "end" marker - presumably to signal that the notebook ran to
# completion.
print("end")

end
