In [1]:
from pathlib import Path

import pandas as pd
import sys
sys.path.append('..')
import warnings
warnings.filterwarnings('ignore')

In [2]:
from datetime import datetime

import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

pd.set_option("display.max_columns", None)

In [3]:

from joblib import dump,load
from sklearn.ensemble import RandomForestClassifier
from src.app.preprocess import preprocess
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from pathlib import Path
DATA_DIR = Path(r'../../../AI_Methodology/AI_METHODOLY/data/external/')
PREDICTION_DIR = Path(r'../../../AI_Methodology/AI_METHODOLY/data/Predictions/')
MODEL_DIR=Path(r'../../../AI_Methodology/AI_METHODOLY/models/')
df_master= pd.read_excel(DATA_DIR/"Employee_Perfomance.xls",index_col='EmpNumber')



In [4]:
def build_model(data: pd.DataFrame):
    target = data['PerformanceRating']
    train = data.drop(['PerformanceRating'], axis=1)
    processedtrain= preprocess(train, MODEL_DIR)
    x_train , x_test , y_train , y_test = train_test_split(processedtrain,target,test_size=0.2)
    rfc = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
    rfc.fit(x_train, y_train)
    dump(rfc,MODEL_DIR/'RFC.joblib')
    predict = rfc.predict(x_test)
    print(f"The accuracy score is : {accuracy_score(y_test, predict) * 100}%")
    print (classification_report(y_test,predict))
    return   

In [5]:
build_model(df_master)

The accuracy score is : 91.25%
              precision    recall  f1-score   support

           2       0.91      0.79      0.85        52
           3       0.91      0.98      0.94       173
           4       1.00      0.60      0.75        15

    accuracy                           0.91       240
   macro avg       0.94      0.79      0.85       240
weighted avg       0.91      0.91      0.91       240



In [6]:
def inference(data: pd.DataFrame,max_depth):
    target = data['PerformanceRating']
    train = data.drop(['PerformanceRating'], axis=1)
    processedtrain= preprocess(train, MODEL_DIR)
    x_train , x_test , y_train , y_test = train_test_split(processedtrain,target,test_size=0.2)
    
    rfc = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1,max_depth=max_depth)
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"Test mse = {mse}, Test RMSE = {rmse}, Random forest max depth = {max_depth}")
    return rfc, mse, rmse

In [7]:
_=inference(df_master,2)

Test mse = 0.30416666666666664, Test RMSE = 0.5515130702591432, Random forest max depth = 2


In [8]:
for max_depth in range(2, 7, 2):
    _ = inference(df_master, max_depth=max_depth)

Test mse = 0.2375, Test RMSE = 0.48733971724044817, Random forest max depth = 2
Test mse = 0.275, Test RMSE = 0.5244044240850758, Random forest max depth = 4
Test mse = 0.2, Test RMSE = 0.4472135954999579, Random forest max depth = 6


## MLFLOW

In [9]:
randomforest_exp = "prediction_with_random_forest"
mlflow.set_experiment(randomforest_exp)

2022/07/30 17:19:11 INFO mlflow.tracking.fluent: Experiment with name 'prediction_with_random_forest' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/Anand/AI_Methodology/AI_METHODOLY/MLFLOW/mlruns/1', experiment_id='1', lifecycle_stage='active', name='prediction_with_random_forest', tags={}>

In [10]:
ls


 Volume in drive C is Windows
 Volume Serial Number is 549E-A9F0

 Directory of C:\Users\Anand\AI_Methodology\AI_METHODOLY\MLFLOW

30-07-2022  17:19    <DIR>          .
30-07-2022  17:19    <DIR>          ..
30-07-2022  17:11    <DIR>          .ipynb_checkpoints
30-07-2022  17:18            39,539 MLFLOW.ipynb
30-07-2022  17:19    <DIR>          mlruns
               1 File(s)         39,539 bytes
               4 Dir(s)  15,345,770,496 bytes free


In [11]:
def train_model(data: pd.DataFrame,max_depth):
    with mlflow.start_run():
        
        
        target = data['PerformanceRating']
        train = data.drop(['PerformanceRating'], axis=1)
        processedtrain= preprocess(train, MODEL_DIR)
        x_train , x_test , y_train , y_test = train_test_split(processedtrain,target,test_size=0.2)
    
        rfc = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1,max_depth=max_depth)
        rfc.fit(x_train, y_train)
        mlflow.log_param("max_depth", max_depth)
        mlflow.sklearn.log_model(rfc, "model")
        y_pred = rfc.predict(x_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metrics({"testing_mse": mse, "testing_rmse": rmse})
        print(f"Test mse = {mse}, Test RMSE = {rmse}, Random forest max depth = {max_depth}")

In [12]:
for max_depth in range(2, 7, 2):
    _ = train_model(df_master, max_depth=max_depth)

Test mse = 0.2375, Test RMSE = 0.48733971724044817, Random forest max depth = 2
Test mse = 0.2875, Test RMSE = 0.5361902647381804, Random forest max depth = 4
Test mse = 0.225, Test RMSE = 0.4743416490252569, Random forest max depth = 6


In [13]:
mlflow.get_experiment_by_name(randomforest_exp)

<Experiment: artifact_location='file:///C:/Users/Anand/AI_Methodology/AI_METHODOLY/MLFLOW/mlruns/1', experiment_id='1', lifecycle_stage='active', name='prediction_with_random_forest', tags={}>

In [14]:
experiment_id = mlflow.get_experiment_by_name(randomforest_exp).experiment_id
experiment_id

'1'

In [15]:
mlflow.search_runs(experiment_id)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.testing_rmse,metrics.testing_mse,params.max_depth,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.log-model.history
0,caaaf489d0464fd09131ad2a04ba0ff3,1,FINISHED,file:///C:/Users/Anand/AI_Methodology/AI_METHO...,2022-07-30 15:20:32.801000+00:00,2022-07-30 15:20:57.492000+00:00,0.474342,0.225,6,LOCAL,Anand,C:\Users\Anand\.conda\envs\aimethodology\lib\s...,"[{""run_id"": ""caaaf489d0464fd09131ad2a04ba0ff3""..."
1,df171021563446309749bb2bd6fc9857,1,FINISHED,file:///C:/Users/Anand/AI_Methodology/AI_METHO...,2022-07-30 15:20:07.996000+00:00,2022-07-30 15:20:32.769000+00:00,0.53619,0.2875,4,LOCAL,Anand,C:\Users\Anand\.conda\envs\aimethodology\lib\s...,"[{""run_id"": ""df171021563446309749bb2bd6fc9857""..."
2,e432be3cf0064e539753b6ee31a69c28,1,FINISHED,file:///C:/Users/Anand/AI_Methodology/AI_METHO...,2022-07-30 15:19:42.156000+00:00,2022-07-30 15:20:07.969000+00:00,0.48734,0.2375,2,LOCAL,Anand,C:\Users\Anand\.conda\envs\aimethodology\lib\s...,"[{""run_id"": ""e432be3cf0064e539753b6ee31a69c28""..."


In [16]:
max_depth = 4
mlflow.search_runs(
    experiment_id,
    filter_string=f"params.max_depth = '{max_depth}' AND metrics.testing_mse <= 40",
    order_by=['metrics.testing_mse asc']
)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.testing_rmse,metrics.testing_mse,params.max_depth,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.log-model.history
0,df171021563446309749bb2bd6fc9857,1,FINISHED,file:///C:/Users/Anand/AI_Methodology/AI_METHO...,2022-07-30 15:20:07.996000+00:00,2022-07-30 15:20:32.769000+00:00,0.53619,0.2875,4,LOCAL,Anand,C:\Users\Anand\.conda\envs\aimethodology\lib\s...,"[{""run_id"": ""df171021563446309749bb2bd6fc9857""..."


In [17]:
run = mlflow.search_runs(
    experiment_id,
    filter_string=f"params.max_depth = '{max_depth}' AND metrics.testing_mse <= 30",
    order_by=["metrics.testing_mse asc"]
).iloc[0]
run

run_id                                            df171021563446309749bb2bd6fc9857
experiment_id                                                                    1
status                                                                    FINISHED
artifact_uri                     file:///C:/Users/Anand/AI_Methodology/AI_METHO...
start_time                                        2022-07-30 15:20:07.996000+00:00
end_time                                          2022-07-30 15:20:32.769000+00:00
metrics.testing_rmse                                                       0.53619
metrics.testing_mse                                                         0.2875
params.max_depth                                                                 4
tags.mlflow.source.type                                                      LOCAL
tags.mlflow.user                                                             Anand
tags.mlflow.source.name          C:\Users\Anand\.conda\envs\aimethodology\lib\s...
tags

In [18]:
run.artifact_uri

'file:///C:/Users/Anand/AI_Methodology/AI_METHODOLY/MLFLOW/mlruns/1/df171021563446309749bb2bd6fc9857/artifacts'

In [19]:
model = mlflow.sklearn.load_model(model_uri=f"{run.artifact_uri}/model")
model

In [20]:
target = df_master['PerformanceRating']
train = df_master.drop(['PerformanceRating'], axis=1)
processedtrain= preprocess(train, MODEL_DIR)

In [21]:
model.predict(processedtrain)

array([3, 3, 3, ..., 3, 3, 3], dtype=int64)