In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

def load_data(path : str | Path = r"C:\Users\DELL\OneDrive\Desktop\anu course\datasets\Supply_chain.csv"):
    path = Path(path)

    df=pd.read_csv(path)
    x=df.drop(columns="TotalRevenue",axis=1)
    y=df["TotalRevenue"]

    return x,y

In [4]:
# Peek at the raw column names to see which fields are available as
# features and which one is the target.
df=pd.read_csv( r"C:\Users\DELL\OneDrive\Desktop\anu course\datasets\Supply_chain.csv")
df.columns.tolist()

['WorkDate',
 'Customer',
 'Location',
 'BusinessType',
 'OrderCount',
 'NumberOfPieces',
 'TotalRevenue']

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

def build_preprocessor(x):
    """Build a ColumnTransformer that scales numeric and one-hot encodes categorical columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; only its column dtypes are inspected here, nothing is fitted.

    Returns
    -------
    tuple
        ``(preprocessor, num_cols, cat_cols)``: the unfitted ColumnTransformer,
        the Index of numeric column names, and the Index of categorical column names.
    """
    # "number" matches every integer/float width rather than only
    # int64/float64. A column matched by neither selector is not listed in the
    # transformer and (with ColumnTransformer's default remainder) would be
    # dropped from the model input without warning.
    num_cols = x.select_dtypes(include="number").columns
    # Include pandas "category" columns alongside plain object columns.
    cat_cols = x.select_dtypes(include=["object", "category"]).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), num_cols),
            # Categories unseen during fit are encoded as all zeros instead of raising.
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ]
    )

    return preprocessor , num_cols ,cat_cols

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor 

def build_regression_pipeline(preprocessor):
    """Chain the given preprocessor with a random-forest regressor.

    Parameters
    ----------
    preprocessor
        Transformer placed as the first pipeline step (named "preprocessor").

    Returns
    -------
    Pipeline
        Unfitted two-step pipeline: "preprocessor" followed by "regressor".
    """
    # Fixed random_state keeps the forest reproducible between runs.
    forest = RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        random_state=42,
    )
    stages = [("preprocessor", preprocessor), ("regressor", forest)]
    return Pipeline(steps=stages)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_squared_error,root_mean_squared_error,classification_report

def main(mode=RandomForestRegressor):
    """Train the regression pipeline on a train split and report test-set metrics.

    Parameters
    ----------
    mode : optional
        Not read by this function; accepted only so existing calls such as
        ``main(mode=RandomForestRegressor)`` keep working.

    Returns
    -------
    dict
        Test-set scores keyed by "MSE", "RMSE" and "R2" (the same values that
        are printed).
    """
    x, y = load_data()
    # Only the transformer is needed here; the column lists are not used.
    preprocessor, _, _ = build_preprocessor(x)

    # Fixed seed so the 67/33 split is identical on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=42, test_size=0.33
    )

    model = build_regression_pipeline(preprocessor)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    metrics = {
        "MSE": mean_squared_error(y_test, y_pred),
        "RMSE": root_mean_squared_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred),
    }
    # Printed in the original "NAME :  value" layout.
    for name, value in metrics.items():
        print(f"{name} : ", value)

    return metrics

In [17]:
# Entry-point guard: train and evaluate when this code runs as the main
# module, but not when it is imported from elsewhere.
if __name__ == "__main__":
    main(mode=RandomForestRegressor)

MSE :  11776.732189801227
RMSE :  108.52065328683396
R2 :  0.9991629015431265
