In [1]:
#Importing Libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Absolute path of the input CSV; adjust it when running on another machine.
DATA_PATH = 'C:/Users/HP/Documents/streamlit/datasets/Electronic.csv'

# Read the file into a DataFrame and show its first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Age,Items Purchased,Total Spent,Discount (%),Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Revenue,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time
0,56,1,29.226195,47.07738,1.0,1,Male,South,Accessories,UPI,149.252145,3.660461,3.597133,1,Evening
1,69,10,420.142612,7.985739,3.760294,1,Female,South,Accessories,Cash,1485.524222,3.551553,25.764903,1,Evening
2,46,4,127.742817,37.225718,1.77124,1,Male,East,Laptop,Credit Card,85.550131,3.922839,7.022399,1,Morning
3,32,9,417.722683,8.227732,1.926831,0,Female,East,Tablet,UPI,824.118724,3.860422,7.635412,1,Afternoon
4,60,13,608.031366,5.0,3.902927,0,Female,South,Tablet,UPI,2463.590392,3.81282,29.461119,0,Morning


In [3]:
# Separate the prediction target ('Revenue') from the remaining feature columns.
y = df['Revenue']
X = df.drop(columns='Revenue')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Print the column names, non-null counts and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   5000 non-null   int64  
 1   Items Purchased       5000 non-null   int64  
 2   Total Spent           5000 non-null   float64
 3   Discount (%)          5000 non-null   float64
 4   Satisfaction Score    5000 non-null   float64
 5   Warranty Extension    5000 non-null   int64  
 6   Gender                5000 non-null   object 
 7   Region                5000 non-null   object 
 8   Product Category      5000 non-null   object 
 9   Payment Method        5000 non-null   object 
 10  Store Rating          5000 non-null   float64
 11  Loyalty Score         5000 non-null   float64
 12  Membership Status     5000 non-null   int64  
 13  Preferred Visit Time  5000 non-null   object 
dtypes: float64(5), int64(4), object(5)
memory usage: 547.0+ KB


In [5]:
# Columns excluded from the model's features.
DROP_COLS = [
    'Age',
    'Store Rating',
    'Discount (%)',
    'Loyalty Score',
]


def drop_columns(X):
    """Return ``X`` without the columns named in ``DROP_COLS``.

    Names from ``DROP_COLS`` that are not present in ``X`` are skipped
    silently instead of raising. ``X`` itself is not modified.
    """
    return X.drop(labels=DROP_COLS, axis='columns', errors='ignore')

# Wrap drop_columns in a scikit-learn transformer so it can be used as a pipeline step.
dropper = FunctionTransformer(drop_columns)

In [6]:
# Preview: numeric training columns that are not listed in DROP_COLS, sorted by name.
sorted(set(X_train.select_dtypes(include='number').columns) - set(DROP_COLS))

['Items Purchased',
 'Membership Status',
 'Satisfaction Score',
 'Total Spent',
 'Warranty Extension']

In [7]:
def _kept_columns(frame, **dtype_filter):
    """List the columns of ``frame`` matching ``dtype_filter``, minus ``DROP_COLS``.

    ``dtype_filter`` is forwarded unchanged to ``DataFrame.select_dtypes``; the
    surviving names are returned as a plain list in the order produced by
    ``Index.difference``.
    """
    selected = frame.select_dtypes(**dtype_filter).columns
    return selected.difference(DROP_COLS).tolist()


# Numeric feature names, taken from the training frame.
num_cols = _kept_columns(X_train, include='number')

# Remaining (non-numeric) feature names, taken from the training frame.
cat_cols = _kept_columns(X_train, exclude='number')

In [8]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Send each column group through its own branch. remainder="drop" is the
# library default, spelled out here: columns in neither list are discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ],
    remainder="drop",
)



# End-to-end estimator: preprocessing followed by the random-forest regressor.
#
# No separate column-dropping step is used here. The ColumnTransformer only
# forwards num_cols and cat_cols (its default remainder="drop" discards every
# other column), and both lists already exclude DROP_COLS, so such a step would
# be a no-op. Leaving it out also keeps the notebook-defined drop_columns
# function out of the pipeline: pickle stores functions by reference, so a
# pickled pipeline containing it could only be loaded in a process where
# `__main__.drop_columns` exists.
final_pipeline = Pipeline([
    ("preprocess", preprocessor),
    # Baseline forest; its "model__*" hyperparameters are tuned by the search below.
    ("model", RandomForestRegressor(
        n_estimators=100,
        random_state=42
    ))
])


In [9]:
# Candidate hyperparameters for the forest; the "model__" prefix routes each
# entry to the pipeline step named "model".
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 5, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
}

# Randomised search: 30 sampled settings, 5-fold cross-validation, scored by
# R^2, run in parallel (n_jobs=-1), with a fixed seed for repeatable sampling.
search_options = dict(n_iter=30, cv=5, scoring="r2", n_jobs=-1, random_state=42)
random_search = RandomizedSearchCV(final_pipeline, param_grid, **search_options)
random_search.fit(X_train, y_train)

# Printed in order: best parameters, best cross-validated score,
# score on the training split, score on the held-out test split.
for reported in (
    random_search.best_params_,
    random_search.best_score_,
    random_search.score(X_train, y_train),
    random_search.score(X_test, y_test),
):
    print(reported)

{'model__n_estimators': 300, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_depth': 10}
0.9819637203782504
0.9900019293083913
0.9827050769852661


In [10]:
# `pickle` is imported in the notebook's import cell at the top.

# Destination of the serialized pipeline; adjust it when running on another machine.
MODEL_PATH = "C:/Users/HP/Documents/streamlit/models/rf_pipeline.pkl"

# Persist the best pipeline selected by the search.
# NOTE: any plain Python function held inside the pipeline is pickled by
# reference, so it must be importable under the same module path wherever this
# file is loaded. Only unpickle the file from a trusted location, because
# pickle.load can execute arbitrary code.
with open(MODEL_PATH, "wb") as f:
    pickle.dump(random_search.best_estimator_, f)