In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor # the algo that I want to apply
from sklearn.model_selection import GridSearchCV
import seaborn as sns



In [2]:
df=sns.load_dataset('tips')

In [3]:
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
X=df.iloc[:,1:]#tell about the independent features(except total_bill all are idependent features) in the dataset
y=df['total_bill']

In [5]:
from sklearn.model_selection import train_test_split


In [6]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [7]:
##pipelinig
numeric_processor=Pipeline(
    steps=[
        ("imputation_mean",SimpleImputer(missing_values=np.nan,strategy="mean")),##let's I have numerical variable so I have to perform the imputation with replace the missing values with the mean of the variable so initilize simple imputer with strategy mean ,if you have outliers then use median
        ("scalar",StandardScaler())
    ]
)
#there are two steps -Imputation of missing value and another is feature scaling


In [8]:
categoric_processor=Pipeline(
    steps=[
        ("Imputation_constant",SimpleImputer(fill_value="missing",strategy="constant")),
        ("one_hot",OneHotEncoder(handle_unknown="ignore"))
    ]
)

In [9]:
# 3RD COMBINE PROCESING TECHNIQUES(NUMERICAL +CATEGORICAL)
from sklearn.compose import ColumnTransformer

In [10]:
Combine_processor=ColumnTransformer(#mention all the steps
    [("categorical",categoric_processor,["sex","smoker","day","time"]),# in the column initilize the column name when apply categorical processor in combination
    ("numerical",numeric_processor,["tip","size"])]# here in the column initilize the numerical column name when numerical processor in combination

)
Combine_processor

In [11]:
pipe=Pipeline(
    steps=[("Combine_processor",Combine_processor),
           ("regressor",RandomForestRegressor())]
)
pipe


In [12]:
from sklearn import set_config
set_config(display='diagram')
pipe

In [13]:
pipe.fit(X_train,y_train)

In [14]:
#predict
pipe.predict(X_test)

array([29.25546667, 20.2658    , 18.9824    , 20.7639    , 17.5412    ,
       19.1105    , 20.5721    , 13.99685952, 18.99126667, 14.53732   ,
       13.98015667, 18.99126667, 19.5151    , 25.28646667, 15.0938275 ,
       24.6469    ,  9.19928333, 17.19383333, 12.84500833, 17.7984    ,
       36.63695333, 25.5847    , 12.6717    , 16.6397    , 24.4972    ,
       18.7705    , 13.02741437, 12.03066167, 14.38856667, 29.232     ,
       12.2907    , 21.7624    , 25.08454   , 17.19916667, 13.98015667,
       21.8961    , 14.788635  , 13.5366    , 17.1651    , 21.87982   ,
        9.19928333, 10.953195  , 16.697     , 18.5768    , 16.6870375 ,
       14.687     , 15.53210667, 18.5498    , 31.6983    ])

# Hyper parameter tuning

Hyper parameter :These are parameters that are not learned by the model during training, but are set before training begins. They control the learning process and model behavior, such as the learning rate, regularization strength, or number of trees in a random forest.

Hyper parameter tuning: Hyperparameter tuning in machine learning is the process of finding the best set of hyperparameters for a machine learning model.

Grid Search:
It explores a predefined set of hyperparameter values, creating a "grid" of all possible combinations.

Cross-Validation:
This technique helps estimate how well the model will generalize to unseen data by splitting the data into multiple folds (e.g., k-fold cross-validation), training on some folds, and testing on the remaining fold, repeating this process for each fold. Average the Results: Finally, you average the results from each test to get a more accurate idea of how well your model will perform in the real world.


In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:

# AS i don't give any parameter to random forest regresssor , it will use default parameters check all the parameter "https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"
param_grid={
    "regressor__n_estimators":[200,500],
    "regressor__max_features":["auto","sqrt","log2"],
    "regressor__max_depth":[4,5,6,7,8]
}

#In hyperparameter tuning (e.g., using GridSearchCV or RandomizedSearchCV), n_jobs controls how many parallel processes (or CPU cores) are used for tasks like fitting and evaluating models. Setting it to a positive integer utilizes that many cores, while setting it to -1 uses all available cores. This speeds up the tuning process, especially for large datasets or complex models.


n_jobs=-1
Meaning: Use all available CPU cores or processors for parallel processing.
Effect: This setting attempts to maximize the speed of the hyperparameter search by distributing tasks across all available computing resources.
Considerations:
While often the fastest, it can sometimes lead to issues if the dataset is very large and the machine runs out of memory, as each process might try to copy the dataset.
In some situations, excessive parallelization can introduce overhead and negatively impact performance, notes Scikit-learn.


n_jobs=1
Meaning: Run the tasks sequentially on a single CPU core or processor.
Effect: This is essentially turning off parallelization, and the tuning process will execute one task at a time.
When to Use:
For debugging purposes, to isolate issues that might arise from parallel execution.
If you have a limited number of CPU cores and don't want the overhead associated with managing parallel processes, especially for smaller datasets or simple models, according to Number Analytics.
If memory is a major constraint, as parallel processes can require more memory.
In summary, n_jobs=-1 aims for maximum speed by utilizing all CPU cores, while n_jobs=1 opts for sequential execution, potentially at the cost of speed but with reduced overhead and memory usage. It's often recommended to start with n_jobs=-1 and then adjust if performance or memory issues arise.

In [17]:
#the tecnique of hyperparameter tuning-Gridsearchcv
#from sklearn.model_selection import RandomizedSearchCV
grid_search=GridSearchCV(pipe,param_grid,n_jobs=-1)

In [18]:
grid_search.fit(X_train,y_train)

In [19]:
#use RandomSearchCv for tuning
from sklearn.model_selection import RandomizedSearchCV
Random_search=RandomizedSearchCV(pipe,param_grid,n_jobs=1)

In [20]:
Random_search.fit(X_train,y_train)

In [21]:
grid_search.best_params_

{'regressor__max_depth': 5,
 'regressor__max_features': 'sqrt',
 'regressor__n_estimators': 500}

In [24]:
#using best params in pipe
pipe=Pipeline(
    steps=[("Combine_processor",Combine_processor),
           ("regressor",RandomForestRegressor(max_depth=6,max_features='sqrt',n_estimators= 500))]
)



In [25]:
#now again fit
pipe.fit(X_train,y_train)

In [27]:
pipe.predict(X_test)

array([25.08637906, 20.67169453, 17.82699104, 20.10018137, 17.52410249,
       19.33579357, 20.35592849, 15.42893975, 17.32974497, 15.51503059,
       15.86366885, 17.32974497, 17.66770617, 25.58520126, 15.30452931,
       24.90254469, 14.1518357 , 18.65198324, 11.85640827, 18.7924186 ,
       29.28327575, 20.82068307, 12.80165883, 16.94119199, 23.09490134,
       17.40127186, 12.71976043, 13.65505936, 13.59771953, 28.78139975,
       14.33962031, 21.701294  , 24.41206202, 17.46295821, 15.86366885,
       19.76658983, 13.97557338, 15.87650412, 18.36863072, 21.76809914,
       14.1950157 , 13.74842194, 17.04132326, 19.14567718, 17.11193486,
       15.13803253, 13.52683696, 21.13265935, 28.51993845])

After this use R square and probably find how the model performance