## Import Libraries

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from scipy.stats import randint, uniform


In [3]:
df = pd.read_csv("sales_data_50.csv") 

print(df.head())
print(df.info())
print(df.isnull().sum())


   ID    Product Region  Sales  Quantity  Discount  Profit Month
0   1     Eraser   West    316         3      0.08     451   May
1   2  Sharpener   East    863         1      0.15     252   Mar
2   3   Notebook   West    287         5      0.09    -164   Jun
3   4  Sharpener   East    479        14      0.09     -41   Mar
4   5  Sharpener   West    592         7      0.01    -192   Mar
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        50 non-null     int64  
 1   Product   50 non-null     object 
 2   Region    50 non-null     object 
 3   Sales     50 non-null     int64  
 4   Quantity  50 non-null     int64  
 5   Discount  50 non-null     float64
 6   Profit    50 non-null     int64  
 7   Month     50 non-null     object 
dtypes: float64(1), int64(4), object(3)
memory usage: 3.3+ KB
None
ID          0
Product     0
Region      0
Sales    

## Preprocessing

In [5]:
# Droping ID - not useful for prediction
df.drop("ID", axis=1, inplace=True)

for col in ['Product', 'Region', 'Month']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

X = df.drop("Profit", axis=1)
y = df["Profit"]


## Split & Scale

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## Evaluating Base Models

In [15]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor()
}

def evaluate_model(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name}")
    print("MAE :", mean_absolute_error(y_test, y_pred))
    print("MSE :", mean_squared_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2  :", r2_score(y_test, y_pred))

for name, model in models.items():
    evaluate_model(name, model)



Linear Regression
MAE : 194.81403388895876
MSE : 46868.69362374702
RMSE: 216.49178650412358
R2  : 0.037993633851026276

Random Forest
MAE : 205.73399999999998
MSE : 47548.628006666666
RMSE: 218.05647893760613
R2  : 0.024037597222754337

SVR
MAE : 204.68034648586735
MSE : 50780.193971211425
RMSE: 225.3446115868126
R2  : -0.0422921164978618

KNN
MAE : 185.22666666666666
MSE : 41091.263999999996
RMSE: 202.70980242701634
R2  : 0.1565786348036936


### Evaluation Metrics Used

MAE (Mean Absolute Error): Measures average magnitude of errors without considering direction.

MSE (Mean Squared Error): Penalizes larger errors more than MAE.

RMSE (Root Mean Squared Error): Standard deviation of residuals (errors).

R² (Coefficient of Determination): Indicates how well the model explains variability in the target.

## GridSearchCV – Random Forest Regressor

In [21]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 4]
}

grid_rf = GridSearchCV(RandomForestRegressor(), param_grid=param_grid,
                       cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)

print("Best Params (GridSearchCV - RF):", grid_rf.best_params_)
y_pred = grid_rf.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2  :", r2_score(y_test, y_pred))


Best Params (GridSearchCV - RF): {'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 50}
RMSE: 211.34395377423056
R2  : 0.08319968643251807


## RandomizedSearchCV – Random Forest Regressor

In [23]:
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 10)
}

random_rf = RandomizedSearchCV(RandomForestRegressor(), param_distributions=param_dist,
                               n_iter=10, cv=5, scoring='neg_mean_squared_error',
                               random_state=42, n_jobs=-1)
random_rf.fit(X_train, y_train)

print("Best Params (RandomizedSearchCV - RF):", random_rf.best_params_)
y_pred = random_rf.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2  :", r2_score(y_test, y_pred))


Best Params (RandomizedSearchCV - RF): {'max_depth': 23, 'min_samples_split': 8, 'n_estimators': 124}
RMSE: 218.77787443936936
R2  : 0.017569369340928453


## Hyperparameter Tuning Results

To improve the baseline performance of the Random Forest Regressor, two hyperparameter optimization techniques were applied:

GridSearchCV: Exhaustively tests combinations of parameters over a defined grid.

RandomizedSearchCV: Samples a fixed number of parameter combinations from a distribution.

Best Parameters Found for:
- GridSearchCV:	{'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 4}
- RandomizedSearchCV:	{'n_estimators': 124, 'max_depth': 23, 'min_samples_split': 8}

Tuned Model Performance:
	
- GridSearchCV
    - RMSE : 211.34	
    - R² Score : 0.0832
- RandomizedSearchCV	
    - RMSE : 218.78	
    - R² Score : 0.0176

GridSearchCV led to a moderate improvement in model performance compared to the untuned Random Forest (RMSE reduced from 218.06 to 211.34; R² increased from 0.024 to 0.083).

RandomizedSearchCV did not yield better results compared to GridSearchCV, possibly due to randomness in parameter selection or fewer iterations.

### Conclusion


The tuned model via GridSearchCV offered the best performance among all Random Forest configurations.

However, the improvement was marginal, indicating that while tuning helped, the overall predictability of the dataset remains low.

This further supports the earlier recommendation to explore additional features, more data, or advanced regression techniques (e.g., XGBoost, Gradient Boosting) to better model the underlying patterns.