In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

## Step 0: Load preprocessed training and testing data

In [2]:
def _drop_index_artifacts(frame):
    """Return ``frame`` without columns whose name starts with "Unnamed".

    pandas labels a CSV column that has an empty header as "Unnamed: <n>".
    That is what a row index written by ``DataFrame.to_csv()`` looks like when
    the file is read back. Such a column is only a row counter and must not be
    fed to the models as a feature.
    """
    return frame.loc[:, ~frame.columns.str.startswith("Unnamed")]


# Feature matrices: strip the leaked row-index column ("Unnamed: 0") so the
# models are not trained on the order of the rows in the file.
X_train = _drop_index_artifacts(pd.read_csv("X_train_preprocessed.csv"))
X_test = _drop_index_artifacts(pd.read_csv("X_test_preprocessed.csv"))

# Targets are loaded unchanged; they are flattened to 1-D in a later cell.
y_train = pd.read_csv("y_train_preprocessed.csv")
y_test = pd.read_csv("y_test_preprocessed.csv")

## Ensure y_train and y_test are in the correct format (1D array)

In [3]:
# Collapse each single-column target frame into a flat 1-D NumPy array,
# which is the shape the regressors' fit/score calls below are given.
y_train, y_test = (target.values.ravel() for target in (y_train, y_test))

## Step 1: Fit the Linear Regression model

In [4]:
# Show the training feature matrix (a bare last expression is rendered as the cell output).
X_train

Unnamed: 0,ProductID,Price,Rating,NumReviews,StockQuantity,Discount,Year,Month,ProductName_Ankle Socks,ProductName_Art Print,...,City_St. Louis,City_Staten Island,City_Tampa,City_Toledo,City_Tucson,City_Tulsa,City_Virginia Beach,City_Washington,City_Wichita,City_Worcester
0,57,0.012272,0.151596,1.444262,-1.641518,-1.445812,2023,9,False,False,...,False,False,False,False,False,False,False,False,False,False
1,956,-1.247307,-1.586888,-0.704784,0.893909,-1.514127,2024,5,False,False,...,False,False,False,False,False,False,False,False,False,False
2,232,-0.488036,0.064672,-0.100343,0.002067,0.671943,2024,1,False,False,...,False,True,False,False,False,False,False,False,False,False
3,739,-0.295016,-1.065343,1.293835,-1.313485,-1.240868,2023,7,False,False,...,False,False,False,False,False,False,False,False,False,False
4,741,-0.284262,1.368534,0.067177,1.385960,-1.650756,2023,7,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,242,1.740108,-1.586888,0.501362,0.630799,1.696664,2024,3,False,False,...,False,False,False,False,False,False,False,False,False,False
796,254,1.129139,0.064672,-0.829227,-0.496817,1.218461,2024,1,False,False,...,False,False,False,False,False,False,False,False,False,False
797,391,-0.340229,1.368534,1.483919,-1.528757,-0.967609,2023,9,False,False,...,False,False,False,False,False,False,False,False,False,False
798,668,-0.995632,0.933913,-0.935209,0.118246,-1.309183,2023,8,False,False,...,False,False,False,False,False,False,False,False,False,False


<div class="alert alert-block alert-info" style="font-size: 25px; font-weight: bold;">
    <p>LinearRegression formula:</p>
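    <p>$Y = \beta_{0} + \beta_{1}X + \varepsilon$</p>
    where $\beta_{0}$ is the intercept, $\beta_{1}$ is the coefficient (slope) and $\varepsilon$ is the error term, i.e. the part of $Y$ that the model cannot explain, for example because some related features are absent.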
    
</div>

In [5]:
# Ordinary least-squares regression fitted on the training split.
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=y_train)

In [6]:
intercept = linear_model.intercept_   # fitted intercept term (β₀) of the linear model
intercept

-2477464617.508099

In [7]:
# Fitted coefficients of the linear model, one per feature column it was trained on.
# NOTE(review): many of the displayed values are around 1e9, which looks like the
# effect of perfectly collinear one-hot columns — confirm, and consider dropping one
# level per category or using a regularised model.
slope = linear_model.coef_
slope

array([ 2.33332888e-03,  7.93782494e-02,  3.99583447e-02,  7.19779058e-02,
        1.82233261e-02,  3.92987102e-02, -8.86021239e-02, -8.47105500e-03,
        3.89786036e+09,  3.92906539e+09, -8.71013792e+09,  1.12122756e+09,
        2.94557218e+09,  8.33169085e+08, -3.64169814e+09,  4.07898415e+09,
       -6.77563727e-01,  1.12122756e+09,  1.12122756e+09, -3.61257759e+09,
        4.80145077e+09, -4.86954303e+09,  4.80145077e+09,  2.47746480e+09,
        2.47746480e+09,  3.92906539e+09, -4.86954303e+09,  9.57013809e+09,
       -3.88717771e+09, -3.64169814e+09, -1.70802970e+09, -8.71013792e+09,
        3.89786036e+09, -8.63461933e+09, -3.84266702e+09,  3.92906539e+09,
       -4.90945194e+08,  9.57013809e+09, -3.61257759e+09, -3.61257759e+09,
       -4.92000875e+09,  9.22289599e+09,  8.33169085e+08, -1.14830442e+09,
        9.22289599e+09,  4.07898415e+09,  3.92906539e+09, -4.90945194e+08,
       -4.90945194e+08,  9.40073871e+09, -4.92000875e+09,  4.50640184e+08,
        4.50640184e+08,  

## Step 2: Fit the Random Forest Regressor model

<div class="alert alert-block alert-info" style="font-size: 25px; font-weight: bold;">
    <p>Random Forest formula:</p>
    <p>Y = $\frac{1}{N}\sum_{i=1}^{N} f_i(X)$</p>
    where $f_i(X)$ is the prediction of the $i^{th}$ decision tree in the forest, and $N$ is the total number of trees in the forest.
</div>

In [8]:
# Random forest with a pinned seed so repeated runs build the same trees.
RF_SEED = 12
rf_model = RandomForestRegressor(random_state=RF_SEED)
rf_model.fit(X=X_train, y=y_train)

In [9]:
# Importance score the fitted forest assigns to each feature column (raw array, unlabeled).
feature_importances = rf_model.feature_importances_
feature_importances

array([0.09789925, 0.11278723, 0.08307052, 0.11712795, 0.0981441 ,
       0.08811661, 0.00581372, 0.0566346 , 0.00098191, 0.00027909,
       0.00068188, 0.00051413, 0.00142314, 0.00162071, 0.00091216,
       0.00041734, 0.00144608, 0.00094297, 0.00139812, 0.00077709,
       0.00217232, 0.00121367, 0.00279308, 0.00142792, 0.00225414,
       0.00108612, 0.00106908, 0.00134651, 0.00105487, 0.00146922,
       0.0006932 , 0.00176598, 0.00022112, 0.00024055, 0.00199491,
       0.00051931, 0.00179758, 0.00119233, 0.00169602, 0.00052069,
       0.00079336, 0.00045715, 0.0002813 , 0.00127752, 0.00076696,
       0.00086798, 0.00216903, 0.00143458, 0.00082595, 0.00072954,
       0.0005899 , 0.00111582, 0.00038278, 0.00084669, 0.00188962,
       0.00089653, 0.00294213, 0.00169561, 0.0035558 , 0.00105932,
       0.00065288, 0.00096379, 0.00042501, 0.00059777, 0.00091921,
       0.0012854 , 0.00040242, 0.00230013, 0.00491967, 0.00178999,
       0.00022965, 0.00126994, 0.00020814, 0.0003359 , 0.00071

## Step 3: Fit the Support Vector Regressor model

<div class="alert alert-block alert-info" style="font-size: 25px; font-weight: bold;">
    <p>Support Vector Regressor formula:</p>
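    <p>$Y = f(X) = \sum_{i=1}^{n} (\alpha_i - \alpha_i^{*})\, K(X, X_i) + b$</p>
    where $K(X, X_i)$ is the kernel function, $\alpha_i$ and $\alpha_i^{*}$ are the Lagrange multipliers of the two ε-tube constraints (their difference is the dual coefficient of support vector $X_i$), $b$ is the bias term, and $n$ is the number of support vectors.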
</div>

In [10]:
# Support vector regression with a linear kernel, fitted on the training split.
svr_model = SVR(kernel="linear")
svr_model.fit(X=X_train, y=y_train)

In [11]:
# Weight vector of the fitted linear-kernel SVR (displayed below as a 2-D array with one row).
svr_coef = svr_model.coef_
svr_coef

array([[-6.24253919e-04, -7.96445349e-02, -2.04086339e-01,
         2.80930932e-03,  1.73242099e-01, -1.16952894e-01,
        -7.57379957e-01, -1.30793083e-01, -3.60207036e-01,
         1.00000000e+00, -1.25202168e-01,  1.41916753e+00,
         1.00000000e+00, -1.86826526e-01, -1.54165109e-01,
        -8.33372797e-01, -2.86958218e+00,  0.00000000e+00,
        -2.00000000e+00, -2.00000000e+00,  0.00000000e+00,
         5.68077825e-01,  1.48857166e-01, -7.77064564e-01,
        -1.00000000e+00,  2.21966773e-01,  0.00000000e+00,
         1.00000000e+00,  1.12095080e+00, -7.18307127e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.54952682e+00, -5.54339011e-01,  0.00000000e+00,
         1.00000000e+00, -6.60328444e-02, -4.19645532e-01,
         2.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  1.06370080e-01,  1.00000000e+00,
        -1.00000000e+00, -1.00000000e+00, -1.00000000e+00,
        -1.00000000e+00, -4.09102853e-01,  0.00000000e+0

In [12]:
# Bias term b of the fitted SVR (returned as a one-element array).
svr_intercept = svr_model.intercept_
svr_intercept

array([1533.55651445])

## Step 4: Model Evaluation

In [13]:
# Display name -> fitted estimator; the evaluation loop iterates over this mapping.
models = {
    'Linear Regression': linear_model,
    'Random Forest': rf_model,
    'SVR': svr_model,
}

In [14]:
# Score every fitted model on the test split and collect its metrics.
results = {}

for model_name, model in models.items():
    y_pred = model.predict(X_test)  # predictions for the test features

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One entry per model, keyed by its display name.
    results[model_name] = dict(MSE=mse, R2=r2)


In [15]:
# Tabulate the metrics: one row per model, one column per metric.
metrics_by_model = pd.DataFrame(results)
results_df = metrics_by_model.transpose()
print(results_df)

                        MSE        R2
Linear Regression  1.403132 -0.456589
Random Forest      0.980668 -0.018030
SVR                3.593633 -2.730545


In [16]:
# The model whose row holds the largest R2 value is taken as the best one.
r2_by_model = results_df['R2']
best_model_name = r2_by_model.idxmax()
best_model = models[best_model_name]

<div class="alert alert-block alert-info" style="font-size: 25px; font-weight: bold;">
    <p>note:</p>
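    <p>The R2 score can range from -∞ to 1. A higher R2 score indicates a better fit of the model to the data.</p>
    An R2 score of 1 means a perfect fit, a score of 0 means the model is no better than always predicting the mean of the target, and a negative score means it is worse than that baseline.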
</div>

In [17]:
# Report the winning model together with its R2 value from the results table.
best_r2 = results_df.loc[best_model_name, "R2"]
print(f'The best model is: {best_model_name} with R2 Score: {best_r2}')

The best model is: Random Forest with R2 Score: -0.018030222619756398


## 9. Conclusion

We have considered various machine learning models. We have engineered several features from the provided data, including:

#### Categorical Features: ProductName, Category, City, Month
#### Numerical Features: Price, Rating, NumReviews, StockQuantity, Discount

This is a regression problem, where we aim to predict a continuous numerical value (Sales). We have employed the following models:

### Linear Regression
### Random Forest Regression
### Support Vector Regression (SVR)

To evaluate the performance of each model, we have split the dataset into training and test sets, fitted every model on the training set and scored it on the test set. We have assessed the models based on the Mean Squared Error (MSE) and R-squared (R²) metrics. The R² score indicates the proportion of variance in the dependent variable that is explained by the independent variables.

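After comparing the performance metrics, we have determined that **Random Forest Regression** is the best-performing of the three models for our dataset. It achieves a lower MSE (0.98) and a higher R² score (−0.018) compared to the other models. However, its R² score is still slightly negative, which means that even the best model does not outperform a baseline that always predicts the mean of the target. The comparison therefore shows only that Random Forest is the least poor fit; it does not yet show that the model captures the relationships between the features and the target variable, so further feature engineering and model tuning are needed.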