In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [12]:
# Read the uploaded sales CSV into a DataFrame
file_path = '/content/sales_data_sample.csv'
sales_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to inspect the columns and value formats
sales_data.head(n=5)


Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,PRODUCTCODE,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/03 0:00,Shipped,1,2,2003,Motorcycles,95,S10_1678,Small
1,10121,34,81.35,5,2765.9,5/7/03 0:00,Shipped,2,5,2003,Motorcycles,95,S10_1678,Small
2,10134,41,94.74,2,3884.34,7/1/03 0:00,Shipped,3,7,2003,Motorcycles,95,S10_1678,Medium
3,10145,45,83.26,6,3746.7,8/25/03 0:00,Shipped,3,8,2003,Motorcycles,95,S10_1678,Medium
4,10159,49,100.0,14,5205.27,10/10/03 0:00,Shipped,4,10,2003,Motorcycles,95,S10_1678,Medium


In [13]:
# Parse ORDERDATE strings (e.g. '2/24/03 0:00') into datetime values
sales_data['ORDERDATE'] = pd.to_datetime(sales_data['ORDERDATE'], format='%m/%d/%y %H:%M')

# One-hot encode the categorical columns PRODUCTLINE, STATUS, DEALSIZE and
# PRODUCTCODE; drop_first=True omits the first level of each column
sales_data_encoded = pd.get_dummies(
    sales_data,
    columns=['PRODUCTLINE', 'STATUS', 'DEALSIZE', 'PRODUCTCODE'],
    drop_first=True,
)

# Drop the order identifiers (ORDERNUMBER, ORDERLINENUMBER), which are not
# expected to contribute to sales forecasting
sales_data_encoded = sales_data_encoded.drop(['ORDERNUMBER', 'ORDERLINENUMBER'], axis=1)

# Check for missing values. A notebook cell only renders its LAST expression,
# so the summary is printed explicitly; a bare `isnull().sum()` in the middle
# of the cell would be computed and then silently discarded.
missing_counts = sales_data_encoded.isnull().sum()
print(f"Columns with missing values: {int((missing_counts > 0).sum())}")
if missing_counts.any():
    # List only the affected columns to keep the output short for a wide frame
    print(missing_counts[missing_counts > 0])

# Display the first few rows of the processed data to check its state
sales_data_encoded.head()

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,ORDERDATE,QTR_ID,MONTH_ID,YEAR_ID,MSRP,PRODUCTLINE_Motorcycles,PRODUCTLINE_Planes,...,PRODUCTCODE_S700_2466,PRODUCTCODE_S700_2610,PRODUCTCODE_S700_2824,PRODUCTCODE_S700_2834,PRODUCTCODE_S700_3167,PRODUCTCODE_S700_3505,PRODUCTCODE_S700_3962,PRODUCTCODE_S700_4002,PRODUCTCODE_S72_1253,PRODUCTCODE_S72_3212
0,30,95.7,2871.0,2003-02-24,1,2,2003,95,True,False,...,False,False,False,False,False,False,False,False,False,False
1,34,81.35,2765.9,2003-05-07,2,5,2003,95,True,False,...,False,False,False,False,False,False,False,False,False,False
2,41,94.74,3884.34,2003-07-01,3,7,2003,95,True,False,...,False,False,False,False,False,False,False,False,False,False
3,45,83.26,3746.7,2003-08-25,3,8,2003,95,True,False,...,False,False,False,False,False,False,False,False,False,False
4,49,100.0,5205.27,2003-10-10,4,10,2003,95,True,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Target (y): the SALES column the models will predict
y = sales_data_encoded['SALES']

# Features (X): every other column except the raw ORDERDATE timestamp.
# NOTE(review): the DEALSIZE dummies stay in X; in the previewed rows the
# Small/Medium labels track the SALES amount, so DEALSIZE may be derived from
# the target and leak it into the features — confirm before trusting scores.
X = sales_data_encoded.drop(columns=['SALES', 'ORDERDATE'])

In [15]:
# Hold out 30% of the rows for testing and train on the remaining 70%
# (test_size=0.30); random_state pins the split so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=55,
)

In [16]:
# Instantiate the four regressors to compare. Every estimator that accepts a
# seed uses random_state=55 so repeated runs give the same fitted models.
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=55,
)
linear_model = LinearRegression()
gbr_model = GradientBoostingRegressor(random_state=55)
xgb_model = xgb.XGBRegressor(
    random_state=55,
    n_estimators=200,
)

In [17]:
# Fit each regressor on the training split, in the same order as before
for estimator in (rf_model, linear_model, gbr_model):
    estimator.fit(X_train, y_train)

# Kept as a bare final expression so the cell still renders the fitted
# XGBoost model as its output
xgb_model.fit(X_train, y_train)

In [18]:
# Predict on the held-out test split with every fitted model.
# y_pred holds the Random Forest predictions; the suffixed names hold the rest.
y_pred, y_pred_linear, y_pred_gbr, y_pred_xgb = (
    fitted.predict(X_test)
    for fitted in (rf_model, linear_model, gbr_model, xgb_model)
)

In [19]:
# Score every model on the test split.
# Unsuffixed names (mae, mse, rmse) belong to the Random Forest predictions.

# Mean absolute error per model
mae, mae_linear, mae_gbr, mae_xgb = (
    mean_absolute_error(y_test, predicted)
    for predicted in (y_pred, y_pred_linear, y_pred_gbr, y_pred_xgb)
)

# Random Forest: keep the intermediate MSE, then take its square root
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Root mean squared error for the remaining models
rmse_linear, rmse_gbr, rmse_xgb = (
    mean_squared_error(y_test, predicted) ** 0.5
    for predicted in (y_pred_linear, y_pred_gbr, y_pred_xgb)
)

In [20]:
# Collect one (model name, MAE, RMSE) record per model into a comparison table
metric_rows = [
    ('Linear Regression', mae_linear, rmse_linear),
    ('Gradient Boosting', mae_gbr, rmse_gbr),
    ('XGBoost', mae_xgb, rmse_xgb),
    ('Random Forest', mae, rmse),
]
results = pd.DataFrame(metric_rows, columns=['Model', 'MAE', 'RMSE'])

print(results)

               Model         MAE        RMSE
0  Linear Regression  409.893022  589.020440
1  Gradient Boosting  281.472513  469.924710
2            XGBoost  285.465198  509.979426
3      Random Forest  239.124592  460.763617
