In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
df=pd.read_csv(r"C:\Users\archa\OneDrive\Desktop\internship\Task1\Online Sales Data.csv",encoding='latin1')
df

Unnamed: 0,Transaction ID,Date,Product Category,Product Name,Units Sold,Unit Price,Total Revenue,Region,Payment Method
0,10001,2024-01-01,Electronics,iPhone 14 Pro,2,999.99,1999.98,North America,Credit Card
1,10002,2024-01-02,Home Appliances,Dyson V11 Vacuum,1,499.99,499.99,Europe,PayPal
2,10003,2024-01-03,Clothing,Levi's 501 Jeans,3,69.99,209.97,Asia,Debit Card
3,10004,2024-01-04,Books,The Da Vinci Code,4,15.99,63.96,North America,Credit Card
4,10005,2024-01-05,Beauty Products,Neutrogena Skincare Set,1,89.99,89.99,Europe,PayPal
...,...,...,...,...,...,...,...,...,...
235,10236,2024-08-23,Home Appliances,Nespresso Vertuo Next Coffee and Espresso Maker,1,159.99,159.99,Europe,PayPal
236,10237,2024-08-24,Clothing,Nike Air Force 1 Sneakers,3,90.00,270.00,Asia,Debit Card
237,10238,2024-08-25,Books,The Handmaid's Tale by Margaret Atwood,3,10.99,32.97,North America,Credit Card
238,10239,2024-08-26,Beauty Products,Sunday Riley Luna Sleeping Night Oil,1,55.00,55.00,Europe,PayPal


### Data preprocessing

In [2]:
df.isnull().sum()

Transaction ID      0
Date                0
Product Category    0
Product Name        0
Units Sold          0
Unit Price          0
Total Revenue       0
Region              0
Payment Method      0
dtype: int64

In [3]:
df.drop(['Transaction ID','Date','Product Name'],axis=1,inplace=True)
df

Unnamed: 0,Product Category,Units Sold,Unit Price,Total Revenue,Region,Payment Method
0,Electronics,2,999.99,1999.98,North America,Credit Card
1,Home Appliances,1,499.99,499.99,Europe,PayPal
2,Clothing,3,69.99,209.97,Asia,Debit Card
3,Books,4,15.99,63.96,North America,Credit Card
4,Beauty Products,1,89.99,89.99,Europe,PayPal
...,...,...,...,...,...,...
235,Home Appliances,1,159.99,159.99,Europe,PayPal
236,Clothing,3,90.00,270.00,Asia,Debit Card
237,Books,3,10.99,32.97,North America,Credit Card
238,Beauty Products,1,55.00,55.00,Europe,PayPal


In [4]:
x=df.drop('Total Revenue',axis=1)
y=df['Total Revenue']

### Feature engineering

In [5]:
# Defining numeric and categorical features
numeric_features=['Units Sold','Unit Price']
categorical_features=['Product Category','Region','Payment Method']

### Feature extraction

In [6]:
# processed by imputing missing values with the mean and then scaling them
numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scale',StandardScaler())
])
numeric_transformer

In [7]:
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])
categorical_transformer

In [8]:
preprocessor=ColumnTransformer(transformers=[
    ('num',numeric_transformer,numeric_features),
    ('cat',categorical_transformer,categorical_features)
])
preprocessor

In [9]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,random_state=42,test_size=0.2)
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(192, 5)
(192,)
(48, 5)
(48,)


In [10]:
xtrain_processed = preprocessor.fit_transform(xtrain)
xtest_processed = preprocessor.transform(xtest)

### Model selection

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

In [12]:
# determine which one performs the best on your dataset
models={
    'Linear Regression':LinearRegression(),
    'Decision tree':DecisionTreeRegressor(random_state=42),
    'Random forest':RandomForestRegressor(random_state=42),
    'Gradient boosting':GradientBoostingRegressor(random_state=42)
}

In [13]:
# evaluate each model using cross-validation
for name, model in models.items():
    pipeline=Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ])
    cv_score=cross_val_score(pipeline,x,y,cv=5,scoring='neg_mean_squared_error')
    print(f"{name}:Mean Squared Error = {-cv_score.mean()}")
# Gradient Boosting model has the lowest mean squared error (MSE)

Linear Regression:Mean Squared Error = 29228.178172027354
Decision tree:Mean Squared Error = 17914.565576666668
Random forest:Mean Squared Error = 25025.911926117653
Gradient boosting:Mean Squared Error = 15166.364615783867


### Model training

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
best_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor())
])

In [15]:
best_model.fit(xtrain,ytrain)

### Model evaluation

In [16]:
from sklearn.metrics import mean_squared_error
y_pred=best_model.predict(xtest)
mse=mean_squared_error(ytest,y_pred)
print(f"Mean Squared error on test data: {mse}")

Mean Squared error on test data: 1025.6154657062007


In [17]:
from sklearn.metrics import r2_score
r2 = r2_score(ytest, y_pred)
print(f"R² Score on test data: {r2}")

R² Score on test data: 0.994386450929086


In [18]:
from sklearn.metrics import mean_squared_error
import numpy as np
# Assuming ytest and ytest_pred are your actual and predicted values for the test set
test_mse = mean_squared_error(ytest, y_pred)
print(f"Mean Squared Error on test data: {test_mse}")

test_rmse = np.sqrt(test_mse)
print(f"Root Mean Squared Error on test data: {test_rmse}")


Mean Squared Error on test data: 1025.6154657062007
Root Mean Squared Error on test data: 32.02523170417664


### Model deployment