In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Read the synthetic sales CSV; the path is relative to the notebook's
# working directory.
csv_path = "../data/synthetic_sales_dataset.csv"
df = pd.read_csv(csv_path)

# Display the first five rows as the cell output (5 is head()'s default).
df.head(5)


In [None]:
# Structural overview of the raw frame, printed in this order:
#   1. (rows, columns) shape
#   2. column index
#   3. number of missing values per column
#   4. describe() summary statistics
overview = (
    df.shape,
    df.columns,
    df.isnull().sum(),
    df.describe(),
)
for item in overview:
    print(item)


In [None]:
# Parse the date column into pandas datetimes (overwrites df['date'];
# later cells rely on the .dt accessor and on date comparisons).
df['date'] = pd.to_datetime(df['date'])

# Sum total_sales over all rows that share the same date.
daily_sales = df.groupby('date')['total_sales'].sum()

# Line chart of the per-date totals on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 5))
daily_sales.plot(ax=ax)
ax.set_title("Daily Total Sales")
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales")
plt.show()


In [None]:
# Top-selling products
# Sum total_sales per product and rank from highest to lowest.
product_sales = df.groupby('product_name')['total_sales'].sum().sort_values(ascending=False)

# Keep only the ten best sellers for the chart.
top_products = product_sales.head(10)

fig, ax = plt.subplots(figsize=(10, 5))
# seaborn 0.13 deprecates `palette` without `hue` (FutureWarning, slated for
# removal in 0.14). Assigning the x variable to `hue` keeps one colour per
# bar; dodge=False keeps each bar centred on its tick in older releases.
sns.barplot(
    x=top_products.index,
    y=top_products.values,
    hue=top_products.index,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# Depending on the seaborn version, `hue` may add a legend that only repeats
# the tick labels; drop it if one was created.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.xticks(rotation=45)
ax.set_title("Top 10 Products by Total Sales")
ax.set_ylabel("Total Sales")
ax.set_xlabel("Product")
plt.show()


In [None]:
# Sales by category
# Sum total_sales over all rows in each category.
category_sales = df.groupby('category')['total_sales'].sum()

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn 0.13 deprecates `palette` without `hue` (FutureWarning, slated for
# removal in 0.14). Assigning the x variable to `hue` keeps one colour per
# bar; dodge=False keeps each bar centred on its tick in older releases.
sns.barplot(
    x=category_sales.index,
    y=category_sales.values,
    hue=category_sales.index,
    palette="coolwarm",
    dodge=False,
    ax=ax,
)
# Depending on the seaborn version, `hue` may add a legend that only repeats
# the tick labels; drop it if one was created.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Total Sales by Category")
ax.set_ylabel("Total Sales")
ax.set_xlabel("Category")
plt.show()





In [None]:
# Quantity sold distribution
# 20-bin histogram of quantity_sold across all rows, without a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(x=df['quantity_sold'], bins=20, kde=False, color='skyblue', ax=ax)
ax.set_title("Distribution of Quantity Sold")
ax.set_xlabel("Quantity Sold")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Total sales distribution
# 30-bin histogram of total_sales across all rows, with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(x=df['total_sales'], bins=30, kde=True, color='orange', ax=ax)
ax.set_title("Distribution of Total Sales per Transaction")
ax.set_xlabel("Total Sales")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Trend by category over time
# Sum total_sales per (date, category) and pivot categories into columns,
# giving one row per date and one column per category.
category_time = df.groupby(['date', 'category'])['total_sales'].sum().unstack()

# DataFrame.plot() opens its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) would be left empty and the chart drawn at the
# default size. Create the axes explicitly and hand them to pandas instead.
fig, ax = plt.subplots(figsize=(12, 6))
category_time.plot(ax=ax)
ax.set_title("Daily Sales by Category")
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales")
plt.show()


In [None]:
# Build a separate feature table from df without modifying df itself.
# NOTE(review): in the cells visible here, the modelling steps select their
# features from `df`, not from `data` - confirm whether this table is meant
# to feed the model.
data = (
    df
    # --- DATE FEATURES --- (assign() returns a new frame; df is untouched)
    .assign(
        day=df['date'].dt.day,
        month=df['date'].dt.month,
        day_of_week=df['date'].dt.dayofweek,  # Monday=0, Sunday=6
        # Saturday (5) and Sunday (6) are flagged as 1, every other day as 0.
        is_weekend=lambda frame: frame['day_of_week'].isin([5, 6]).astype(int),
        week_number=df['date'].dt.isocalendar().week.astype(int),
    )
    # --- CATEGORY ENCODING --- one-hot encode, dropping the first level
    .pipe(pd.get_dummies, columns=['category'], drop_first=True)
    # --- DROP UNNEEDED COLUMNS ---
    .drop(columns=['product_name', 'date'])
)

# Preview
data.head()


In [None]:
# Order rows chronologically so the split below is a clean time-based cut.
df = df.sort_values(by="date")

# Rows dated before the cutoff form the training set; rows on or after it are
# held out for testing. (The original note describes this as Jan-May vs June;
# that presumes the data spans Jan-Jun 2024 - confirm against the date range.)
split_date = "2024-06-01"
train_df = df.loc[df['date'] < split_date]
test_df = df.loc[df['date'] >= split_date]

print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)

# Model inputs and target.
# NOTE(review): if total_sales is derived from price and quantity_sold, these
# features leak the target; product_id is also used as a plain numeric
# feature here - confirm both choices are intended.
feature_cols = ['product_id', 'price', 'quantity_sold']
target_col = 'total_sales'

X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]


In [None]:

# Standardise the features. The scaler is fitted on the training rows only
# and the same fitted transform is then applied to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an ordinary least-squares regression on the scaled training data.
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows.
y_pred = lr_model.predict(X_test_scaled)

print("Model training complete.")


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Score the fitted model on the held-out test rows.
y_pred = lr_model.predict(X_test_scaled)

# Error metrics; RMSE is the square root of the mean squared error.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MODEL EVALUATION RESULTS")
print("-------------------------")
for label, score in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R² Score:", r2),
):
    print(label, score)

# Scatter of actual (x) against predicted (y) total sales.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred, alpha=0.4)
ax.set_title("Actual vs Predicted Sales")
ax.set_xlabel("Actual Total Sales")
ax.set_ylabel("Predicted Total Sales")
plt.show()


In [None]:
# Step 8b: Actual vs predicted sales with an identity reference line.
# Predict the held-out rows with the fitted model.
predictions = lr_model.predict(X_test_scaled)

# Range of the actual values; used as both x and y endpoints of the y = x line.
actual_min, actual_max = y_test.min(), y_test.max()

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, predictions, alpha=0.5, color='purple')
# Points on this black line are predictions that equal the actual value.
ax.plot([actual_min, actual_max], [actual_min, actual_max], color='black')
ax.set_xlabel("Actual Sales")
ax.set_ylabel("Predicted Sales")
ax.set_title("Actual vs Predicted Scatter Plot")
plt.show()
