In [None]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the dataset
# The default location below is specific to the original author's machine. Set the
# SALES_DATA_PATH environment variable to read the CSV from somewhere else without
# editing this cell; when it is unset the original path is used unchanged.
data_path = os.environ.get(
    "SALES_DATA_PATH",
    r"C:\Users\subra\Desktop\Projects\New Projects\Sales_Performance_Analysis\data\sales_data.csv",
)
df = pd.read_csv(data_path)

In [None]:
# Display basic information about the dataset:
# a preview of the first rows, summary statistics, and the column/dtype report.
print("First five rows of the dataset:")
print(df.head())

print("\nSummary statistics of the dataset:")
print(df.describe())

print("\nDataset information:")
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Report how many null entries each column contains.
# A single print with a newline separator emits the heading and the per-column
# counts on consecutive lines.
print("\nMissing values in the dataset:", df.isnull().sum(), sep="\n")


In [None]:
# Handle missing values: numeric columns are filled with their median and the
# categorical 'Region' column with its most frequent value (mode).
# Each filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the df[col] selection: that chained in-place form operates
# on an intermediate Series, raises a FutureWarning on pandas 2.x, and leaves df
# unmodified once Copy-on-Write is enabled.
# NOTE(review): 'Profit' is used as the model target later in this notebook;
# median-imputing it fabricates labels — consider dropping those rows instead.
# The original imputation is kept here to preserve behavior.
df['Sales'] = df['Sales'].fillna(df['Sales'].median())  # Fill missing sales data with median
df['Profit'] = df['Profit'].fillna(df['Profit'].median())  # Fill missing profit data with median
df['Quantity'] = df['Quantity'].fillna(df['Quantity'].median())  # Fill missing quantity with median
df['Region'] = df['Region'].fillna(df['Region'].mode()[0])  # Fill missing region with mode

In [None]:
# Exploratory Data Analysis (EDA)
# Histogram of 'Sales' (30 bins) with a kernel-density curve overlaid.
plt.figure(figsize=(8, 6))
# histplot returns the Axes it drew on, so title and axis labels are set in one call.
sns.histplot(data=df, x='Sales', bins=30, kde=True, color='blue').set(
    title="Distribution of Sales",
    xlabel="Sales",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Histogram of 'Profit' (30 bins) with a kernel-density curve overlaid.
plt.figure(figsize=(8, 6))
# histplot returns the Axes it drew on, so title and axis labels are set in one call.
sns.histplot(data=df, x='Profit', bins=30, kde=True, color='green').set(
    title="Distribution of Profit",
    xlabel="Profit",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Scatter of 'Profit' against 'Sales', one semi-transparent point per row.
plt.figure(figsize=(8, 6))
# scatterplot returns the Axes it drew on, so title and axis labels are set in one call.
sns.scatterplot(data=df, x='Sales', y='Profit', alpha=0.7, color='red').set(
    title="Sales vs Profit",
    xlabel="Sales",
    ylabel="Profit",
)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns, annotated with two-decimal coefficients.
plt.figure(figsize=(10, 8))
# numeric_only=True restricts the correlation to numeric columns. Since pandas 2.0
# DataFrame.corr() defaults to numeric_only=False and raises ValueError when the frame
# contains non-numeric columns (this notebook treats 'Region' as categorical).
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap of Features")
plt.show()

In [None]:
# Split the frame into model inputs and the value to predict.
X = df.loc[:, ['Sales', 'Quantity', 'Region']]  # predictor columns
y = df.loc[:, 'Profit']  # regression target

In [None]:
# One-hot encode 'Region' into indicator columns; drop_first omits the first level.
X = pd.get_dummies(data=X, drop_first=True, columns=['Region'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [None]:
# Standardize the feature columns.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a Random Forest regressor (seeded with random_state=42) on the scaled training data.
model = RandomForestRegressor(random_state=42)
# Kept as a bare trailing expression so the notebook still displays the fitted estimator.
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict profit for the held-out (scaled) test rows.
y_pred = model.predict(X=X_test_scaled)

In [None]:
# Score the test-set predictions: squared error, absolute error, and R^2.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Print all three metrics, each rounded to two decimals.
print(
    f"\nModel Evaluation Metrics:\nMean Squared Error: {mse:.2f}\nMean Absolute Error: {mae:.2f}\nR^2 Score: {r2:.2f}"
)

In [None]:
# Scatter of model predictions against the true test-set profit values.
plt.figure(figsize=(8, 6))
# scatterplot returns the Axes it drew on, so title and axis labels are set in one call.
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, color='orange').set(
    title="Predicted vs Actual Profit",
    xlabel="Actual Profit",
    ylabel="Predicted Profit",
)
plt.show()