In [5]:
# Matplotlib and inline plotting settings
import matplotlib
matplotlib.use('Agg')  # Use Agg backend for matplotlib backend

%matplotlib inline

import matplotlib.pyplot as plt
plt.switch_backend('Agg')  # Switch backend if only plt is imported

import seaborn as sns
import pandas as pd
import numpy as np

# Scikit-Learn imports for predictive modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance

# Set Seaborn style for better visuals
sns.set(style='whitegrid')

In [10]:
# Path of the cleaned sales report CSV that the rest of the notebook analyses
data_path = '/content/amazon_sales_2025_INR_cleaned.csv'

# 'UTF-8-SIG' decodes UTF-8 and drops a leading byte-order mark if present
df = pd.read_csv(filepath_or_buffer=data_path, encoding='UTF-8-SIG')

# Preview the top five rows as the cell's rich output to confirm the load
df.head(5)

Unnamed: 0,Report_Section,Dimension,Metric1,Metric2,Metric3,Metric4
0,AVERAGE_ORDER_VALUE,Overall Statistics,74544.12,204.050003,249955.5,
1,CATEGORY_PERFORMANCE,Beauty,227489600.0,2997.0,,
2,CATEGORY_PERFORMANCE,Electronics,226564900.0,3036.0,,
3,CATEGORY_PERFORMANCE,Books,224999200.0,3035.0,,
4,CATEGORY_PERFORMANCE,Clothing,222409300.0,3022.0,,


In [11]:
# Column names, dtypes and non-null counts (info() writes directly to stdout)
print('DataFrame Information:')
df.info()

# Summary statistics of the numeric columns, shown as the cell's output
print('\nDescriptive Statistics:')
summary_stats = df.describe()
summary_stats


DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189 entries, 0 to 188
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Report_Section  189 non-null    object 
 1   Dimension       189 non-null    object 
 2   Metric1         189 non-null    float64
 3   Metric2         189 non-null    float64
 4   Metric3         60 non-null     float64
 5   Metric4         12 non-null     float64
dtypes: float64(4), object(2)
memory usage: 9.0+ KB

Descriptive Statistics:


Unnamed: 0,Metric1,Metric2,Metric3,Metric4
count,189.0,189.0,60.0,12.0
mean,21354490.0,5630621.0,55933670.0,6.5
std,42778350.0,49389180.0,149331600.0,3.605551
min,2.0,24.31,18.64,1.0
25%,6.0,550.0,512.0,3.75
50%,509.0,3035.0,543.5,6.5
75%,39348780.0,492523.9,75923.59,9.25
max,227489600.0,661919400.0,831650400.0,12.0


In [12]:
# Metric columns whose distributions are plotted (the pair plot cell reuses this list)
numeric_cols = ['Metric1', 'Metric2', 'Metric3', 'Metric4']

# One histogram (with a KDE overlay) per metric, laid out on a 2x2 grid of axes
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, metric in zip(axes.flat, numeric_cols):
    sns.histplot(df[metric], kde=True, color='skyblue', ax=ax)
    ax.set_title(f'Histogram of {metric}')

# Prevent titles and tick labels of neighbouring panels from overlapping
fig.tight_layout()
plt.show()


In [13]:
# Scatter-plot matrix of every pair of metrics, with per-metric distributions
# on the diagonal
metric_frame = df[numeric_cols]
sns.pairplot(metric_frame)

# y > 1 lifts the figure-level title above the top row of panels
plt.suptitle(t='Pair Plot of Numeric Metrics', y=1.02)
plt.show()


In [15]:
# Keep only the numeric columns; text columns cannot take part in a correlation
numeric_df = df.select_dtypes(include=[np.number])

# Guard first: the heatmap is drawn only when at least 4 numeric columns exist
if len(numeric_df.columns) < 4:
    print('Not enough numeric columns for correlation heatmap.')
else:
    # Pairwise correlation matrix of the numeric columns
    corr = numeric_df.corr()

    plt.figure(figsize=(8, 6))
    # annot/fmt print each coefficient in its cell with two decimals
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Numeric Variables')
    plt.show()


In [16]:


# Check for missing values in the dataframe
print('Missing values by column:')
print(df.isnull().sum())

# Fill missing numeric values with the column median.
# The filled column is assigned back to df instead of calling
# df[col].fillna(..., inplace=True): that chained in-place call operates on an
# intermediate Series, raises a FutureWarning on pandas 2.x, and leaves df
# unchanged when Copy-on-Write is enabled.
numeric_cols = ['Metric1', 'Metric2', 'Metric3', 'Metric4']
for col in numeric_cols:
    if df[col].isnull().any():
        # median() skips NaN, so it is computed from the observed values only
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f'Filled missing values in {col} with median: {median_val}')

# For non-numeric columns, a common approach is to fill missing values with 'Unknown'
for col in ['Report_Section', 'Dimension']:
    if df[col].isnull().any():
        df[col] = df[col].fillna('Unknown')
        print(f'Filled missing values in {col} with "Unknown"')

Missing values by column:
Report_Section      0
Dimension           0
Metric1             0
Metric2             0
Metric3           129
Metric4           177
dtype: int64
Filled missing values in Metric3 with median: 543.5
Filled missing values in Metric4 with median: 6.5


In [18]:
# Predictors and response for the regression
features = ['Metric1', 'Metric2', 'Metric3']
target = 'Metric4'

# Drop any row that still has a gap in a predictor or in the response.
# NOTE(review): the missing-value cell's printed output shows Metric4 had 177
# missing values filled with the median 6.5, so if that cell ran first the
# target is mostly one constant and this dropna removes nothing - interpret
# the score below with that in mind.
model_df = df.dropna(subset=features + [target])
X = model_df[features]
y = model_df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training portion (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Score the held-out predictions with R²
y_pred = lr_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R² score of the Linear Regression model: {r2:.4f}')

R² score of the Linear Regression model: -0.0218


In [19]:
# Permutation importance of the fitted model on the held-out data: each
# feature is shuffled 10 times (fixed seed for repeatability)
importances = permutation_importance(lr_model, X_test, y_test, n_repeats=10, random_state=42)

# Tabulate the mean importance per feature, ordered ascending so the largest
# bar is drawn at the top of the horizontal chart
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': importances.importances_mean}
).sort_values(by='importance')

# Horizontal bar chart of the mean importances
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(feature_importance['feature'], feature_importance['importance'], color='mediumpurple')
ax.set_xlabel('Mean Importance')
ax.set_title('Permutation Importance of Features')
plt.show()