In [2]:
import pandas as pd

# Load the cleaned sales dataset (CSV expected in the working directory)
df = pd.read_csv('cleaned_amazon_sales_data.csv')

# Initial exploration
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()  # Check data types and missing values
print(df.describe())  # Summary statistics
print(df.head())  # Inspect the first few rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Region          100 non-null    object 
 1   Country         100 non-null    object 
 2   Item_Type       100 non-null    object 
 3   Sales_Channel   100 non-null    object 
 4   Order_Priority  100 non-null    object 
 5   Order_Date      100 non-null    object 
 6   Order_ID        100 non-null    int64  
 7   Ship_Date       100 non-null    object 
 8   Units_Sold      100 non-null    int64  
 9   Unit_Price      100 non-null    float64
 10  Unit_Cost       100 non-null    float64
 11  Total_Revenue   100 non-null    float64
 12  Total_Cost      100 non-null    float64
 13  Total_Profit    100 non-null    float64
 14  Year            100 non-null    int64  
 15  Month           100 non-null    object 
dtypes: float64(5), int64(3), object(8)
memory usage: 12.6+ KB
None
           Order_I

In [3]:
# Parse the day-month-year date strings into datetime columns
date_format = '%d-%m-%Y'
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format=date_format)
df['Ship_Date'] = pd.to_datetime(df['Ship_Date'], format=date_format)

# Derive a monthly period (year + month) from the order date
df['Year-Month'] = df['Order_Date'].dt.to_period('M')

# Per-column count of missing values
missing_values = df.isna().sum()

# Revenue and profit totals, grouped three ways:
# by calendar month, by year, and by year-month period
sum_spec = {'Total_Revenue': 'sum', 'Total_Profit': 'sum'}
order_month = df['Order_Date'].dt.month
month_wise = df.groupby(order_month).agg(sum_spec).reset_index()
year_wise = df.groupby(df['Year']).agg(sum_spec).reset_index()
yearly_month_wise = df.groupby('Year-Month').agg(sum_spec).reset_index()

# Persist each aggregate table to its own CSV (index column omitted)
for table, csv_path in (
    (month_wise, 'month_wise_sales.csv'),
    (year_wise, 'year_wise_sales.csv'),
    (yearly_month_wise, 'yearly_month_wise_sales.csv'),
):
    table.to_csv(csv_path, index=False)


In [5]:
# Keep just the float and integer columns; the remaining columns are left out
# of the correlation calculation
numeric_df = df.select_dtypes(include=[float, int])

# Pairwise Pearson correlation between the selected columns
# (Pearson is the DataFrame.corr default, spelled out here for clarity)
correlation_matrix = numeric_df.corr(method='pearson')

# Write the matrix to disk, keeping the row labels as the first CSV column
correlation_matrix.to_csv(path_or_buf='correlation_matrix.csv')
