# Step 4: Data Analysis and Visualisation

# 4.1 Univariate Analysis 

In [None]:
# Keep only the float64 and int64 columns of df in df_num, then preview its first rows.
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
# Draw one histogram per column of df_num (50 bins each) on a 16x20 figure,
# using small tick-label fonts so the grid of subplots stays readable.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8); # trailing ';' suppresses the notebook's text output for the returned axes

### 4.1.1 Sales Quantity 

In [None]:
# Number of rows for each distinct Quantity value
df['Quantity'].value_counts()

In [None]:
df_quantity_list = df['Quantity'] # The Quantity column selected from df (a single column, not a new DataFrame)

In [None]:
df_quantity_list.describe() # Summary statistics of the Quantity column

In [None]:
# Histogram of Quantity (30 bins) with a KDE curve overlaid
plt.figure(figsize=(10, 3))
sns.histplot(data=df, x='Quantity', bins=30, kde=True)
plt.gca().set(title='Distribution of Quantity', xlabel='Quantity', ylabel='Frequency')
plt.show()

In [None]:
# Single box plot of Quantity, used to spot outliers in that column.
plt.figure(figsize=(10, 3))
plt.boxplot(df['Quantity'])
plt.gca().set(xlabel="Quantity", ylabel="Value", title="Box Plot of Quantity")
plt.show()

In [None]:
# Empirical-rule (68-95-99.7) illustration built from the Quantity column
quantity = df['Quantity']

# Mean and standard deviation of Quantity
mean = np.mean(quantity)
std = np.std(quantity)

# Bounds of the bands lying 1, 2 and 3 standard deviations either side of the mean
lower_68 = mean - std
upper_68 = mean + std
lower_95 = mean - 2 * std
upper_95 = mean + 2 * std
lower_997 = mean - 3 * std
upper_997 = mean + 3 * std

# Size of figure
plt.figure(figsize=(10, 3))

# Draw 1000 random samples from a normal distribution with the same mean and std.
# NOTE(review): these are synthetic draws, not the observed Quantity values, and no
# random seed is set in this cell, so the histogram differs from run to run.
x = np.random.normal(mean, std, 1000)

# Density-normalised histogram of the synthetic samples
plt.hist(x, bins=50, density=True)

# Dashed red line at the mean
plt.axvline(mean, color='r', linestyle='--')

# Shade the 1-, 2- and 3-standard-deviation bands (green, blue, yellow)
plt.axvspan(lower_68, upper_68, alpha=0.5, color='g')
plt.axvspan(lower_95, upper_95, alpha=0.25, color='b')
plt.axvspan(lower_997, upper_997, alpha=0.125, color='y')

# Add labels and title
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Normal Distribution of Quantity with 68%, 95%, and 99.7% Indicated')

# Show the plot
plt.show()

### 4.1.2. Unit Price

In [None]:
# Number of rows for each distinct Price value
df['Price'].value_counts()

In [None]:
df_Price_list = df['Price']  # The Price column selected from df

In [None]:
df_Price_list.describe()  # Summary statistics of the Price column

In [None]:
# Histogram of Price (30 bins) with a KDE curve overlaid
plt.figure(figsize=(10, 3))
sns.histplot(data=df, x='Price', bins=30, kde=True)
plt.gca().set(title='Distribution of Price', xlabel='Price', ylabel='Frequency')
plt.show()

In [None]:
# Single box plot of Price, used to spot outliers in that column.
plt.figure(figsize=(10, 3))
plt.boxplot(df['Price'])
plt.gca().set(xlabel="Price", ylabel="Value", title="Box Plot of Price")
plt.show()

In [None]:
# Violin plot of the Price column
plt.figure(figsize=(10, 3))
sns.violinplot(data=df, x='Price')
plt.gca().set_title('Distribution of Price')
plt.show()

In [None]:
# Empirical-rule (68-95-99.7) illustration built from the Price column
price = df['Price']

# Mean and standard deviation of Price
mean = np.mean(price)
std = np.std(price)

# Bounds of the bands lying 1, 2 and 3 standard deviations either side of the mean
lower_68 = mean - std
upper_68 = mean + std
lower_95 = mean - 2 * std
upper_95 = mean + 2 * std
lower_997 = mean - 3 * std
upper_997 = mean + 3 * std

# Size of figure
plt.figure(figsize=(10, 3))

# Draw 1000 random samples from a normal distribution with the same mean and std.
# NOTE(review): these are synthetic draws, not the observed Price values, and no
# random seed is set in this cell, so the histogram differs from run to run.
x = np.random.normal(mean, std, 1000)

# Density-normalised histogram of the synthetic samples
plt.hist(x, bins=50, density=True)

# Dashed red line at the mean
plt.axvline(mean, color='r', linestyle='--')

# Shade the 1-, 2- and 3-standard-deviation bands (green, blue, yellow)
plt.axvspan(lower_68, upper_68, alpha=0.5, color='g')
plt.axvspan(lower_95, upper_95, alpha=0.25, color='b')
plt.axvspan(lower_997, upper_997, alpha=0.125, color='y')

# Add labels and title
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Normal Distribution of Unit Price with 68%, 95%, and 99.7% Indicated')

# Show the plot
plt.show()

### 4.1.3 Sales Revenue

In [None]:
# Number of rows for each distinct Revenue value
df['Revenue'].value_counts()

In [None]:
df_Revenue_list = df['Revenue']  # The Revenue column selected from df

In [None]:
df_Revenue_list.describe()  # Summary statistics of the Revenue column

In [None]:
# Histogram of Revenue (30 bins) with a KDE curve overlaid
plt.figure(figsize=(10, 3))
sns.histplot(data=df, x='Revenue', bins=30, kde=True)
plt.gca().set(title='Distribution of Revenue', xlabel='Revenue', ylabel='Frequency')
plt.show()

In [None]:
# Violin plot of the Revenue column
plt.figure(figsize=(10, 3))
sns.violinplot(data=df, x='Revenue')
plt.gca().set_title('Distribution of Revenue')
plt.show()

In [None]:
# Single box plot of Revenue, used to spot outliers in that column.
plt.figure(figsize=(10, 3))
plt.boxplot(df['Revenue'])
plt.gca().set(xlabel="Revenue", ylabel="Value", title="Box Plot of Revenue")
plt.show()

In [None]:
# Box plot of Revenue per spending band, intended to compare the spending habits of
# different groups (e.g. to inform targeted marketing campaigns or loyalty programmes).
# Each row of df gets a band from its own Revenue value, using the bin edges
# 0, 100, 500, 1000, 2000 and infinity; the band is stored in a new 'CustomerSegment' column of df.
# NOTE(review): the band is assigned per row of df, not per customer - confirm that
# row-level segmentation is what is intended. With pd.cut's default settings the first
# interval excludes its left edge, so any rows with Revenue <= 0 would get NaN - verify.
df['CustomerSegment'] = pd.cut(df['Revenue'], bins=[0, 100, 500, 1000, 2000 , np.inf], labels=['Low spenders', 'Medium spenders', 'High spenders', 'Very high spenders', 'Top spenders'])
plt.figure(figsize=(10, 3))
sns.boxplot(x='CustomerSegment', y='Revenue', data=df)
plt.xlabel('Customer segment')
plt.xticks(rotation=90)  # rotate the band names so they do not overlap
plt.ylabel('Order value')
plt.title('Boxplot of Revenue by customer segment')
plt.show()

In [None]:
# Empirical-rule (68-95-99.7) illustration built from the Revenue column
revenue = df['Revenue']

# Mean and standard deviation of Revenue
mean = np.mean(revenue)
std = np.std(revenue)

# Bounds of the bands lying 1, 2 and 3 standard deviations either side of the mean
lower_68 = mean - std
upper_68 = mean + std
lower_95 = mean - 2 * std
upper_95 = mean + 2 * std
lower_997 = mean - 3 * std
upper_997 = mean + 3 * std

# Size of figure
plt.figure(figsize=(10, 3))

# Draw 1000 random samples from a normal distribution with the same mean and std.
# NOTE(review): these are synthetic draws, not the observed Revenue values, and no
# random seed is set in this cell, so the histogram differs from run to run.
x = np.random.normal(mean, std, 1000)

# Density-normalised histogram of the synthetic samples
plt.hist(x, bins=50, density=True)

# Dashed red line at the mean
plt.axvline(mean, color='r', linestyle='--')

# Shade the 1-, 2- and 3-standard-deviation bands (green, blue, yellow)
plt.axvspan(lower_68, upper_68, alpha=0.5, color='g')
plt.axvspan(lower_95, upper_95, alpha=0.25, color='b')
plt.axvspan(lower_997, upper_997, alpha=0.125, color='y')

# Add labels and title
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Normal Distribution of Revenue with 68%, 95%, and 99.7% Indicated')

# Show the plot
plt.show()

### 4.1.4 - Month 

In [None]:
# Number of rows for each distinct Month value
df['Month'].value_counts()

In [None]:
# Frequency distribution of Month
plt.figure(figsize=(10, 3))
# Month takes whole-number values (it is filtered with 1 <= Month <= 12 elsewhere in this
# notebook), so use one unit-wide bar per month instead of 30 arbitrary bins, which would
# leave empty gaps between the bars.
sns.histplot(df['Month'], discrete=True, kde=True)
plt.title('Distribution of Month')
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Violin plot of the Month column
plt.figure(figsize=(10, 3))
sns.violinplot(data=df, x='Month')
plt.gca().set_title('Distribution of Month')
plt.show()

### 4.1.5 - Day

In [None]:
# Number of rows for each distinct Day value
df['Day'].value_counts()

In [None]:
# Frequency distribution of Day
plt.figure(figsize=(10, 3))
# One unit-wide bar per day value. With 30 equal-width bins, whole-number day values
# do not line up with the bin edges, so some bars would merge two days or stay empty.
# NOTE(review): assumes Day holds whole-number values - confirm.
sns.histplot(df['Day'], discrete=True, kde=True)
plt.title('Distribution of Day')
plt.xlabel('Day')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Violin plot of the Day column
plt.figure(figsize=(10, 3))
sns.violinplot(data=df, x='Day')
plt.gca().set_title('Distribution of Day')
plt.show()

### 4.1.6 - Hour

In [None]:
# Number of rows for each distinct Hour value
df['Hour'].value_counts()

In [None]:
# Frequency distribution of Hour
plt.figure(figsize=(10, 3))
# One unit-wide bar per hour value. With 30 equal-width bins, whole-number hour values
# do not line up with the bin edges, which leaves empty gaps between the bars.
# NOTE(review): assumes Hour holds whole-number values - confirm.
sns.histplot(df['Hour'], discrete=True, kde=True)
plt.title('Distribution of Hour')
plt.xlabel('Hour')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Violin plot of the Hour column
plt.figure(figsize=(10, 3))
sns.violinplot(data=df, x='Hour')
plt.gca().set_title('Distribution of Hour')
plt.show()

### 4.1.7. Product Description

In [None]:
# Number of rows for each distinct product Description
df['Description'].value_counts()

In [None]:
# Frequency distribution of Description
plt.figure(figsize=(10, 3))
# Description is a nominal (text) column: a KDE curve over arbitrarily ordered categories
# has no statistical meaning, so only the per-category counts are drawn.
# NOTE(review): this draws one bar per distinct description; if there are many products
# the axis will be unreadable - the top-ten count plot in the next cell is the readable view.
sns.histplot(df['Description'])
plt.title('Distribution of Description')
plt.xlabel('Description')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Horizontal count plot of the ten most frequent values of each listed categorical column
# (here: the ten most frequent product descriptions).
categorical_vars = ['Description']
for var in categorical_vars:
    plt.figure(figsize=(10, 3))
    sns.countplot(data=df, y=var, order=df[var].value_counts().index[:10])
    plt.gca().set(title=f'Top ten {var}', xlabel='Count', ylabel=var)
    plt.xticks(rotation=90)
    plt.show()

### 4.1.8. Country 

In [None]:
# Number of rows for each distinct Country
df['Country'].value_counts()

In [None]:
# Frequency distribution of Country
plt.figure(figsize=(10, 3))
# Country is a nominal (text) column: a KDE curve over arbitrarily ordered categories
# has no statistical meaning, so only the per-country counts are drawn.
sns.histplot(df['Country'])
plt.title('Distribution of Country')
plt.xlabel('Country')
plt.xticks(rotation=90)  # rotate the country names so they do not overlap
plt.ylabel('Frequency')
plt.show()

In [None]:
# Vertical count plot of the ten most frequent values of each listed categorical column
# (here: the ten countries with the most rows).
categorical_vars = ['Country']
for var in categorical_vars:
    plt.figure(figsize=(8, 3))
    sns.countplot(data=df, x=var, order=df[var].value_counts().index[:10])
    plt.gca().set(title=f'Top ten {var}', xlabel=var, ylabel='Count')
    plt.xticks(rotation=45)
    plt.show()

# 4.2. Bivariate Analysis

## 4.2.1 Numeric vs Numeric 

### 4.2.1.1. Quantity Vs Price

In [None]:
# Scatter plot of Quantity (x) against Price (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Quantity', y='Price')
plt.gca().set(xlabel='Quantity', ylabel='Price', title='Scatter Plot of Quantity vs. Price')
plt.show()

In [None]:
# Correlation matrix of Quantity and Price, shown as an annotated heatmap
attributes = ['Quantity','Price']
correlation_matrix = df.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Total Quantity sold at each distinct Price
grouped_df = df.groupby('Price')
total_quantity_per_price = grouped_df['Quantity'].sum()

# Bar chart: one bar per price value, bar height = summed quantity
plt.figure(figsize=(10, 3))
plt.bar(total_quantity_per_price.index, total_quantity_per_price.values)
plt.gca().set(xlabel='Price', ylabel='Quantity', title='Total Quantity by Unit Price')
plt.show()

### 4.2.1.2 Quantity Vs Revenue

In [None]:
# Correlation matrix of Quantity and Revenue, shown as an annotated heatmap
# (colour encodes the sign and strength of each coefficient).
attributes = ['Quantity', 'Revenue']
correlation_matrix = df.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap Quantity vs Revenue')
plt.show()

In [None]:
# Scatter plot of Quantity (x) against Revenue (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Quantity', y='Revenue')
plt.gca().set(xlabel='Quantity', ylabel='Revenue', title='Scatter Plot of Quantity vs. Revenue')
plt.show()


In [None]:

# Total Revenue for each distinct Quantity value
grouped_df = df.groupby('Quantity')
total_quantity_per_Revenue = grouped_df['Revenue'].sum()

# Bar chart: one bar per quantity value, bar height = summed revenue
plt.figure(figsize=(10, 3))
plt.bar(total_quantity_per_Revenue.index, total_quantity_per_Revenue.values)
plt.gca().set(xlabel='Quantity', ylabel='Revenue', title='Total Revenue by Quantity')
plt.show()

### 4.2.1.3 Quantity Vs Month

In [None]:
# Rows of df whose Year is 2010 (with Month between 1 and 12)
df_2010 = df.query('Month >= 1 and Month <= 12 and Year == 2010')

# Scatter plot of Quantity (x) against Month (y) for those rows
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df_2010, x='Quantity', y='Month')
plt.gca().set(xlabel='Quantity', ylabel='Month', title='Scatter Plot of Quantity vs. Month (2010)')
plt.show()

In [None]:
# Rows of df whose Year is 2011 (with Month between 1 and 12)
df_2011 = df.query('Month >= 1 and Month <= 12 and Year == 2011')

# Scatter plot of Quantity (x) against Month (y) for those rows
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df_2011, x='Quantity', y='Month')
plt.gca().set(xlabel='Quantity', ylabel='Month', title='Scatter Plot of Quantity vs. Month (2011)')
plt.show()

In [None]:
# Total Quantity per (Year, Month), built as a pivot table
Monthly_Quantity = df.pivot_table(values= ['Quantity'], index=['Year', 'Month'], aggfunc='sum')

# Move Year and Month out of the index into ordinary columns
df_Monthly_Quantity = Monthly_Quantity.reset_index()

print(df_Monthly_Quantity)

In [None]:
# Monthly total Quantity as grouped bars: one group per month, one bar colour per year
plt.figure(figsize=(10, 3))
sns.barplot(data=df_Monthly_Quantity, x='Month', y='Quantity', hue='Year')
plt.gca().set(xlabel='Month', ylabel='Sales Quantity', title='Sales Quantity trend over time - Monthly')
plt.show()

In [None]:
plt.figure(figsize=(10, 3))
# Line plot of the monthly totals; the x-axis is the row position in
# df_Monthly_Quantity (one row per Year/Month pair), not the month number itself.
df_Monthly_Quantity['Quantity'].plot()
plt.gca().set(xlabel='Month', ylabel='Sales Quantity', title='Sales Quantity trend over time - Monthly')
plt.show()

In [None]:
# Correlation matrix of the monthly Quantity totals and the Month number,
# shown as an annotated heatmap
attributes = ['Quantity','Month']
correlation_matrix = df_Monthly_Quantity.loc[:, attributes].corr()
plt.figure(figsize=(10, 3))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap between Quantity and Month')
plt.show()

### 4.2.1.4 Quantity Vs Day

In [None]:
# Scatter plot of Quantity (x) against Day (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Quantity', y='Day')
plt.gca().set(xlabel='Quantity', ylabel='Day', title='Scatter Plot of Quantity vs. Day')
plt.show()

In [None]:
# Total Quantity per (Year, Month, Day), built as a pivot table
Daily_Quantity = df.pivot_table(values= ['Quantity'], index=['Year', 'Month', 'Day'], aggfunc='sum')

# Move Year, Month and Day out of the index into ordinary columns
df_Daily_Quantity = Daily_Quantity.reset_index()

print(df_Daily_Quantity)

In [None]:
# Split the daily totals into one group per calendar month. Grouping by
# (Year, Month) rather than (Month, Year) makes the charts come out in
# chronological order, because the groups are iterated in sorted key order.
month_groups = df_Daily_Quantity.groupby(['Year', 'Month'])

# One bar chart of daily sales quantity per (year, month) group
for (year, month), group in month_groups:
    plt.figure(figsize=(10, 3))
    sns.barplot(x='Day', y='Quantity', data=group)

    # Set the labels and title
    plt.xlabel('Day')
    plt.ylabel('Sales Quantity')
    # The group key is a (year, month) tuple; format both parts explicitly
    # instead of printing the raw tuple in the title.
    plt.title(f'Sales Quantity trend over time for month no: {month} ({year})')

    # Show the plot
    plt.show()

In [None]:
plt.figure(figsize=(10, 3))
# Line plot of the daily totals; the x-axis is the row position in
# df_Daily_Quantity (one row per Year/Month/Day), not the day number itself.
df_Daily_Quantity['Quantity'].plot()
plt.gca().set(xlabel='Day', ylabel='Sales Quantity', title='Sales Quantity trend over time- Daily')
plt.show()

In [None]:
# Correlation matrix of the daily Quantity totals and the Day number,
# shown as an annotated heatmap
attributes = ['Quantity','Day']
correlation_matrix = df_Daily_Quantity.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap between Quantity and Day')
plt.show()

### 4.2.1.5 Quantity Vs Hour

In [None]:
# Scatter plot of Quantity (x) against Hour (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Quantity', y='Hour')
plt.gca().set(xlabel='Quantity', ylabel='Hour', title='Scatter Plot of Quantity vs. Hour')
plt.show()

In [None]:
# Total Quantity per (Year, Month, Day, Hour), built as a pivot table
Hourly_Quantity = df.pivot_table(values= ['Quantity'], index=['Year', 'Month', 'Day','Hour'], aggfunc='sum')

# Move Year, Month, Day and Hour out of the index into ordinary columns
df_Hourly_Quantity = Hourly_Quantity.reset_index()

print(df_Hourly_Quantity)

In [None]:
plt.figure(figsize=(10, 3))  # Set the figure size

# Line plot of the hourly Quantity totals. The aggregated 'Quantity' column is plotted
# here: plotting the 'Hour' key column would only draw the hour numbers themselves
# under a 'Sales Quantity' axis label.
# The x-axis is the row position in df_Hourly_Quantity (one row per Year/Month/Day/Hour).
df_Hourly_Quantity['Quantity'].plot()

# set labels and Titles
plt.xlabel('Hour')
plt.ylabel('Sales Quantity')
plt.title('Sales Quantity trend over time - Hourly')
plt.show()

In [None]:
# Correlation heatmap between the hourly Quantity totals and the Hour number
plt.figure(figsize=(10, 2))
attributes = ['Quantity','Hour']
correlation_matrix = df_Hourly_Quantity[attributes].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# Title spelling corrected ("between")
plt.title('Correlation Heatmap between Quantity and Hour')
plt.show()

### 4.2.1.6. Revenue Vs Price

In [None]:
# Scatter plot of Revenue (x) against Price (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Revenue', y='Price')
plt.gca().set(xlabel='Revenue', ylabel='Price', title='Scatter Plot of Revenue vs. Price')
plt.show()

In [None]:
# Correlation matrix of Revenue and Price, shown as an annotated heatmap
attributes = ['Revenue','Price']
correlation_matrix = df.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap between Revenue and Unit Price')
plt.show()

In [None]:
# Total Revenue earned at each distinct Price
grouped_df = df.groupby('Price')
total_revenue_per_price = grouped_df['Revenue'].sum()

# Bar chart: one bar per price value, bar height = summed revenue
plt.figure(figsize=(10, 3))
plt.bar(total_revenue_per_price.index, total_revenue_per_price.values)
plt.gca().set(xlabel='Price', ylabel='Revenue', title='Total Revenue by Unit Price')
plt.show()

### 4.2.1.7. Revenue Vs Month

In [None]:

# Rows of df whose Year is 2010 (with Month between 1 and 12)
df_2010 = df.query('Month >= 1 and Month <= 12 and Year == 2010')

# Scatter plot of Revenue (x) against Month (y) for those rows
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df_2010, x='Revenue', y='Month')
plt.gca().set(xlabel='Revenue', ylabel='Month', title='Scatter Plot of Revenue vs. Month (2010)')
plt.show()

In [None]:

plt.figure(figsize=(10, 3))  # Set the figure size

# Select only the rows for 2011. The result is stored in df_2011, the frame that is
# plotted below; storing it in df_2010 would overwrite the 2010 selection with 2011 data
# and make the plot depend on a df_2011 left over from an earlier cell.
df_2011 = df.query('Month >= 1 and Month <= 12 and Year == 2011')

# Create a scatter plot of the Revenue and Month columns in the new DataFrame
sns.scatterplot(x='Revenue', y='Month', data=df_2011)

# Add labels to the x- and y-axes
plt.xlabel('Revenue')
plt.ylabel('Month')

# Add a title to the plot
plt.title('Scatter Plot of Revenue vs. Month (2011)')

# Show the plot
plt.show()

In [None]:
# Total Revenue per (Year, Month), built as a pivot table
Monthly_Revenue = df.pivot_table(values= ['Revenue'], index=['Year', 'Month'], aggfunc='sum')

# Move Year and Month out of the index into ordinary columns
df_Monthly_Revenue = Monthly_Revenue.reset_index()

print(df_Monthly_Revenue)

In [None]:
# Monthly total Revenue as grouped bars: one group per month, one bar colour per year
plt.figure(figsize=(10, 3))
sns.barplot(data=df_Monthly_Revenue, x='Month', y='Revenue', hue='Year')
plt.gca().set(xlabel='Month', ylabel='Revenue', title='Sales Revenue trend over time - Monthly')
plt.show()

In [None]:
plt.figure(figsize=(10, 3))
# Line plot of the monthly totals; the x-axis is the row position in
# df_Monthly_Revenue (one row per Year/Month pair), not the month number itself.
df_Monthly_Revenue['Revenue'].plot()
plt.gca().set(xlabel='Month', ylabel='Revenue', title='Sales Revenue trend over time - Monthly')
plt.show()

In [None]:
# Correlation matrix of the monthly Revenue totals and the Month number,
# shown as an annotated heatmap
attributes = ['Revenue','Month']
correlation_matrix = df_Monthly_Revenue.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap between Revenue and Month')
plt.show()

### 4.2.1.8. Revenue Vs Day

In [None]:
# Scatter plot of Revenue (x) against Day (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Revenue', y='Day')
plt.gca().set(xlabel='Revenue', ylabel='Day', title='Scatter Plot of Revenue vs. Day')
plt.show()

In [None]:
# Total Revenue per (Year, Month, Day), built as a pivot table
Daily_Revenue = df.pivot_table(values= ['Revenue'], index=['Year', 'Month', 'Day'], aggfunc='sum')

# Move Year, Month and Day out of the index into ordinary columns
df_Daily_Revenue = Daily_Revenue.reset_index()

print(df_Daily_Revenue)

In [None]:
# Split the daily totals into one group per calendar month. Grouping by
# (Year, Month) rather than (Month, Year) makes the charts come out in
# chronological order, because the groups are iterated in sorted key order.
month_groups = df_Daily_Revenue.groupby(['Year', 'Month'])

# One bar chart of daily sales revenue per (year, month) group
for (year, month), group in month_groups:
    plt.figure(figsize=(10, 3))
    sns.barplot(x='Day', y='Revenue', data=group)

    # Set the labels and title
    plt.xlabel('Day')
    plt.ylabel('Sales Revenue')
    # The group key is a (year, month) tuple; format both parts explicitly
    # instead of printing the raw tuple in the title.
    plt.title(f'Sales Revenue trend over time for month no: {month} ({year})')

    # Show the plot
    plt.show()

In [None]:
plt.figure(figsize=(10, 3))
# Line plot of the daily totals; the x-axis is the row position in
# df_Daily_Revenue (one row per Year/Month/Day), not the day number itself.
df_Daily_Revenue['Revenue'].plot()
plt.gca().set(xlabel='Day', ylabel='Sales Revenue', title='Sales Revenue trend over time - Daily')
plt.show()

In [None]:
# Correlation matrix of the daily Revenue totals and the Day number,
# shown as an annotated heatmap
attributes = ['Revenue','Day']
correlation_matrix = df_Daily_Revenue.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

### 4.2.1.9. Revenue Vs Hour

In [None]:
# Scatter plot of Revenue (x) against Hour (y), one point per row of df
plt.figure(figsize=(10, 3))
sns.scatterplot(data=df, x='Revenue', y='Hour')
plt.gca().set(xlabel='Revenue', ylabel='Hour', title='Scatter Plot of Revenue vs. Hour')
plt.show()

In [None]:
# Total Revenue per (Year, Month, Day, Hour), built as a pivot table
Hourly_revenue = df.pivot_table(values= ['Revenue'], index=['Year', 'Month', 'Day','Hour'], aggfunc='sum')

# Move Year, Month, Day and Hour out of the index into ordinary columns
df_Hourly_revenue = Hourly_revenue.reset_index()

print(df_Hourly_revenue)

In [None]:
plt.figure(figsize=(10, 3)) # size of the plot

# Line plot of the hourly Revenue totals. The aggregated 'Revenue' column is plotted
# here: plotting the 'Hour' key column would only draw the hour numbers themselves
# under a 'Sales Revenue' axis label.
# The x-axis is the row position in df_Hourly_revenue (one row per Year/Month/Day/Hour).
df_Hourly_revenue['Revenue'].plot()

# Set the labels and title
plt.xlabel('Hour')
plt.ylabel('Sales Revenue')
plt.title('Sales Revenue trend over time - Hourly')
plt.show()

In [None]:
# Correlation matrix of the hourly Revenue totals and the Hour number,
# shown as an annotated heatmap
attributes = ['Revenue','Hour']
correlation_matrix = df_Hourly_revenue.loc[:, attributes].corr()
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap between Revenue and Hour')
plt.show()

## 4.2.2 Numeric vs Nominal

### 4.2.2.1 Quantity Vs Product Description

In [None]:
# The ten rows of df with the largest Quantity (largest single-row quantities)
df_High_Quantity_10 = df.sort_values('Quantity', ascending=False).head(10)

# Horizontal bar plot of Quantity per product Description for those rows
plt.figure(figsize=(10, 3))
sns.barplot(data=df_High_Quantity_10, x='Quantity', y='Description')
plt.gca().set(xlabel='Quantity', title='Top ten product with maximum quantity sold in single transaction')
plt.show()

In [None]:
# Total Quantity per product Description, built as a pivot table
Description_Quantity = df.pivot_table(values= ['Quantity'], index=['Description'], aggfunc='sum')

# Move Description out of the index into an ordinary column
df_Description_Quantity= Description_Quantity.reset_index()

print(df_Description_Quantity)

In [None]:
# The ten products with the largest total Quantity
df_Description_Quantity_Top10 = df_Description_Quantity.sort_values('Quantity', ascending=False).head(10)

# Horizontal bar plot of total Quantity per product
plt.figure(figsize=(10, 3))
sns.barplot(data=df_Description_Quantity_Top10, x='Quantity', y='Description')
plt.gca().set(xlabel='Sales Quantity', title='Top 10 Popular Products by Total sales quantity')
plt.show()

### 4.2.2.2 Quantity Vs Country

In [None]:
# Total Quantity per Country, built as a pivot table
Country_Quantity = df.pivot_table(values= ['Quantity'], index=['Country'], aggfunc='sum')

# Move Country out of the index into an ordinary column
df_Country_Quantity = Country_Quantity.reset_index()

print(df_Country_Quantity)

In [None]:
# The ten countries with the largest total Quantity
df_High_Quen_10 = df_Country_Quantity.sort_values('Quantity', ascending=False).head(10)

# Horizontal bar plot of total Quantity per country
plt.figure(figsize=(10, 3))
sns.barplot(data=df_High_Quen_10, x='Quantity', y='Country')
plt.gca().set(xlabel='Quantity', ylabel='Country', title='Top 10 Countries by Sales Quantity')
plt.show()


### 4.2.2.3. Revenue Vs Product Description

In [None]:
# The ten rows of df with the largest Revenue
df_High_Rev_10 = df.sort_values('Revenue', ascending=False).head(10)

# Horizontal bar plot of Revenue per product Description for those rows
plt.figure(figsize=(10, 3))
sns.barplot(data=df_High_Rev_10, x='Revenue', y='Description')
plt.gca().set(xlabel='Revenue', title='Top ten High Revenue products')
plt.show()

In [None]:
# Total Revenue per product Description, built as a pivot table
Description_Revenue = df.pivot_table(values= ['Revenue'], index=['Description'], aggfunc='sum')

# Move Description out of the index into an ordinary column
df_Description_Revenue= Description_Revenue.reset_index()


print(df_Description_Revenue)

In [None]:
# The ten products with the largest total Revenue
df_Description_Revenue_Top10 = df_Description_Revenue.sort_values('Revenue', ascending=False).head(10)

# Horizontal bar plot of total Revenue per product
plt.figure(figsize=(10, 3))
sns.barplot(data=df_Description_Revenue_Top10, x='Revenue', y='Description')
plt.gca().set(xlabel='Sales Revenue', title='Top 10 Popular Products by cumulative sales Revenue')
plt.show()

### 4.2.2.4 Revenue Vs Country

In [None]:
# Total Revenue per Country, built as a pivot table
Country_Revenue = df.pivot_table(values= ['Revenue'], index=['Country'], aggfunc='sum')

# Move Country out of the index into an ordinary column
df_Country_Revenue = Country_Revenue.reset_index()

print(df_Country_Revenue)

In [None]:
# The ten countries with the largest total Revenue
df_High_Reve_10 = df_Country_Revenue.sort_values('Revenue', ascending=False).head(10)

# Horizontal bar plot of total Revenue per country
plt.figure(figsize=(10, 3))
sns.barplot(data=df_High_Reve_10, x='Revenue', y='Country')
plt.gca().set(xlabel='Revenue', ylabel='Country', title='Top 10 Countries by Sales Revenue')
plt.show()

### 4.2.2.5 Unit Price Vs Product Description

In [None]:
# Mean Price per product Description, built as a pivot table
Product_AvePrice = df.pivot_table(values= ['Price'], index=['Description'], aggfunc='mean')

# Move Description out of the index into an ordinary column
df_Product_AvePrice = Product_AvePrice.reset_index()

print(df_Product_AvePrice)

In [None]:
# The ten products with the highest average Price
df_High_Price_10 = df_Product_AvePrice.sort_values('Price', ascending=False).head(10)

# Horizontal bar plot of average Price per product
plt.figure(figsize=(10, 3))
sns.barplot(data=df_High_Price_10, x='Price', y='Description')
plt.gca().set(xlabel='Average Unit Price', ylabel='Product ', title='Top 10 Products by Average Unit Price')
plt.show()

In [None]:
df_Product_AvePrice.describe()  # Summary statistics of the per-product average Price

In [None]:
# The ten products with the lowest average Price
df_Low_Price_10 = df_Product_AvePrice.sort_values('Price', ascending=True).head(10)

# Horizontal bar plot of average Price per product
plt.figure(figsize=(10, 3))
sns.barplot(data=df_Low_Price_10, x='Price', y='Description')
plt.gca().set(xlabel='Average Unit Price', ylabel='Product ', title='Least 10 Products by Average Unit Price')
plt.show()

In [None]:
# The ten rows of df with the highest Price.
# Note: this reuses the name df_High_Price_10, which an earlier cell assigned to the
# top ten products by *average* price; after this cell it holds row-level data instead.
df_High_Price_10 = df.sort_values('Price', ascending=False).head(10)

# Horizontal bar plot of Price per product Description for those rows
plt.figure(figsize=(10, 3))
sns.barplot(data=df_High_Price_10, x='Price', y='Description')
plt.gca().set(xlabel='Unit Price', title='Top 10 High unit price products')
plt.show()

In [None]:
# The ten rows of df with the lowest Price.
# Note: this reuses the name df_Low_Price_10, which an earlier cell assigned to the
# ten products with the lowest *average* price; after this cell it holds row-level data.
df_Low_Price_10 = df.sort_values('Price', ascending=True).head(10)

# Vertical bar plot of Price per product Description for those rows
plt.figure(figsize=(10, 3))
sns.barplot(data=df_Low_Price_10, x='Description', y='Price')
plt.gca().set(xlabel='Product', title='Top 10 low price products')
plt.show()

In [None]:
# Empirical-rule (68-95-99.7) illustration built from the per-product average Price
Aprice = df_Product_AvePrice['Price']

# Mean and standard deviation of the per-product average Price
mean = np.mean(Aprice)
std = np.std(Aprice)

# Bounds of the bands lying 1, 2 and 3 standard deviations either side of the mean
lower_68 = mean - std
upper_68 = mean + std
lower_95 = mean - 2 * std
upper_95 = mean + 2 * std
lower_997 = mean - 3 * std
upper_997 = mean + 3 * std

# Size of figure
plt.figure(figsize=(10, 3))

# Draw 1000 random samples from a normal distribution with the same mean and std.
# NOTE(review): these are synthetic draws, not the observed average prices, and no
# random seed is set in this cell, so the histogram differs from run to run.
x = np.random.normal(mean, std, 1000)

# Density-normalised histogram of the synthetic samples
plt.hist(x, bins=50, density=True)

# Dashed red line at the mean
plt.axvline(mean, color='r', linestyle='--')

# Shade the 1-, 2- and 3-standard-deviation bands (green, blue, yellow)
plt.axvspan(lower_68, upper_68, alpha=0.5, color='g')
plt.axvspan(lower_95, upper_95, alpha=0.25, color='b')
plt.axvspan(lower_997, upper_997, alpha=0.125, color='y')

# Add labels and title
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Normal Distribution of Average Unit Price of Product with 68%, 95%, and 99.7% Indicated')

# Show the plot
plt.show()

## 4.2.3  Nominal vs Nominal

### 4.2.3.1 Country Vs Product Description

In [None]:
# Create a contingency table and visualize it using a heatmap to analyze the relationship between 'Country' and 'Description'
contingency_table = pd.crosstab(df['Description'], df['Country'])  # Create the contingency table

plt.figure(figsize=(16, 30))  # Set the figure size

sns.heatmap(contingency_table, cmap='RdBu', vmin=0, vmax=100)  # Create the heatmap

# Set the labels and title
plt.title('Country vs. Description')  # Set the title
plt.xlabel('Country')  # Set the x-axis label
plt.ylabel('Description')  # Set the y-axis label
plt.show()  # Display the heatmap

In [None]:
# Number of distinct product descriptions appearing for each country
grouped_df = df.groupby('Country')
unique_products = grouped_df['Description'].nunique()

# Turn the per-country counts into a two-column DataFrame (Country, Unique Products)
df_unique = unique_products.reset_index(name='Unique Products')

print(df_unique)

In [None]:
# Narrow, tall canvas so each country gets its own bar row
plt.figure(figsize=(4, 10))

# Horizontal bars: one per country, bar length = number of unique products
sns.barplot(data=df_unique, y='Country', x='Unique Products')

# Heading and axis captions
plt.title('Unique products by Country')
plt.ylabel('Country')
plt.xlabel('Unique Products')
plt.show()

# 4.3 Multivariate analysis

### 4.3.1 Monthly - Quantity - Revenue

In [None]:
# Attach the 'Revenue' column of df_Monthly_Revenue to df_Monthly_Quantity.
# join() matches rows on the index; how='outer' keeps index labels from either frame.
# NOTE(review): assumes both frames are indexed consistently - confirm where they are built.
df_Monthly = df_Monthly_Quantity.join(
    other=df_Monthly_Revenue.loc[:, ['Revenue']],
    how='outer',
)

# Show the combined table
print(df_Monthly)

In [None]:
# Keep only the rows for year 2010 (months 1 through 12)
df_2010 = df_Monthly.query('Month >= 1 and Month <= 12 and Year == 2010')

# Wide, short canvas for the plot
plt.figure(figsize=(10, 3))

# One point per remaining row: Quantity on the x-axis, Revenue on the y-axis
sns.scatterplot(data=df_2010, x='Quantity', y='Revenue')

# Heading and axis captions
plt.title('Scatter Plot of Quantity vs. Revenue in Month(2010)')
plt.xlabel('Quantity')
plt.ylabel('Revenue')

# Render the figure
plt.show()

In [None]:
# Keep only the rows for year 2011 (months 1 through 12)
df_2011 = df_Monthly.query('Month >= 1 and Month <= 12 and Year == 2011')

# Wide, short canvas for the plot
plt.figure(figsize=(10, 3))

# One point per remaining row: Quantity on the x-axis, Revenue on the y-axis
sns.scatterplot(data=df_2011, x='Quantity', y='Revenue')

# Heading and axis captions
plt.title('Scatter Plot of Quantity vs. Revenue in Month(2011)')
plt.xlabel('Quantity')
plt.ylabel('Revenue')

# Render the figure
plt.show()

In [None]:
# Group the rows of df by calendar period (Year, then Month)
grouped_df = df.groupby(['Year', 'Month'])

# Correlation between Quantity and Revenue within each (Year, Month) group.
# The two columns are selected before apply() so the function only receives
# the data it needs and never the grouping columns; pandas >= 2.2 emits a
# DeprecationWarning when apply() operates on the grouping columns.
correlations = grouped_df[['Quantity', 'Revenue']].apply(
    lambda group: group['Quantity'].corr(group['Revenue'])
)

# Show one coefficient per (Year, Month) pair
print(correlations)

In [None]:
# Pairwise correlations between the 'Quantity', 'Revenue' and 'Month' columns
# of the monthly table, drawn as an annotated heatmap.
attributes = ['Quantity','Revenue','Month']
correlation_matrix = df_Monthly.loc[:, attributes].corr()

# Wide, short canvas; each cell shows its coefficient (annot=True) and the
# 'coolwarm' palette colours the cells by value
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap - Month vs Quantity vs Revenue')
plt.show()

### 4.3.2  Country - Quantity- Revenue

In [None]:
# Attach the 'Revenue' column of df_Country_Revenue to df_Country_Quantity.
# join() matches rows on the index; how='outer' keeps index labels from either frame.
# NOTE(review): assumes both frames are indexed consistently - confirm where they are built.
df_Country = df_Country_Quantity.join(
    other=df_Country_Revenue.loc[:, ['Revenue']],
    how='outer',
)

# Show the combined table
print(df_Country)

In [None]:
# Wide, short canvas for the plot
plt.figure(figsize=(10, 3))

# One point per row of df_Country: Revenue on the x-axis, Quantity on the y-axis
sns.scatterplot(data=df_Country, x='Revenue', y='Quantity')

# Heading and axis captions
plt.title('Scatter Plot of Revenue vs. Quantity - Country based')
plt.xlabel('Revenue')
plt.ylabel('Quantity')

# Render the figure
plt.show()

In [None]:
# Correlation heatmap for the 'Quantity' and 'Revenue' columns of the
# per-country table; colour encodes the strength and sign of each coefficient.
plt.figure(figsize=(10, 2))
attributes = ['Quantity','Revenue']
correlation_matrix = df_Country[attributes].corr()

# annot=True writes the coefficient inside every cell
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Title typo fixed ('Quanity' -> 'Quantity') and stray double space removed
plt.title('Correlation Heatmap between Quantity and Revenue - Country based')
plt.show()

In [None]:
# Group the rows of df by 'Country' (country only; product is not a grouping key)
grouped_df = df.groupby(['Country'])

# Correlation between Quantity and Revenue within each country.
# The two columns are selected before apply() so the function only receives
# the data it needs and never the grouping column; pandas >= 2.2 emits a
# DeprecationWarning when apply() operates on the grouping columns.
correlations = grouped_df[['Quantity', 'Revenue']].apply(
    lambda group: group['Quantity'].corr(group['Revenue'])
)

# Reshape the per-country coefficients into a two-column table
df_correlations = pd.DataFrame({'Country': correlations.index,
                               'Correlation': correlations.values})

# Show the resulting table
print(df_correlations)

In [None]:
# Narrow, tall canvas so each country gets its own bar row
plt.figure(figsize=(4, 10))

# Horizontal bars: one per country, bar length = its Quantity/Revenue correlation
sns.barplot(data=df_correlations, y='Country', x='Correlation')

# Heading and axis captions
plt.title('Correlation between Quantity and Revenue within Country')
plt.ylabel('Country')
plt.xlabel('Correlation')
plt.show()

In [None]:
# Grid of pairwise plots for the 'Quantity' and 'Revenue' columns of df_Country
sns.pairplot(data=df_Country.loc[:, ['Quantity', 'Revenue']])

# Render the grid
plt.show()

### 4.3.3 Product - Quantity - Revenue 

In [None]:
# Attach the 'Revenue' column of df_Description_Revenue to df_Description_Quantity.
# join() matches rows on the index; how='outer' keeps index labels from either frame.
# NOTE(review): assumes both frames are indexed consistently - confirm where they are built.
df_Description = df_Description_Quantity.join(
    other=df_Description_Revenue.loc[:, ['Revenue']],
    how='outer',
)

# Show the combined table
print(df_Description)

In [None]:
# Wide, short canvas for the plot
plt.figure(figsize=(10, 3))

# One point per row of df_Description: Revenue on the x-axis, Quantity on the y-axis
sns.scatterplot(data=df_Description, x='Revenue', y='Quantity')

# Heading and axis captions
plt.title('Scatter Plot of Revenue vs. Quantity - Product based')
plt.xlabel('Revenue')
plt.ylabel('Quantity')

# Render the figure
plt.show()

In [None]:
# Pairwise correlations between the 'Quantity' and 'Revenue' columns of the
# per-product table, drawn as an annotated heatmap.
attributes = ['Quantity','Revenue']
correlation_matrix = df_Description.loc[:, attributes].corr()

# Wide, short canvas; each cell shows its coefficient (annot=True) and the
# 'coolwarm' palette colours the cells by value
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap of product based Quantity and Revenue')
plt.show()

In [None]:
# Grid of pairwise plots over the columns of df_Description
sns.pairplot(data=df_Description)

# Render the grid
plt.show()

In [None]:
# Group the rows of df by product 'Description'
grouped_df = df.groupby('Description')

# Correlation between Quantity and Revenue within each product group.
# The two columns are selected before apply() so the function only receives
# the data it needs and never the grouping column; pandas >= 2.2 emits a
# DeprecationWarning when apply() operates on the grouping columns.
correlations = grouped_df[['Quantity', 'Revenue']].apply(
    lambda group: group['Quantity'].corr(group['Revenue'])
)

# Up to 10 descriptions with a correlation above 0.9, and up to 10 below -0.9.
# NOTE: [:10] keeps the first ten matches in index order, not the ten most
# extreme coefficients.
highly_positive_10 = correlations[correlations > 0.9].index.tolist()[:10]
highly_negative_10 = correlations[correlations < -0.9].index.tolist()[:10]

# Up to 10 descriptions whose correlation lies strictly between -0.1 and 0.1
no_relationship_10 = correlations[(correlations < 0.1) & (correlations > -0.1)].index.tolist()[:10]

# Show the three lists
print('Highly positive:')
print(highly_positive_10)
print('Highly negative:')
print(highly_negative_10)
print('No relationship:')
print(no_relationship_10)

### 4.3.4 Product - Av Unit Price - Av Quantity

In [None]:
# Mean 'Quantity' and mean 'Price' for every product 'Description'
Product_AverageQ = df.pivot_table(
    index=['Description'],
    values=['Quantity', 'Price'],
    aggfunc='mean',
)

# Move 'Description' out of the index into an ordinary column
df_Product_AverageQ = Product_AverageQ.reset_index()

# Show the per-product averages
print(df_Product_AverageQ)

In [None]:
# Pairwise correlations between the per-product 'Quantity' and 'Price'
# averages, drawn as an annotated heatmap.
attributes = ['Quantity','Price']
correlation_matrix = df_Product_AverageQ.loc[:, attributes].corr()

# Wide, short canvas; each cell shows its coefficient (annot=True) and the
# 'coolwarm' palette colours the cells by value
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap of product based Average Quantity and Average price')
plt.show()

In [None]:
# Grid of pairwise plots over the columns of df_Product_AverageQ
sns.pairplot(data=df_Product_AverageQ)

# Render the grid
plt.show()

### 4.3.5 Product - Av Unit Price - Av Revenue

In [None]:
# Mean 'Revenue' and mean 'Price' for every product 'Description'
Product_AverageR = df.pivot_table(
    index=['Description'],
    values=['Revenue', 'Price'],
    aggfunc='mean',
)

# Move 'Description' out of the index into an ordinary column
df_Product_AverageR = Product_AverageR.reset_index()

# Show the per-product averages
print(df_Product_AverageR)

In [None]:
# Pairwise correlations between the per-product 'Revenue' and 'Price'
# averages, drawn as an annotated heatmap.
attributes = ['Revenue','Price']
correlation_matrix = df_Product_AverageR.loc[:, attributes].corr()

# Wide, short canvas; each cell shows its coefficient (annot=True) and the
# 'coolwarm' palette colours the cells by value
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap of product based Average Revenue and Average price')
plt.show()

In [None]:
# Grid of pairwise plots over the columns of df_Product_AverageR
sns.pairplot(data=df_Product_AverageR)

# Render the grid
plt.show()

### 4.3.6 Product - Av Quantity - Av Revenue

In [None]:
# Mean 'Revenue' and mean 'Quantity' for every product 'Description'
Product_AverageQR = df.pivot_table(
    index=['Description'],
    values=['Revenue', 'Quantity'],
    aggfunc='mean',
)

# Move 'Description' out of the index into an ordinary column
df_Product_AverageQR = Product_AverageQR.reset_index()

# Show the per-product averages
print(df_Product_AverageQR)

In [None]:
# Pairwise correlations between the per-product 'Revenue' and 'Quantity'
# averages, drawn as an annotated heatmap.
attributes = ['Revenue','Quantity']
correlation_matrix = df_Product_AverageQR.loc[:, attributes].corr()

# Wide, short canvas; each cell shows its coefficient (annot=True) and the
# 'coolwarm' palette colours the cells by value
plt.figure(figsize=(10, 2))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap of product based Average Revenue and Average Quantity')
plt.show()

In [None]:
# Grid of pairwise plots over the columns of df_Product_AverageQR
sns.pairplot(data=df_Product_AverageQR)

# Render the grid
plt.show()

 --------------------------------------------------------------------------------------------------------End ---------------------------------------------------------------------------------------------------