In [None]:
# Importing the libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# visualization
import seaborn as sns

# Importing the dataset.
# The location defaults to the original author's local file, but it can be
# overridden through the CLEANED_CSV_PATH environment variable so the notebook
# also runs on machines where that absolute path does not exist.
DATA_PATH = os.environ.get('CLEANED_CSV_PATH', r'D:\Download/cleaned.csv')
dataset = pd.read_csv(DATA_PATH)

# list of first five rows
dataset.head()

In [None]:
# Show the DataFrame itself (rendered by the notebook as the cell's last expression)
dataset

In [None]:
# Peek at the final five records of the dataset
dataset.tail(n=5)

In [None]:
# shape of the dataset as a (rows, columns) tuple
dataset.shape

**Inference:** Dataset comprises 45166 Rows and 12 columns.

In [None]:
# Print pandas' concise summary of the DataFrame; used below to check for missing values
dataset.info()

**Inference:** No column has null or missing values.

In [None]:
# The statistical summary of the dataset (describe() called with its default arguments).
dataset.describe()

In [None]:
# The statistical summary of the 'rating' column only.
dataset['rating'].describe()

**Inference:** The statistical summary of the dataset gives us the following information:
1. The mean rating is **4.2**.
2. The minimum rating is **1**.
3. The maximum rating is **5**.
4. The standard deviation of the ratings is **1.22**.
5. The 25th percentile of the ratings is **4**.
6. The 50th percentile of the ratings is **5**.
7. The 75th percentile of the ratings is **5**.

In [None]:
# the number of unique values in each column of the dataset.
dataset.nunique()

#### Dealing With Missing Values

In [None]:
# check for missing values: count the null entries in each column
dataset.isnull().sum()

**Inference:** No column has null values.

In [None]:
# Group by year and sum the sales amounts
sales_per_year = dataset.groupby('year')['amount'].sum().reset_index()

# Find the year with the highest sales
best_row_label = sales_per_year['amount'].idxmax()
highest_sales_year = sales_per_year.loc[best_row_label]

# Read both values straight from their own columns: selecting a whole row with
# .loc returns a Series with one common dtype, so an integer year sitting next
# to a float amount would be upcast and print as e.g. "2015.0".
best_year = sales_per_year.at[best_row_label, 'year']
best_amount = sales_per_year.at[best_row_label, 'amount']

# Display the result, with the amount formatted as currency
print(f"The highest sales year is {best_year} with total sales of ${best_amount:,.2f}.")


In [None]:
# Total Revenue: the sum of every value in the 'amount' column
total_revenue = dataset.loc[:, 'amount'].sum()
print(total_revenue)

### **Data Visualizations**

#### What was the best year of sales?

In [None]:
# Number of 'amount' entries recorded for each year
grouped_data = dataset.groupby('year')['amount'].count()

# Draw the bars on an explicitly created figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(grouped_data.index, grouped_data.values)

# Title and axis labels
ax.set_title('Year Wise Sales')
ax.set_xlabel('Year')
ax.set_ylabel('Count of Sales')

# Render the figure
plt.show()

**Inference:**
1. From the graph we just plotted we can see that year 2015 had the best sales out of all years.
2. Sales rose steadily from 2007 to 2015, declined slightly in 2016, and then fell sharply in 2017 and 2018.


#### Which was the best month for sales between 2015 to 2018?

In [None]:
# Keep only the rows whose year lies in 2015..2018 (both ends included)
in_period = dataset['year'].between(2015, 2018)
dataset_2015_2018 = dataset[in_period]

# Number of ratings recorded in each month of that period
grouped_data = dataset_2015_2018.groupby('month')['rating'].count()

# One bar per month on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(grouped_data.index, grouped_data.values)
ax.set(
    title='Month Wise Rating Count (2015-2018)',
    xlabel='Month',
    ylabel='Count of Ratings',
)

# Render the figure
plt.show()

**Inference:** *January* was the month with the most sales (measured by the number of ratings) across all product categories between 2015 and 2018.

#### What brand sold the most in the highest selling years (2015 to 2018)?

In [None]:
# Restrict the data to the years 2015 through 2018 (inclusive)
dataset_2015_2018 = dataset[dataset['year'].between(2015, 2018)]

# Total 'amount' per brand, largest first, trimmed to the ten biggest
amount_by_brand = dataset_2015_2018.groupby('brand')['amount'].sum()
grouped_data = amount_by_brand.sort_values(ascending=False).head(10)

# Bar chart of those ten brands
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(grouped_data.index, grouped_data.values)
ax.set_title('Brand Wise Top 10 Sales (2015 to 2018)')
ax.set_xlabel('Brand')
ax.set_ylabel('Total Sales Amount')

# Render the figure
plt.show()

**Inference:** *Bose* was the brand with the most sales in 2015 to 2018 followed by Logitech.

#### Which brands sold the most in each of the four years 2015, 2016, 2017 & 2018?

In [None]:
# One panel per year, laid out on a 2x2 grid (row-major: 2015, 2016 / 2017, 2018)
years = (2015, 2016, 2017, 2018)
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# Top 10 brands by number of ratings for each year, keyed by year.
# A single loop replaces four copy-pasted stanzas that differed only in the year.
top_selling_by_year = {}
for ax, year in zip(axs.flat, years):
    top_selling = (
        dataset[dataset['year'] == year]
        .groupby('brand')['rating']
        .count()
        .sort_values(ascending=False)
        .head(10)
    )
    top_selling_by_year[year] = top_selling

    ax.bar(top_selling.index, top_selling.values)
    # The bars are brands (the data is grouped by 'brand'), so the title says so
    ax.set_title(f'Top 10 Selling Brands in {year}')
    ax.set_xlabel('Brand')
    ax.set_ylabel('Number of Ratings')
    ax.tick_params(axis='x', rotation=45)

# Keep the per-year variable names available for any later cell that uses them
top_selling_2015, top_selling_2016, top_selling_2017, top_selling_2018 = (
    top_selling_by_year[year] for year in years
)

# Adjust layout for better appearance
plt.tight_layout()

# Show the plots
plt.show()

**Inference:**
1. One brand consistently had the most sales in all 4 years: Bose.
2. The second most sold brand in every one of those years was Logitech.
* 2015 (Bose and Logitech)
* 2016 (Bose and Logitech)
* 2017 (Bose and Logitech)
* 2018 (Bose and Logitech)

#### What product by category sold the most between 2015 to 2018?

In [None]:
# Rows from 2015 through 2018 (inclusive on both ends)
dataset2015_2018 = dataset[dataset['year'].between(2015, 2018)]

# Total 'amount' per category, ordered from largest to smallest, first ten kept
amount_by_category = dataset2015_2018.groupby('category')['amount'].sum()
top_categories = amount_by_category.sort_values(ascending=False).head(10)

# Bar chart with slanted category names so they stay readable
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_categories.index, top_categories.values)
ax.set(
    title='Top 10 Most Sold Product Categories (2015 to 2018)',
    xlabel='Product Category',
    ylabel='Total Sales Amount',
)
ax.tick_params(axis='x', rotation=45)

# Tighten the layout, then render
fig.tight_layout()
plt.show()

**Inference:** We can see that the *Headphones* category sold the most, *Computers & Accessories* sold the second most, and *Cameras* sold the third most.

#### What product by category sold the least between 2015 to 2018?

In [None]:
# Rows from 2015 through 2018 (inclusive on both ends)
dataset2015_2018 = dataset[dataset['year'].between(2015, 2018)]

# Total 'amount' per category, ordered from smallest to largest, first ten kept.
# Despite its name, top_categories holds the ten LOWEST totals in this cell.
amount_by_category = dataset2015_2018.groupby('category')['amount'].sum()
top_categories = amount_by_category.sort_values(ascending=True).head(10)

# Bar chart with slanted category names so they stay readable
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_categories.index, top_categories.values)
ax.set_title('Top 10 Least Sold Product Categories (2015 to 2018)')
ax.set_xlabel('Product Category')
ax.set_ylabel('Total Sales Amount')
ax.tick_params(axis='x', rotation=45)

# Tighten the layout, then render
fig.tight_layout()
plt.show()

**Inference:** We can see that the *Wearable Technology* category sold the least, followed closely by *Security & Surveillance*.

#### What product by brand name sold the least between 2015 to 2018?

In [None]:
# Rows from 2015 through 2018 (inclusive on both ends)
dataset2015_2018 = dataset[dataset['year'].between(2015, 2018)]

# Total 'amount' per brand, smallest totals first, ten brands kept
amount_by_brand = dataset2015_2018.groupby('brand')['amount'].sum()
least_selling_brands = amount_by_brand.sort_values(ascending=True).head(10)

# Bar chart of the ten lowest-selling brands
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(least_selling_brands.index, least_selling_brands.values)
ax.set(
    title='10 Least Sold Product Brands (2015 to 2018)',
    xlabel='Brand',
    ylabel='Total Sales Amount',
)
ax.tick_params(axis='x', rotation=45)

# Tighten the layout, then render
fig.tight_layout()
plt.show()

**Inference:** *Archos* sold the least, followed closely by *EINCAR*.

#### **Ratings Distribution**

In [None]:
# Rows from 2015 through 2018 (inclusive on both ends)
dataset2015_2018 = dataset[dataset['year'].between(2015, 2018)]

# Mean 'rating' per brand, highest averages first, ten brands kept
mean_rating_by_brand = dataset2015_2018.groupby('brand')['rating'].mean()
top_rated_brands = mean_rating_by_brand.sort_values(ascending=False).head(10)

# Bar chart of the ten best-rated brands
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(top_rated_brands.index, top_rated_brands.values)
ax.set_title('10 Most Highly Rated Brands (2015 to 2018)')
ax.set_xlabel('Brand')
ax.set_ylabel('Average Rating')
ax.tick_params(axis='x', rotation=45)

# Tighten the layout, then render
fig.tight_layout()
plt.show()

**Inference:** *Savage* and *Plemo* were the brands with the highest ratings.

#### **Top 5 category sales percentage**

In [None]:
# Grouping the dataset by 'category' and summing the 'amount', then keeping the
# five largest totals (sorted in descending order)
top_categories = dataset.groupby('category')['amount'].sum().sort_values(ascending=False).head(5)

# Creating the pie chart.
# NOTE: only the five totals above are passed to the chart, so each percentage
# is a category's share of those five combined, not of all sales in the dataset.
plt.figure(figsize=(6,6))
plt.pie(top_categories, labels=top_categories.index, autopct='%1.1f%%', startangle=90)

# Adding title
plt.title('Top 5 Category Sales Percentage')

# Display the plot
plt.show()


**Inference:** *Headphones* has the highest share of sales among the top 5 categories, followed by *Computers & Accessories*.

#### **Brand wise sales percentage**

In [None]:
# Number of ratings per brand, most-rated first, five brands kept
ratings_per_brand = dataset.groupby('brand')['rating'].count()
top_brands = ratings_per_brand.sort_values(ascending=False).head(5)

# Pie chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    top_brands,
    labels=top_brands.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Top 5 Brand Wise Sales Percentage')

# Render the figure
plt.show()

**Inference:** *Bose* and *Logitech* have the highest shares of sales among the top 5 brands, followed by *Sony*.

#### Gender wise customer distribution

In [None]:
# How many rows carry each distinct 'gender' value
gender_distribution = dataset['gender'].value_counts()

# Pie chart on an explicit figure/axes pair; the two colours are applied in the
# order the genders appear in gender_distribution
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    gender_distribution,
    labels=gender_distribution.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#A94064', '#008ECC'],
)
ax.set_title('Gender Wise Customer Distribution')

# Render the figure
plt.show()

**Inference:** Most of the customers are *Female*.

#### **Conclusion:**
* **2015** was the best year in terms of sales (profit was not analysed in this notebook).
* **Headphones** was the category with the most sales, followed closely by Computers & Accessories, while the least sales were made in the category Wearable Technology, followed closely by Security & Surveillance.
* There has been a steady rise in sales from 2007 to 2015 and a sharp decline from 2016 to 2018.
* The brand name **Bose** sold the most followed by Logitech.
* The brand **Archos** sold the least, followed closely by EINCAR.
* Most products were rated **5**.
* Best rated brands were **Savage** and **Plemo**.
