<font size="5"> Importing Libraries</font>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np



<font size="5">Reading CSV file </font>

In [None]:
# Location of the sample data inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/jetson-sample-datacsv/jetson-sample-data.csv'

# Load the CSV into the working DataFrame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

<font size="5"> Inspecting the dataset</font>

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Show the DataFrame dimensions as (number of rows, number of columns).
df.shape

<font size="5">Detecting null values </font>

In [None]:
# Count the missing (NaN/None) values in each column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the DataFrame's columns.
df.describe()

<font size="5">Dropping incorrect prices </font>

In [None]:
# Remove rows whose price is zero or negative.
# The original comparison was written `=<`, which is not a Python operator and
# raised a SyntaxError; the intended operator is `<=`.
# A boolean mask is used instead of dropping by index label so that rows sharing
# an index label with an invalid row are not removed by accident, and the cell
# gives the same result when it is re-run. Rows with a missing price are kept
# (NaN <= 0 is False), exactly as a label-based drop of `price <= 0` would do.
invalid_price = df['price'] <= 0
df = df.loc[~invalid_price].copy()  # .copy() so later column assignments act on an independent frame

In [None]:
# Show the summary statistics again, now that the price-filtering cell has run.
df.describe()

<font size="5">Detecting outliers </font>



Case 1: When the data is skewed (left/right)

In this case, we are dealing with a skewed dataset. We use the IQR (interquartile range, Q3 − Q1) to find the “upper” and “lower” boundaries; all data points outside these boundaries are considered outliers.

upper = Third quartile(Q3) + 1.5 * IQR

lower = First quartile(Q1) – 1.5 * IQR

Case 2: When the data is Normal/Gaussian Distributed

This is the case when we use the standard deviation to find the outliers. When the data is normally distributed, the IQR rule is unnecessary; instead, we calculate the “upper” and “lower” boundaries from the mean and the standard deviation, and all data points outside these boundaries are considered outliers.

upper = mean + 3 * standard deviation

lower = mean – 3 * standard deviation


In [None]:
# Box plot of the price column; whiskers reach 1.5 * IQR beyond the quartiles,
# and points drawn past the whiskers are the outlier candidates.
price_values = df['price']
plt.boxplot(price_values, whis=1.5)
plt.show()

<font size="5"> Treating outliers </font>


In [None]:
# Cap price outliers using the IQR rule for skewed data:
#   upper = Q3 + 1.5 * IQR
#   lower = Q1 - 1.5 * IQR
# The fences were previously centred on the mean (mean +/- 1.5 * IQR), which does
# not match the IQR rule and shifts both fences whenever the data is skewed.
p25 = df['price'].quantile(0.25)  # first quartile (Q1); quantile() ignores missing values
p75 = df['price'].quantile(0.75)  # third quartile (Q3)
iqr = p75 - p25

upper = p75 + 1.5 * iqr
lower = p25 - 1.5 * iqr

# Replace values beyond a fence with the fence itself (winsorising) rather than
# dropping the rows, so the number of rows is unchanged.
df['price'] = df['price'].clip(lower=lower, upper=upper)

# Re-draw the box plot to confirm that no points remain beyond the whiskers.
plt.boxplot(df['price'], whis=1.5)
plt.show()

In [None]:
# Print a concise summary of the DataFrame: column dtypes, non-null counts and memory usage.
df.info()

<font size="5"> Analysing sales of items </font>

In [None]:
# Distribution plot of the item_name column, with a KDE overlay requested.
item_names = df['item_name']
sns.displot(item_names, kde=True)

# Cap the y-axis at 5000 so the plot is drawn on a fixed vertical scale.
plt.ylim(0, 5000)
plt.show()

In [None]:
# Number of rows per item name, most frequent first.
# Uses the Series method because the top-level pd.value_counts() function is
# deprecated in recent pandas releases.
df2 = df['item_name'].value_counts()

<font size="5">Creating a dataframe sorted by count of items </font>

In [None]:
# Wrap the item counts held in df2 in a DataFrame and display it.
df2=pd.DataFrame(df2)
df2

<font size="5"> Sorting by date</font>

In [None]:
# Preview the earliest rows in chronological order.
# sort_values() returns a new DataFrame; its result was previously discarded, so
# the head() that followed showed the data in its original, unsorted order.
# The dates are parsed first so the ordering is chronological rather than
# alphabetical. `df` itself is left untouched by this preview.
df.assign(date=pd.to_datetime(df['date'])).sort_values(by='date').head()

In [None]:
# Parse the date column into datetimes and keep the frame ordered by date.
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(date=parsed_dates).sort_values(by='date')

<font size="5"> Plotting top 5 items sold per month </font>


In [None]:
# Bar chart of the five items with the highest total quantity sold in a
# user-selected month and year.
month = int(input("Month"))
months=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
# Validate before plotting: 0 would silently pick "December" through negative
# indexing, and values above 12 would only fail once the title is built.
if not 1 <= month <= len(months):
    raise ValueError(f"Month must be between 1 and {len(months)}, got {month}")
year = int(input("Year"))

# Rows whose date falls in the requested month of the requested year.
mont = df[(df['date'].dt.month == month) & (df['date'].dt.year == year)]

# Group data by item_name and sum up the quantity sold
item_sales = mont.groupby('item_name')['quantity'].sum()

# Sort the item_sales Series in descending order and get the top 5 items
top_items = item_sales.sort_values(ascending=False).head(5)
c = ['#2e4045', '#83adb5', '#c7bbc9', '#5e3c58', 'orange']

# Tick label sizes are set before the chart is created so that they apply to
# this figure; plt.rc changes global defaults and also affects later figures.
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=8.5)

# Plot a bar chart of the top 5 items
plt.bar(top_items.index, top_items.values, color = c)
plt.title(f'Top 5 Items Sold in {months[month-1]} {year}')
plt.xlabel('Item Name')
plt.ylabel('Quantity Sold')
plt.show()
    

<font size="5">Plotting  top 5 items sold in a year </font>

In [None]:
# Bar chart of the five items with the highest total quantity sold in a
# user-selected year.
year = int(input("Year"))

# Rows whose date falls in the requested year.
mont = df[(df['date'].dt.year == year)]

# Group data by item_name and sum up the quantity sold
item_sales = mont.groupby('item_name')['quantity'].sum()

# Sort the item_sales Series in descending order and get the top 5 items
top_items = item_sales.sort_values(ascending=False).head(5)
c = ['#2e4045', '#83adb5', '#c7bbc9', '#5e3c58', 'orange']

# Tick label sizes are set before the chart is created so that they apply to
# this figure; plt.rc changes global defaults and also affects later figures.
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=7)

# Plot a bar chart of the top 5 items
plt.bar(top_items.index, top_items.values, color = c)
plt.title(f'Top 5 Items Sold in {year}')
plt.xlabel('Item Name')
plt.ylabel('Quantity Sold')
plt.show()
    

<font size="5">Importing more libraries </font>

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

<font size="5">Plotting the best-selling bundles </font>

<font size="5"> </font>

In [None]:
# Find the item pairs most often bought together and plot them by support.

# One transaction per order: the list of item names sharing an order_id.
item_lists = df.groupby('order_id')['item_name'].apply(list).values.tolist()
numb = 20  # maximum number of bundles shown in the chart

# use TransactionEncoder to one-hot encode the item lists
te = TransactionEncoder()
te_ary = te.fit_transform(item_lists)

# convert the one-hot encoded data to a pandas DataFrame (one column per item)
df_onehot = pd.DataFrame(te_ary, columns=te.columns_)

# find frequent itemsets with support >= 0.01; max_len=2 limits them to
# single items and pairs
frequent_itemsets = apriori(df_onehot, min_support=0.01, use_colnames=True, max_len=2)

# discard single-item sets so that only bundles (pairs, given max_len=2) remain
bundles = frequent_itemsets[frequent_itemsets['itemsets'].apply(lambda x: len(x) >= 2)]

# sort by support and keep only the top `numb` bundles
top_bundles = bundles.sort_values(by='support', ascending=False).head(numb)

# Bar colours; matplotlib reuses them from the start if there are more bars than colours.
bar_colors = ["#fd7f6f", "#7eb0d5", "#b2e061", "#bd7ebe", "#ffb55a", "#ffee65", "#beb9db", "#fdcce5", "#8bd3c7","#1984c5", "#22a7f0", "#63bff0", "#a7d5ed", "#e2e2e2", "#e1a692", "#de6e56", "#e14b31", "#c23728", "orange"]

# Itemsets are frozensets with no defined order, so the names are sorted to
# give each bundle a stable label.
bundle_labels = [' & '.join(sorted(x)) for x in top_bundles['itemsets']]
positions = list(range(len(top_bundles)))

# plot a horizontal bar chart of the top bundles
fig, ax = plt.subplots(figsize=(8, 6))  # 8 in wide, 6 in tall
ax.barh(positions, top_bundles['support'], color=bar_colors)
ax.set_yticks(positions)
ax.set_yticklabels(bundle_labels)
ax.invert_yaxis()  # put the highest-support bundle at the top of the chart
ax.set_xlabel('Support')
ax.set_ylabel('Bundle')
ax.set_title(f'Top {numb} Best Selling Bundles')
plt.show()