In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset; the file is decoded as ISO-8859-1 instead of the default UTF-8.
# NOTE(review): no date parsing is requested here, so 'InvoiceDate' is presumably
# loaded as plain strings - confirm before comparing it against dates later on.
df = pd.read_csv('data.csv', encoding = 'ISO-8859-1')
# Preview the first rows of the loaded frame.
df.head()

In [None]:
def find_best_selling_products(dataframe, n=10):
    """Return the ``n`` products with the largest summed quantity.

    ``Quantity`` is summed per ``Description`` over every row that is passed
    in, so rows with a negative quantity lower a product's total.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``Description`` and ``Quantity`` columns.
    n : int, default 10
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        Summed ``Quantity`` indexed by ``Description``, largest total first.
    """
    totals = (
        dataframe
        .groupby('Description')['Quantity']
        .sum(numeric_only=False)
    )
    ranked = totals.sort_values(ascending=False)
    return ranked.head(n)

In [None]:
# Display the best-selling products using the default n=10.
find_best_selling_products(df)

In [None]:
def find_most_returned_products(dataframe, n=10):
    """Return the ``n`` products with the most negative summed quantity.

    Only rows whose ``Quantity`` is below zero are considered; their
    quantities are summed per ``Description``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``Description`` and ``Quantity`` columns.
    n : int, default 10
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        Summed (negative) ``Quantity`` indexed by ``Description``, most
        negative total first.
    """
    negative_rows = dataframe.loc[dataframe['Quantity'] < 0]
    totals = negative_rows.groupby('Description')['Quantity'].sum(numeric_only=False)
    # Ascending order puts the largest returned volume (most negative) on top.
    return totals.sort_values(ascending=True).head(n)

In [None]:
# Display the most returned products using the default n=10.
find_most_returned_products(df)

In [None]:
def find_best_customers(dataframe, n=10):
    """Return the ``n`` customers with the largest summed quantity.

    ``Quantity`` is summed per ``CustomerID`` over every row that is passed
    in, so rows with a negative quantity lower a customer's total.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``CustomerID`` and ``Quantity`` columns.
    n : int, default 10
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        Summed ``Quantity`` indexed by ``CustomerID``, largest total first.
    """
    totals = (
        dataframe
        .groupby('CustomerID')['Quantity']
        .sum(numeric_only=False)
    )
    ranked = totals.sort_values(ascending=False)
    return ranked.head(n)

In [None]:
# Display the customers with the largest summed quantity using the default n=10.
find_best_customers(df)

In [None]:
def find_most_returned_customers(dataframe, n=10):
    """Return the ``n`` customers with the most negative summed quantity.

    Only rows whose ``Quantity`` is below zero are considered; their
    quantities are summed per ``CustomerID``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``CustomerID`` and ``Quantity`` columns.
    n : int, default 10
        Number of entries to keep.

    Returns
    -------
    pandas.Series
        Summed (negative) ``Quantity`` indexed by ``CustomerID``, most
        negative total first.
    """
    negative_rows = dataframe.loc[dataframe['Quantity'] < 0]
    totals = negative_rows.groupby('CustomerID')['Quantity'].sum(numeric_only=False)
    # Ascending order puts the largest returned volume (most negative) on top.
    return totals.sort_values(ascending=True).head(n)

In [None]:
# Display the customers with the most returned quantity using the default n=10.
find_most_returned_customers(df)

In [None]:
def find_best_selling_products_by_country(dataframe, n=10):
    """Map each country to its ``n`` best-selling product descriptions.

    Rows whose ``Quantity`` is not strictly positive are dropped first (this
    removes returns). The remaining quantities are summed per
    ``(Country, Description)`` pair and ranked within each country.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``Country``, ``Description`` and ``Quantity`` columns.
    n : int, default 10
        Maximum number of products listed per country.

    Returns
    -------
    dict
        ``{country: [description, ...]}`` with descriptions ordered from the
        largest summed quantity downwards.
    """
    sales_only = dataframe.loc[dataframe['Quantity'] > 0]
    totals = (
        sales_only
        .groupby(['Country', 'Description'])[['Quantity']]
        .sum(numeric_only=False)
        .reset_index()
    )
    # Country ascending, then quantity descending, so that the first rows of
    # every country group are that country's best sellers.
    ranked = totals.sort_values(by=['Country', 'Quantity'], ascending=[True, False])
    return {
        country: rows.head(n)["Description"].tolist()
        for country, rows in ranked.groupby('Country')
    }

In [None]:
# Display the per-country best sellers using the default n=10.
find_best_selling_products_by_country(df)

In [None]:
def find_similar_products_countries(dataframe, n=10, reference_country='United Kingdom'):
    """Find countries whose top-``n`` products overlap with a reference country's.

    ``Quantity`` is summed per product for the reference country and per
    ``(Country, Description)`` pair for every country; no rows are filtered
    out, so negative quantities lower the totals. A country is reported when
    at least one of its top ``n`` products is also among the reference
    country's top ``n``.

    The reference country is not excluded from the comparison, so it will
    generally appear in the result with its own top products.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``Country``, ``Description`` and ``Quantity`` columns.
    n : int, default 10
        Number of top products considered per country.
    reference_country : str, default 'United Kingdom'
        Value of ``Country`` that every country is compared against.

    Returns
    -------
    dict
        ``{country: set_of_shared_descriptions}`` for countries with a
        non-empty overlap. Empty when ``reference_country`` has no rows.
    """
    reference_rows = dataframe[dataframe['Country'] == reference_country]
    if reference_rows.empty:
        # Nothing to compare against, so no country can share a product.
        return {}

    # Top n products of the reference country.
    reference_products = (
        reference_rows
        .groupby('Description')[['Quantity']]
        .sum(numeric_only=False)
        .nlargest(n, 'Quantity')
        .index.tolist()
    )

    # Top n products of every country: sort by country, then by descending
    # quantity, and keep the first n rows of each country group.
    countries_products = (
        dataframe
        .groupby(['Country', 'Description'])[['Quantity']]
        .sum(numeric_only=False)
        .reset_index()
        .sort_values(by=['Country', 'Quantity'], ascending=[True, False])
        .groupby('Country')
        .head(n)
    )

    similar_countries = {}
    for country, df_country in countries_products.groupby('Country'):
        common_products = set(df_country['Description']).intersection(reference_products)
        if common_products:
            similar_countries[country] = common_products
    return similar_countries

In [None]:
# Display the countries that share top products with the reference country, using the default n=10.
find_similar_products_countries(df)

In [None]:
def product_price_variation_percentage(dataframe, start_date, end_date, product_name=None):
    """Total revenue per product inside a date window, plus a relative-change column.

    Rows are kept when ``start_date <= InvoiceDate <= end_date``. The
    ``InvoiceDate`` column and both bounds are converted with
    ``pd.to_datetime`` before comparing, so the filter is chronological even
    when the column holds date strings. Values of ``InvoiceDate`` that cannot
    be parsed become ``NaT`` and are left out of the window.

    A date-only bound means midnight of that day, so an ``end_date`` of
    ``'2011-12-01'`` excludes invoices issued later on 2011-12-01.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``InvoiceDate``, ``Description``, ``Quantity`` and
        ``UnitPrice`` columns. It is not modified.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the window; anything ``pd.to_datetime`` accepts.
    product_name : str, optional
        When truthy, only rows whose ``Description`` equals it are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Description`` with columns ``TotalPrice`` (sum of
        ``Quantity * UnitPrice``) and ``PriceVariationPercentage``, sorted by
        the latter in descending order.
    """
    # Compare timestamps, not raw strings: lexicographic comparison of
    # unparsed date strings does not follow chronological order.
    invoice_dates = pd.to_datetime(dataframe['InvoiceDate'], errors='coerce')
    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    selected = dataframe[(invoice_dates >= window_start) & (invoice_dates <= window_end)]

    if product_name:
        selected = selected[selected['Description'] == product_name]

    # assign() builds a new frame, leaving the caller's data untouched.
    selected = selected.assign(TotalPrice=selected["Quantity"] * selected["UnitPrice"])
    price_variation = selected.groupby(["Description"])["TotalPrice"].sum(numeric_only=False).to_frame()

    # NOTE(review): shift(1) takes the previous row of the grouped frame, i.e.
    # the alphabetically preceding product, so this is a change relative to a
    # different product and not a change over time. Kept as-is to preserve the
    # output; confirm the intended metric.
    previous_total = price_variation["TotalPrice"].shift(1)
    relative_change = (price_variation["TotalPrice"] - previous_total) / previous_total
    # Division by a zero total gives +/-inf and the first row has no
    # predecessor (NaN); both are reported as 0.
    price_variation["PriceVariationPercentage"] = (
        relative_change.replace([np.inf, -np.inf], np.nan).fillna(0)
    )
    return price_variation.sort_values(by="PriceVariationPercentage", ascending=False)

In [None]:
# Price figures for every product (no product_name filter) between 2011-01-01 and 2011-12-01.
# NOTE(review): the end date here is '2011-12-01' while the sales-variation call
# further down uses '2011-12-31' - confirm the two windows are meant to differ.
product_price_variation_percentage(df, '2011-01-01', '2011-12-01')

In [None]:
def product_sales_variation_percentage(dataframe, start_date, end_date, product_name=None):
    """Summed quantity per product inside a date window, plus its share of the total.

    Rows are kept when ``start_date <= InvoiceDate <= end_date``. The
    ``InvoiceDate`` column and both bounds are converted with
    ``pd.to_datetime`` before comparing, so the filter is chronological even
    when the column holds date strings. Values of ``InvoiceDate`` that cannot
    be parsed become ``NaT`` and are left out of the window.

    A date-only bound means midnight of that day, so an ``end_date`` of
    ``'2011-12-31'`` excludes invoices issued later on 2011-12-31.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``InvoiceDate``, ``Description`` and ``Quantity`` columns.
        It is not modified.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the window; anything ``pd.to_datetime`` accepts.
    product_name : str, optional
        When truthy, only rows whose ``Description`` equals it are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Description`` with columns ``Quantity`` (summed) and
        ``SalesVariationPercentage``, sorted by the latter in descending
        order. Despite its name, that column is each product's fraction of
        the summed quantity of all selected products, not a change over time.
        If that grand total is zero the division result is left unchanged.
    """
    # Compare timestamps, not raw strings: lexicographic comparison of
    # unparsed date strings does not follow chronological order.
    invoice_dates = pd.to_datetime(dataframe['InvoiceDate'], errors='coerce')
    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    selected = dataframe[(invoice_dates >= window_start) & (invoice_dates <= window_end)]

    if product_name:
        selected = selected[selected['Description'] == product_name]

    sales_total = selected.groupby(["Description"])["Quantity"].sum().to_frame()
    # Fraction of the window's overall quantity contributed by each product.
    sales_total["SalesVariationPercentage"] = sales_total["Quantity"] / sales_total["Quantity"].sum()
    return sales_total.sort_values(by="SalesVariationPercentage", ascending=False)

In [None]:
# Quantity figures for every product (no product_name filter) between 2011-01-01 and 2011-12-31.
product_sales_variation_percentage(df, '2011-01-01', '2011-12-31')