In [43]:
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
# Load the transactions file. The encoding is given explicitly as ISO-8859-1
# instead of relying on pandas' default (UTF-8).
df = pd.read_csv('data.csv', encoding = 'ISO-8859-1')
# Preview the first rows to check the columns that the analysis functions use
# (Description, Quantity, CustomerID, Country).
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [45]:
def find_best_selling_products(dataframe, n=10):
    """Return the ``n`` products with the highest total quantity.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Description`` and ``Quantity``.
    n : int, default 10
        Number of products to return.

    Returns
    -------
    pandas.Series
        Summed ``Quantity`` indexed by ``Description``, sorted from largest
        to smallest and truncated to ``n`` entries. Negative quantities are
        not filtered out, so they reduce a product's total.
    """
    # Select the column *before* aggregating: summing every column of the
    # frame is wasted work and fails on columns that cannot be summed
    # (e.g. datetimes).
    quantity_per_product = dataframe.groupby('Description')['Quantity'].sum()
    return quantity_per_product.sort_values(ascending=False).head(n)

In [46]:
def find_most_returned_products(dataframe, n=10):
    """Return the ``n`` products with the largest returned quantity.

    Only rows with a negative ``Quantity`` are considered.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Description`` and ``Quantity``.
    n : int, default 10
        Number of products to return.

    Returns
    -------
    pandas.Series
        Summed (negative) ``Quantity`` indexed by ``Description``, sorted so
        the most negative total comes first, truncated to ``n`` entries.
    """
    returns = dataframe[dataframe['Quantity'] < 0]
    # Aggregate only the Quantity column; summing every column is wasted
    # work and fails on columns that cannot be summed (e.g. datetimes).
    returned_per_product = returns.groupby('Description')['Quantity'].sum()
    return returned_per_product.sort_values(ascending=True).head(n)

In [47]:
def find_best_customers(dataframe, n=10):
    """Return the ``n`` customers with the highest total quantity.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``CustomerID`` and ``Quantity``.
    n : int, default 10
        Number of customers to return.

    Returns
    -------
    pandas.Series
        Summed ``Quantity`` indexed by ``CustomerID``, sorted from largest
        to smallest and truncated to ``n`` entries. Negative quantities are
        not filtered out, so they reduce a customer's total.
    """
    # Aggregate only the Quantity column; summing every column is wasted
    # work and fails on columns that cannot be summed (e.g. datetimes).
    quantity_per_customer = dataframe.groupby('CustomerID')['Quantity'].sum()
    return quantity_per_customer.sort_values(ascending=False).head(n)

In [48]:
def find_most_returned_customers(dataframe, n=10):
    """Return the ``n`` customers with the largest returned quantity.

    Only rows with a negative ``Quantity`` are considered.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``CustomerID`` and ``Quantity``.
    n : int, default 10
        Number of customers to return.

    Returns
    -------
    pandas.Series
        Summed (negative) ``Quantity`` indexed by ``CustomerID``, sorted so
        the most negative total comes first, truncated to ``n`` entries.
    """
    returns = dataframe[dataframe['Quantity'] < 0]
    # Aggregate only the Quantity column; summing every column is wasted
    # work and fails on columns that cannot be summed (e.g. datetimes).
    returned_per_customer = returns.groupby('CustomerID')['Quantity'].sum()
    return returned_per_customer.sort_values(ascending=True).head(n)

In [49]:
def find_best_selling_products_by_country(dataframe, n=10):
    """Return, per country, the ``n`` products with the highest quantity sold.

    Rows whose ``Quantity`` is not strictly positive are ignored, so returns
    do not count against a product.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Country``, ``Description`` and
        ``Quantity``.
    n : int, default 10
        Maximum number of products to list for each country.

    Returns
    -------
    dict
        Maps each country to a list of product descriptions ordered from
        the highest to the lowest summed quantity (at most ``n`` entries).
    """
    # Only keep rows where quantity is greater than zero (to ignore returns).
    sales = dataframe[dataframe['Quantity'] > 0]
    # Aggregate only the Quantity column; summing every column is wasted
    # work and fails on columns that cannot be summed (e.g. datetimes).
    totals = (
        sales.groupby(['Country', 'Description'])['Quantity']
        .sum()
        .reset_index()
        .sort_values(by=['Country', 'Quantity'], ascending=[True, False])
    )
    # Rows are already ordered best-first inside each country, so the first
    # n descriptions of every group are that country's best sellers.
    return {
        country: rows['Description'].head(n).tolist()
        for country, rows in totals.groupby('Country')
    }

In [50]:
def find_similar_products_countries(dataframe, n=10, reference_country='United Kingdom'):
    """Find countries whose top products overlap with a reference country's.

    The top ``n`` products (by summed ``Quantity``) of ``reference_country``
    are compared with the top ``n`` products of every country in the data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Country``, ``Description`` and
        ``Quantity``.
    n : int, default 10
        Number of top products to compare per country.
    reference_country : str, default 'United Kingdom'
        Country whose top products are the basis of the comparison.

    Returns
    -------
    dict
        Maps a country to the set of product descriptions it shares with
        the reference country's top ``n``. Countries with nothing in common
        are omitted. The reference country itself is compared like any
        other country, so it appears in the result whenever it has data.

    Notes
    -----
    Negative quantities are not filtered out here; totals are net of them.
    """
    # Top n products of the reference country. Only the Quantity column is
    # aggregated; summing every column is wasted work and fails on columns
    # that cannot be summed (e.g. datetimes).
    reference_rows = dataframe[dataframe['Country'] == reference_country]
    reference_top = set(
        reference_rows.groupby('Description')['Quantity'].sum().nlargest(n).index
    )

    # Top n products for every country: sort best-first inside each country,
    # then keep the first n rows of each group.
    totals = (
        dataframe.groupby(['Country', 'Description'])['Quantity']
        .sum()
        .reset_index()
        .sort_values(by=['Country', 'Quantity'], ascending=[True, False])
    )
    top_per_country = totals.groupby('Country').head(n)

    similar_countries = {}
    for country, rows in top_per_country.groupby('Country'):
        shared = reference_top.intersection(rows['Description'])
        if shared:
            similar_countries[country] = shared
    return similar_countries