In [226]:
import pandas as pd
import numpy as np
from collections import Counter


## Data Set Information
This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.

---

## Attribute Information
- **InvoiceNo**: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
- **StockCode**: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
- **Description**: Product (item) name. Nominal.
- **Quantity**: The quantity of each product (item) per transaction. Numeric.
- **InvoiceDate**: Invoice date and time. Numeric, representing the day and time when each transaction was generated.
- **UnitPrice**: Unit price. Numeric, representing the product price per unit in sterling.
- **CustomerID**: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
- **Country**: Country name. Nominal, representing the name of the country where each customer resides.


In [227]:
raw_data = pd.read_csv('../data/retail.csv', encoding='ISO-8859-1')
display(raw_data.head(10))

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/10 8:26,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/10 8:26,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/10 8:28,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/10 8:28,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/10 8:34,1.69,13047.0,United Kingdom


In [246]:
transactions = raw_data.copy()

transactions = transactions.rename(columns={
    'InvoiceNo': 'invoice_id',
    'StockCode': 'item_id',
    'Description': 'item_description',
    'Quantity': 'quantity_amount',
    'InvoiceDate': 'event_timestamp_invoiced_at',
    'UnitPrice': 'unit_price_eur',
    'CustomerID': 'customer_id',
    'Country': 'country_name'
})

transactions['invoice_id'] = transactions['invoice_id'].astype(str)
transactions['item_id'] = transactions['item_id'].astype(str)
transactions['item_description'] = transactions['item_description'].astype(str)
transactions['quantity_amount'] = transactions['quantity_amount'].astype(int)
transactions['event_timestamp_invoiced_at'] = pd.to_datetime(transactions['event_timestamp_invoiced_at'])
transactions['unit_price_eur'] = transactions['unit_price_eur'].astype(float)
transactions['customer_id'] = transactions['customer_id'].astype('Int64')
transactions['country_name'] = transactions['country_name'].astype(str)
transactions.index.name = 'transaction_id'
transactions.reset_index(inplace=True)
transactions['transaction_id'] = transactions['transaction_id'].astype('Int64')


  transactions['event_timestamp_invoiced_at'] = pd.to_datetime(transactions['event_timestamp_invoiced_at'])


In [266]:
transactions.head(10)

Unnamed: 0,transaction_id,invoice_id,item_id,item_description,quantity_amount,event_timestamp_invoiced_at,unit_price_eur,customer_id,country_name
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
5,5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850,United Kingdom
6,6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850,United Kingdom
7,7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850,United Kingdom
8,8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850,United Kingdom
9,9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047,United Kingdom


In [267]:
transactions['is_return'] = transactions['quantity_amount'] < 0

In [268]:
items = transactions.groupby(['item_id', 'item_description']).count()[[]].reset_index()
items.sample(20)

Unnamed: 0,item_id,item_description
766,21288,
552,21084,SET/6 COLLAGE PAPER CUPS
3890,47578A,ENGLISH ROSE SMALL SCENTED FLOWER
1074,21634,ASSORTED MINI MADRAS NOTEBOOK
5098,85123A,?
1368,21891,
1446,21982,PACK OF 12 SUKI TISSUES
2616,22944,CHRISTMAS METAL POSTCARD WITH BELLS
573,21108,FAIRY CAKE FLANNEL ASSORTED COLOUR
3353,23499,SET 12 VINTAGE DOILY CHALK


In [269]:
items['is_operational_item'] = (items['item_id'].str.len() < 5) | (items['item_id'].str.contains('gift', case=False))
items['item_family_id'] = items['item_id'].str.extract(r'^(\d+)', expand=False)
items['item_variant'] = items['item_id'].str.extract(r'(\D+)$', expand=False).fillna('')
items['item_variant'] = np.where(items['item_id'] == items['item_family_id'],  np.nan, items['item_variant'])

items.head()

Unnamed: 0,item_id,item_description,is_operational_item,item_family_id,item_variant
0,10002,INFLATABLE POLITICAL GLOBE,False,10002,
1,10002,,False,10002,
2,10080,GROOVY CACTUS INFLATABLE,False,10080,
3,10080,check,False,10080,
4,10080,,False,10080,


In [270]:
selected_items = items[items['is_operational_item'] == False]

selected_items['is_all_uppercase'] = selected_items['item_description'].str.isupper()
selected_items['description_len'] = selected_items['item_description'].str.len()
selected_items = selected_items.sort_values(by=['item_id', 'description_len'], ascending=[True, False])
selected_items['description_order'] = selected_items.groupby('item_id').cumcount() + 1

selected_items.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_items['is_all_uppercase'] = selected_items['item_description'].str.isupper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_items['description_len'] = selected_items['item_description'].str.len()


Unnamed: 0,item_id,item_description,is_operational_item,item_family_id,item_variant,is_all_uppercase,description_len,description_order
0,10002,INFLATABLE POLITICAL GLOBE,False,10002,,True,27,1
1,10002,,False,10002,,False,3,2
2,10080,GROOVY CACTUS INFLATABLE,False,10080,,True,24,1
3,10080,check,False,10080,,False,5,2
4,10080,,False,10080,,False,3,3


In [278]:
weird_items = selected_items[((selected_items['description_order'] > 1) & (selected_items['description_len'] < 13)) | ~selected_items['is_all_uppercase']]
description_list = weird_items['item_description'].drop_duplicates().tolist()
print(description_list)

is_unknown_item = []
is_error_item = []
is_special_item = []
is_modification_item = []

for description in description_list:
    description_lower = description.lower()
    if any(word in description_lower for word in [
        'check', 'damage', 'wet', 'lost', 'found', 'error', 'wrong', 'faulty', 'mouldy', 'smashed', 'broken', 'crush', 'crack', 'unsaleable', 'missing', 'throw away'
    ]):
        is_error_item.append(description)
    elif any(word in description_lower for word in [
        'dotcom', 'amazon', 'fba', 'ebay', 'john lewis', 'showroom', 'voucher', 'cordial jug', 'website'
    ]):
        is_special_item.append(description)
    elif any(word in description_lower for word in [
        'adjust', 'sample', 'sold as', 'mailout', 'allocate', 'temp', 'credit', 'sale', 're-adjustment', 'label mix up'
    ]):
        is_modification_item.append(description)


is_unknown_item = ['?','??','???','Incorrect stock entry.',"can't find",'nan',None]

is_error_item += [
    'on cargo order', 'test', 'barcode problem', 'mixed up', 'michel oops', 'printing smudges/thrown away', 
    'had been put aside', 'rusty thrown away', 'incorrectly made-thrown away.', 'Breakages', 'counted', 
    'Had been put aside.', 'returned', 'thrown away', 'mystery! Only ever imported 1800', 'Dagamed', 
    'code mix up? 84930', 'Printing smudges/thrown away', 'came coded as 20713', 'incorrect stock entry.',
    "thrown away-can't sell",'Thrown away-rusty','Thrown away.','Given away','historic computer difference?....se',
    'alan hodge cant mamage this section',"thrown away-can't sell.", 'label mix up','sold in set?','mix up with c'
]

is_special_item += [
    'MIA', '?display?', 'Amazon Adjustment', 'Lighthouse Trading zero invc incorr', 
    'Dotcomgiftshop Gift Voucher £100.00', 'sold as set?', 
    'High Resolution Image', 'John Lewis','Bank Charges','Next Day Carriage'
]

is_modification_item += [
    'Adjustment', 'OOPS ! adjustment', 'reverse 21/5/10 adjustment', 'reverse previous adjustment', 
    'marked as 23343', 'incorrectly put back into stock', 'Not rcvd in 10/11/2010 delivery', 'Display', 
    'Had been put aside.',  'sold as set by dotcom', 'add stock to allocate online orders', 
    'allocate stock for dotcom orders ta', 'for online retail orders', 'Marked as 23343'
]

unclassified_items = set(description_list) - set(is_error_item) - set(is_special_item) - set(is_modification_item)

# Mostrar resultados
print("is_error_item:", is_error_item)
print("is_special_item:", is_special_item)
print("is_modification_item:", is_modification_item)
print("Unclassified items:", unclassified_items)


['nan', 'check', 'damaged', 'wet/rusty', 'FLOWERS HANDBAG blue and orange', 'alan hodge cant mamage this section', 'THE KING GIFT BAG 25x24x12cm', 'found', 'Adjustment', 'ESSENTIAL BALM 3.5g TIN IN ENVELOPE', 'adjustment', 'dotcom', 'MIA', '?', 'lost in space', 'wrongly marked. 23343 in box', 'wrongly marked 23343', 'wrongly coded 23343', 'wrongly coded-23343', 'Marked as 23343', 'Found', 'damages', '??', 'damages/display', '*USB Office Mirror Ball', 'POLYESTER FILLER PAD 60x40cm', 'Amazon Adjustment', 'taig adjust no stock', 'WET/MOULDY', 'had been put aside', 'damages?', 'Damaged', 'wet', '*Boombox Ipod Classic', 'sold as set on dotcom', 'wet rusty', 'on cargo order', 'wrongly marked', '???', 'broken', 'Show Samples', 'Sold as 1 on dotcom', "Dr. Jam's Arouzer Stress Ball", "Dad's Cab Electronic Meter", 'CHECK', 'mystery! Only ever imported 1800', 'rcvd be air temp fix for dotcom sit', 're dotcom quick fix.', 'samples', '?display?', 'mixed up', 'wrong barcode', 'sold as set on dotcom 

In [282]:
items['is_unknown_item'] = items['item_description'].str.lower().isin(is_unknown_item)
items['is_error_item'] = items['item_description'].str.lower().isin(is_error_item)
items['is_special_item'] = items['item_description'].str.lower().isin(is_special_item)
items['is_modification_item'] = items['item_description'].str.lower().isin(is_modification_item)

In [283]:
items.head(10)

Unnamed: 0,item_id,item_description,is_operational_item,item_family_id,item_variant,is_error_item,is_special_item,is_modification_item,is_unknown_item
0,10002,INFLATABLE POLITICAL GLOBE,False,10002,,False,False,False,False
1,10002,,False,10002,,False,False,False,True
2,10080,GROOVY CACTUS INFLATABLE,False,10080,,False,False,False,False
3,10080,check,False,10080,,True,False,False,False
4,10080,,False,10080,,False,False,False,True
5,10120,DOGGY RUBBER,False,10120,,False,False,False,False
6,10123C,HEARTS WRAPPING TAPE,False,10123,C,False,False,False,False
7,10123C,,False,10123,C,False,False,False,True
8,10123G,,False,10123,G,False,False,False,True
9,10124A,SPOTS ON RED BOOKCOVER TAPE,False,10124,A,False,False,False,False


In [296]:
filtered_items = items[~items['is_operational_item'] & ~items['is_error_item'] & ~items['is_special_item'] & ~items['is_modification_item']& ~items['is_unknown_item']]
grouped = filtered_items.groupby('item_family_id')['item_description'].apply(lambda x: ' '.join(x))

def get_most_common_words(text):
    words = text.split()  
    word_counts = Counter(words)
    max_count = max(word_counts.values(), default=0)
    
    most_common_words = [word for word, count in word_counts.items() if count == max_count]
    return ' '.join(most_common_words) if most_common_words else None

main_product_descriptions = grouped.apply(get_most_common_words).reset_index()
main_product_descriptions.columns = ['item_family', 'item_family_description']


main_product_descriptions['item_family_description'][:1000].tolist()

['INFLATABLE POLITICAL GLOBE',
 'GROOVY CACTUS INFLATABLE',
 'DOGGY RUBBER',
 'HEARTS WRAPPING TAPE',
 'BOOKCOVER TAPE',
 'MINI FUNKY DESIGN TAPES',
 'COLOURING PENCILS BROWN TUBE',
 'COLOURING PENCILS BROWN TUBE',
 'ASSTD DESIGN RACING CAR PEN',
 'FAN BLACK FRAME',
 'PAPER POCKET TRAVELING FAN',
 'ASSORTED COLOURS SILK FAN',
 'SANDALWOOD FAN',
 'PAPER PARASOL',
 'EDWARDIAN PARASOL',
 'GARDEN PARASOL',
 'FAIRY CAKE DESIGN UMBRELLA',
 'SMALL FOLDING SCISSOR(POINTED EDGE)',
 'FOLDING CAMPING SCISSOR W/KNIF & S',
 'ANIMAL STICKERS',
 'FOOD/DRINK SPONGE STICKERS',
 'SMALL CHINESE STYLE SCISSOR',
 'MEDIUM CHINESE STYLE SCISSOR',
 'LARGE CHINESE STYLE SCISSOR',
 'CLEAR STATIONERY BOX SET',
 'MINI HIGHLIGHTER PENS',
 'POP ART PUSH DOWN RUBBER',
 'POPART WOODEN PENCILS ASST',
 'TEATIME PEN CASE & PENS',
 'TEATIME ROUND PENCIL SHARPENER',
 'TEATIME GEL PENS ASST',
 'TEATIME PUSH DOWN RUBBER',
 'POPART RECT PENCIL SHARPENER ASST',
 'FLOWERS HANDBAG blue and orange',
 'WRAP CAROUSEL',
 'WRAP',
 '

In [None]:
categories = [
    "HOME DECOR",
    "KITCHENWARE",
    "TABLEWARE",
    "GIFT ITEMS",
    "FURNITURE",
    "TOYS",
    "CANDLES",
    "JEWELRY",
    "TEXTILES",
    "GARDENING",
    "BATHROOM",
    "STATIONERY",
    "BAGS",
    "LIGHTING",
    "CRAFTS",
    "HOLIDAY",
    "OUTDOOR",
    "TOOL & DIY",
    "SPORTS",
    "STORAGE",
    "TRAVEL",
    "BEAUTY",
    "BOOKS",
    "GAMES",
    "ORGANIZERS",
    "TABLE LINEN",
    "CHILDREN",
    "CERAMICS",
    "CLOTHING",
    "PLANTS & FLOWERS",
    "OTHERS"
]


#### dash ideas

- beneficio mes
- beneficio año
- mom
- yoy

- numero de compras
- numero de clientes
- beneficio total

- por pais

- por categoria de producto

- numero de invoices realizados
- numero de objetos comprados totals
- devoluciones
- flags especiales

- tipos de clientes:
    - gasto
    - numero de invoices realizados
    - numero de objetos comprados totals
    - numero de objetos por compra
    - numero de compras por mes
    - numero de compras por año
    - devoluciones
    - flags especiales

    si lo consigo:
    - categoria que mas compra