In [31]:
import pandas as pd
import numpy as np
from collections import Counter


## Data Set Information
This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.

---

## Attribute Information
- **InvoiceNo**: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
- **StockCode**: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
- **Description**: Product (item) name. Nominal.
- **Quantity**: The quantity of each product (item) per transaction. Numeric.
- **InvoiceDate**: Invoice date and time. Numeric, representing the day and time when each transaction was generated.
- **UnitPrice**: Unit price. Numeric, representing the product price per unit in sterling.
- **CustomerID**: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
- **Country**: Country name. Nominal, representing the name of the country where each customer resides.


In [7]:
raw_data = pd.read_csv('../data/retail.csv', encoding='ISO-8859-1')
display(raw_data.head(10))

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/10 8:26,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/10 8:26,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/10 8:28,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/10 8:28,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/10 8:34,1.69,13047.0,United Kingdom


In [145]:
items = raw_data.groupby(['StockCode', 'Description']).count()[[]].reset_index()
items.head(20)

Unnamed: 0,StockCode,Description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10080,check
3,10120,DOGGY RUBBER
4,10123C,HEARTS WRAPPING TAPE
5,10124A,SPOTS ON RED BOOKCOVER TAPE
6,10124G,ARMY CAMO BOOKCOVER TAPE
7,10125,MINI FUNKY DESIGN TAPES
8,10133,COLOURING PENCILS BROWN TUBE
9,10133,damaged


In [146]:
items['is_operational_item'] = (items['StockCode'].str.len() < 5) | (items['StockCode'].str.contains('gift', case=False))
items['stock_code_main'] = items['StockCode'].str.extract(r'^(\d+)', expand=False)
items['stock_code_variant'] = items['StockCode'].str.extract(r'(\D+)$', expand=False).fillna('')
items['product_description'] = np.where(items['StockCode'] == items['stock_code_main'], items['Description'], np.nan)
items['stock_code_variant'] = np.where(items['StockCode'] == items['stock_code_main'],  np.nan, items['stock_code_variant'])

items.head()

Unnamed: 0,StockCode,Description,is_operational_item,stock_code_main,stock_code_variant,product_description
0,10002,INFLATABLE POLITICAL GLOBE,False,10002,,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE,False,10080,,GROOVY CACTUS INFLATABLE
2,10080,check,False,10080,,check
3,10120,DOGGY RUBBER,False,10120,,DOGGY RUBBER
4,10123C,HEARTS WRAPPING TAPE,False,10123,C,


In [148]:
# 1. Filtrar los elementos donde 'operational_item' es False
selected_items = items[items['is_operational_item'] == False]

selected_items['description_len'] = selected_items['Description'].str.len()
selected_items = selected_items.sort_values(by=['StockCode', 'description_len'], ascending=[True, False])
selected_items['description_order'] = selected_items.groupby('StockCode').cumcount() + 1

selected_items.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_items['description_len'] = selected_items['Description'].str.len()


Unnamed: 0,StockCode,Description,is_operational_item,stock_code_main,stock_code_variant,product_description,description_len,description_order
0,10002,INFLATABLE POLITICAL GLOBE,False,10002,,INFLATABLE POLITICAL GLOBE,27,1
1,10080,GROOVY CACTUS INFLATABLE,False,10080,,GROOVY CACTUS INFLATABLE,24,1
2,10080,check,False,10080,,check,5,2
3,10120,DOGGY RUBBER,False,10120,,DOGGY RUBBER,12,1
4,10123C,HEARTS WRAPPING TAPE,False,10123,C,,21,1


In [150]:
weird_items = selected_items[(selected_items['description_order'] > 1) & (selected_items['description_len'] < 13)]
description_list = weird_items['Description'].drop_duplicates().tolist()

is_error_item = [
    'check', 'damaged', 'wet/rusty', 'found', '?', 'damages', '??', 
    'wet/mouldy', 'damages?', 'wet', 'wet rusty', '???', 'broken', 
    'mixed up', '?missing', 'smashed', 'missing', 'faulty', "can't find", 
    '?lost', 'wrong code?', 'wet boxes', '?? missing', 'missing?', 
    'lost', 'wrong code', 'water damage', 'crushed', 'breakages', 
    'mouldy', 'sale error', 'thrown away', 'counted', 'found box', 
    '???missing', 'wet pallet', '????missing', 'crushed ctn', 'cracked', 
    'dagamed', 'returned', 'wet damaged', 'wet?', 'damages wax', 'label mix up', 
    'check?', 'lost??', 'stock check'
]

is_special_item = [
    'dotcom', 'mia', 'showroom', 'sold as 1', 'john lewis', 
    'amazon', 'amazon sales', 'cordial jug', 'fba', 'ebay', 
    'dotcomstock', 'michel oops', 'dotcom sales', 'dotcom set'
]

is_modification_item = [
    'adjustment', 'show samples', 'samples', 'given away', 
    'test', 'sold in set?', 'taig adjust', 'mailout', 'mailout ', 
    'thrown away.', 'adjust', 'label mix up'
]

In [None]:
items['Description'] = items['Description'].str.lower()





error: nothing to repeat at position 173

In [127]:
filtered_items = items[(items['error_item'] == False) & (items['special_item'] == False)].sort_values(by='Description', key=lambda x: x.str.len(), ascending=True)
pd.options.display.max_rows = None
display(filtered_items.head(79))


Unnamed: 0,StockCode,Description,operational_item,stock_code_main,stock_code_variant,product_description,error_item,spacial_item,special_item
2682,23343,20713,False,23343.0,,20713,False,False,False
3861,84763,display,False,84763.0,,display,False,False,False
3676,84422,dagamed,False,84422.0,,dagamed,False,False,False
4765,C2,carriage,True,,,,False,False,False
3353,62018,sombrero,False,62018.0,,sombrero,False,False,False
1057,21888,bingo set,False,21888.0,,bingo set,False,False,False
380,21026,space owl,False,21026.0,,space owl,False,False,False
876,21682,?display?,False,21682.0,,?display?,False,False,False
2393,23118,breakages,False,23118.0,,breakages,False,False,False
1085,21915,john lewis,False,21915.0,,john lewis,False,False,False


In [131]:
items[items['operational_item']]

Unnamed: 0,StockCode,Description,operational_item,stock_code_main,stock_code_variant,product_description,error_item,spacial_item,special_item
4763,B,adjust bad debt,True,,B,,False,False,False
4765,C2,carriage,True,,,,False,False,False
4766,CRUK,cruk commission,True,,CRUK,,False,False,False
4767,D,discount,True,,D,,False,False,True
4780,DOT,dotcom postage,True,,DOT,,False,False,False
4781,M,manual,True,,M,,True,False,False
4782,PADS,pads to match all cushions,True,,PADS,,False,False,False
4783,POST,postage,True,,POST,,False,False,True
4784,S,samples,True,,S,,False,False,True
4785,gift_0001_10,dotcomgiftshop gift voucher £10.00,True,,,,False,False,False


In [32]:
grouped = df.groupby('StockCodeMain')['Description'].apply(lambda x: ' '.join(x))

# Crear una función para obtener la palabra más frecuente en cada grupo
def get_most_common_words(text):
    words = text.split()  # Dividir el texto en palabras
    most_common = Counter(words).most_common(1)  # Obtener la palabra más frecuente
    return most_common[0][0] if most_common else None

product_descriptions = grouped.apply(get_most_common_words).reset_index()
product_descriptions.columns = ['StockCodeMain', 'product_description']

# Unir el resultado original con las nuevas descripciones
df = df.merge(product_descriptions, on='StockCodeMain', how='left')

In [37]:
print(grouped)

StockCodeMain
               Adjust bad debt Discount Manual SAMPLES Manual
10123                                   HEARTS WRAPPING TAPE 
10124       SPOTS ON RED BOOKCOVER TAPE ARMY CAMO BOOKCOVE...
15044       PINK PAPER PARASOL  BLUE PAPER PARASOL  PURPLE...
15056       EDWARDIAN PARASOL NATURAL EDWARDIAN PARASOL PI...
                                  ...                        
DCGSSBO                                        BOYS PARTY BAG
DCGSSGIR                                      GIRLS PARTY BAG
DO                                             DOTCOM POSTAGE
PAD                                PADS TO MATCH ALL CUSHIONS
POS                                                   POSTAGE
Name: Description, Length: 436, dtype: object
