In [1]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import re
import base64
from pathlib import Path
import os

# =============================================================================
# 1. Configuration & Dictionary
# =============================================================================

# Mapping: product type code (the `prdtypecode` column) -> short category
# label used in dropdowns, table cells and plot titles.
# The labels are French and contain accented characters; they are stored here
# as correctly decoded text so they render properly wherever they are shown.
CATEGORY_SHORT_NAMES = {
    10: "Livres (Général)",         # General Books / Non-fiction / Misc
    2705: "Romans & littérature",   # Fiction / Novels
    2280: "Journaux & magazines",
    2403: "Séries & encyclopédies", # Comics / Manga / Encyclopedias
    40: "Rétro Gaming",
    50: "Accessoires JV",
    60: "Consoles",
    2462: "Jeux Vidéo",
    2905: "Jeux PC",
    1140: "Figurine",
    1160: "Cartes à collectionner",
    1180: "Jeux de rôle",
    1280: "Jouets & Figurines",
    1281: "Jeux de société",
    1300: "Modélisme & Drones",
    1301: "Bébé & Jeux",
    1302: "Sport & Loisirs",
    1320: "Bébé & Puériculture",
    1560: "Équipement maison",
    1920: "Textiles",
    2060: "Déco & Éclairage",
    2582: "Jardinage & déco",
    2583: "Piscine",
    2585: "Jardin & Bricolage",
    1940: "Alimentation",
    2220: "Animaux",
    2522: "Fournitures bureau",
}

# Filesystem locations of the inputs: two CSV files directly under DATA_DIR
# and a folder of training images.
DATA_DIR = Path("/workspace/data")
X_PATH = DATA_DIR.joinpath("X_train_update.csv")
Y_PATH = DATA_DIR.joinpath("Y_train_CVw08PX.csv")
IMG_DIR = Path("/workspace/data/images/image_train")

# =============================================================================
# 2. Data Loading & Preparation
# =============================================================================

# `df` starts out empty so the later sections can test `df.empty` even when
# loading is skipped or fails.
df = pd.DataFrame()

if not (X_PATH.exists() and Y_PATH.exists()):
    print(f"Warning: Files not found at {X_PATH} or {Y_PATH}")
else:
    print("Loading CSV data...")
    try:
        # Both files use their first column as the index; rows are joined on it.
        X = pd.read_csv(X_PATH, index_col=0)
        Y = pd.read_csv(Y_PATH, index_col=0)
        df = pd.merge(X, Y, left_index=True, right_index=True)

        # Human-readable label per row; codes missing from the mapping fall
        # back to the code itself rendered as text.
        df['category_name'] = df['prdtypecode'].apply(
            lambda code: CATEGORY_SHORT_NAMES.get(code, str(code))
        )

        print(f"Data loaded successfully! df shape: {df.shape}")
    except Exception as exc:
        print(f"Error loading CSV files: {exc}")

# =============================================================================
# 3. Image Path Mapping
# =============================================================================

# File names look like "image_<digits>_product_<digits>.jpg"; the first digit
# group is the key under which each file is stored in `lookup`.
pat = re.compile(r"image_(\d+)_product_(\d+)\.jpg$")
lookup = {}

if IMG_DIR.exists():
    files = list(IMG_DIR.glob("*.jpg"))
    for img_file in files:
        hit = pat.search(img_file.name)
        if hit is None:
            # Name does not follow the expected pattern: leave it out.
            continue
        lookup[hit.group(1)] = img_file

if not df.empty:
    # `imageid` is normalised through int() then str() so it compares equal to
    # the digit string captured from the file name; ids without a file map to None.
    df["image_path"] = df["imageid"].apply(
        lambda image_id: lookup.get(str(int(image_id)))
    )

# =============================================================================
# 4. Helper Function: Image Display
# =============================================================================

def img_cell(p, size=100):
    """Return an HTML snippet that shows the image at ``p`` inline.

    The file content is embedded as a base64 ``data:image/jpeg`` URI, so the
    resulting HTML does not depend on the image being reachable by URL.

    Args:
        p: Path (``str`` or ``os.PathLike``) of the image file. ``None`` and
            values that cannot be turned into a path (for example a float
            NaN coming from a DataFrame cell) are treated as "no image".
        size: Display width in pixels; also used as ``max-width``.

    Returns:
        An ``<img>`` tag on success, a grey "No Image" ``<div>`` when there is
        no usable or existing path, or the text ``"Error"`` when the path
        exists but cannot be read.
    """
    no_image = '<div style="color:gray; font-size:0.8em">No Image</div>'
    if p is None:
        return no_image

    try:
        path = Path(p)
    except TypeError:
        # Not path-like at all (e.g. NaN): there is nothing to display.
        return no_image

    if not path.exists():
        return no_image

    try:
        raw = path.read_bytes()
    except OSError:
        # Exists but unreadable (directory, permissions, I/O failure, ...).
        return "Error"

    b64 = base64.b64encode(raw).decode("ascii")
    return f'<img src="data:image/jpeg;base64,{b64}" style="width:{size}px; height:auto; max-width:{size}px;" />'

# =============================================================================
# 5. Interactive Widget UI
# =============================================================================

if not df.empty:
    # Distinct product type codes in ascending order feed the dropdown.
    unique_codes = sorted(df['prdtypecode'].unique())

    # Each option is a (label, value) pair: "<code> - <short name>" -> code.
    dropdown_options = [
        (f"{code} - {CATEGORY_SHORT_NAMES.get(code, 'Unknown')}", code)
        for code in unique_codes
    ]

    dropdown_category = widgets.Dropdown(
        description='Category:',
        options=dropdown_options,
        value=unique_codes[0] if unique_codes else None,
        disabled=False,
        style=dict(description_width='initial'),
    )

    btn_refresh = widgets.Button(
        description='Show Random 20',
        icon='random',
        button_style='info',
    )

    output_area = widgets.Output()

    def show_samples(b=None):
        """Render up to 20 random rows of the selected category as an HTML table.

        Used both as the dropdown observer and as the button click handler,
        and also called once directly; the argument ``b`` is never read.
        Everything is written into ``output_area``, replacing what was there.
        """
        with output_area:
            clear_output(wait=True)

            code = dropdown_category.value
            rows = df[df['prdtypecode'] == code]
            total = len(rows)
            if total == 0:
                print(f"No data found for category {code}.")
                return

            label = CATEGORY_SHORT_NAMES.get(code, "Unknown")
            shown = min(20, total)
            picked = rows.sample(shown).copy()

            # Keep only the wanted columns that actually exist in the frame.
            wanted = ("prdtypecode", "category_name", "designation", "description", "image_path")
            present = [col for col in wanted if col in picked.columns]

            table = picked[present].copy()
            table["image"] = table["image_path"].apply(lambda path: img_cell(path, size=120))

            # Thumbnail first; the raw path column is not shown.
            table = table[["image", *(col for col in present if col != "image_path")]]

            # escape=False so the <img> markup in the "image" column is rendered.
            table_html = table.to_html(escape=False, index=False)

            # Summary banner above the table.
            banner_html = f"""
            <div style="background-color:#f7f7f7; padding:10px; border-radius:5px; margin-bottom:10px; text-align:left;">
                <b>Category:</b> {code} ({label}) &nbsp;|&nbsp; 
                <b>Total Items:</b> {total} &nbsp;|&nbsp; 
                <b>Showing:</b> {shown} random samples
            </div>
            """

            # Left-align every header and body cell of the rendered table.
            css_html = """
            <style>
                .dataframe td { text-align: left !important; vertical-align: top !important; }
                .dataframe th { text-align: left !important; }
            </style>
            """

            display(HTML(css_html + banner_html + table_html))

    # Refresh on a new dropdown selection and on every button click.
    dropdown_category.observe(show_samples, names='value')
    btn_refresh.on_click(show_samples)

    controls = widgets.HBox([dropdown_category, btn_refresh])
    ui = widgets.VBox([controls, output_area])

    # Fill the output once so the UI is not blank when first displayed.
    show_samples()
    display(ui)
else:
    print("DataFrame is empty. Cannot display interactive UI.")

Loading CSV data...
Data loaded successfully! df shape: (84916, 6)


VBox(children=(HBox(children=(Dropdown(description='Category:', options=(('10 - Livres (Général)', np.int64(10…

In [3]:
def get_top_ngrams(corpus, ngram_range=(2, 2), top_n=15, min_df=5):
    """
    Return the most frequent n-grams of a corpus as a DataFrame.

    Args:
        corpus: Sized collection of text documents (must support ``len()``
            and be accepted by ``CountVectorizer.fit_transform``).
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        top_n: Maximum number of rows to return.
        min_df: Forwarded to ``CountVectorizer``. It is a *document*
            frequency threshold: n-grams present in fewer documents than
            this are dropped from the vocabulary, which also keeps the
            vocabulary small. Defaults to 5.

    Returns:
        DataFrame with columns ``'ngram'`` and ``'count'``, sorted by
        descending count, with at most ``top_n`` rows. The frame has the same
        two columns and no rows when the corpus is empty or the vectorizer
        cannot build a vocabulary.
    """
    columns = ['ngram', 'count']
    if len(corpus) == 0:
        return pd.DataFrame(columns=columns)

    vec = CountVectorizer(ngram_range=ngram_range, min_df=min_df)

    try:
        bag_of_words = vec.fit_transform(corpus)
    except ValueError:
        # Raised when no vocabulary can be built (too little data, or every
        # n-gram filtered out by min_df).
        return pd.DataFrame(columns=columns)

    # Column-wise sum = total occurrences of each n-gram over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return pd.DataFrame(words_freq[:top_n], columns=columns)

# =============================================================================
# 3. Interactive Interface Logic
# =============================================================================

# Make sure the DataFrame `df` exists and has the combined 'text' column that
# the n-gram analysis reads.
if 'df' not in locals():
    # Fail fast: every statement after this point dereferences `df`, so stop
    # here with an actionable message rather than a bare NameError later on.
    raise NameError(
        "'df' is not defined. Cannot proceed: please ensure data cleaning "
        "and merging are done in previous cells."
    )

if 'text' not in df.columns:
    print("⚠️ Warning: 'df' or 'text' column not found. Please ensure data cleaning and merging are done in previous cells.")
    # Automatic fix: build 'text' from the two raw text columns, treating
    # missing values as empty strings.
    print("Attempting to create 'text' column from designation + description...")
    df['text'] = df['designation'].fillna('') + ' ' + df['description'].fillna('')

# Prepare dropdown menu data: one (label, value) option per distinct product
# type code, in ascending order.
unique_codes = sorted(df['prdtypecode'].unique())
dropdown_options = [
    (f"{code} - {CATEGORY_SHORT_NAMES.get(code, 'Unknown')}", code)
    for code in unique_codes
]

# Create widgets
dropdown_ngram = widgets.Dropdown(
    options=dropdown_options,
    # With no codes at all there is nothing to preselect; indexing [0] on an
    # empty list would raise IndexError before any widget is shown.
    value=unique_codes[0] if unique_codes else None,
    description='Select Category:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

btn_analyze = widgets.Button(
    description='📊 Analyze N-grams',
    button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
    layout=widgets.Layout(width='150px'),
    icon='search'
)

# All analysis output (status text and plots) is captured in this widget.
output_ngram = widgets.Output()

def on_click_analyze(b):
    """Compute and plot the top bigrams and trigrams of the selected category.

    Reads the category from ``dropdown_ngram``, collects the ``text`` column
    of the matching rows of ``df`` and draws two horizontal bar charts side
    by side inside ``output_ngram`` (whose previous content is cleared).

    Args:
        b: Argument supplied by the button click callback; it is never read,
            so ``None`` works for a manual call.
    """
    with output_ngram:
        clear_output(wait=True)
        code = dropdown_ngram.value
        cat_name = CATEGORY_SHORT_NAMES.get(code, 'Unknown')

        print(f"🔄 Analyzing Category: {code} ({cat_name})...")

        # 1. Extract text for this category
        subset = df[df['prdtypecode'] == code]
        corpus = subset['text'].astype(str).tolist()

        # 2. One (title word, frequency table, colour palette) triple per chart:
        #    bigrams (2 words) on the left, trigrams (3 words) on the right.
        panels = [
            ('Bigrams', get_top_ngrams(corpus, ngram_range=(2, 2), top_n=15), 'viridis'),
            ('Trigrams', get_top_ngrams(corpus, ngram_range=(3, 3), top_n=15), 'magma'),
        ]

        # 3. Nothing to plot at all
        if all(table.empty for _, table, _ in panels):
            print("❌ Insufficient data for this category to generate N-grams.")
            return

        # 4. Plotting
        _, axes = plt.subplots(1, 2, figsize=(16, 6))
        for ax, (label, table, palette) in zip(axes, panels):
            if table.empty:
                # Hide the unused panel instead of showing a blank frame.
                ax.set_visible(False)
                continue
            sns.barplot(x='count', y='ngram', data=table, ax=ax, palette=palette)
            ax.set_title(f'Top 15 {label} - {cat_name}')
            ax.set_xlabel('Frequency')
            ax.set_ylabel('')

        plt.tight_layout()
        plt.show()

# Bind events: run the analysis on every button click.
btn_analyze.on_click(on_click_analyze)

# Display layout: controls in one row, analysis output underneath.
ui = widgets.VBox([
    widgets.HBox([dropdown_ngram, btn_analyze]),
    output_ngram
])

display(HTML("<h3>🔍 View N-grams by Category</h3>"))
display(ui)

# Trigger once automatically to show default results
on_click_analyze(None)

VBox(children=(HBox(children=(Dropdown(description='Select Category:', layout=Layout(width='400px'), options=(…