## 1. Calculate TF-IDF

#### Step 1. Import necessary libraries and load data

In [18]:
import pandas as pd
import numpy as np
import re
from math import log

# Load data
df = pd.read_csv('products_categorization_to_assess.csv')
df['true_label'] = df['true_label'].str.lower()
df.head()


Unnamed: 0,product_name,true_label,predicted_label
0,frozen for freshness button sprouts,grocery,clothing
1,chomp chocolate bar,grocery,clothing
2,20 breaded chicken nuggets,grocery,home and kitchen
3,fingers chocolate biscuit bar 9 pack,grocery,computer and electronics
4,garlic & herb oven baked croutons,grocery,grocery


#### Step 2: Data Cleaning and Tokenization

In [19]:
def preprocess_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize and remove short words
    tokens = [word for word in text.split() if len(word) > 2]
    return tokens

df['tokens'] = df['product_name'].apply(preprocess_text)
df.head()


Unnamed: 0,product_name,true_label,predicted_label,tokens
0,frozen for freshness button sprouts,grocery,clothing,"[frozen, for, freshness, button, sprouts]"
1,chomp chocolate bar,grocery,clothing,"[chomp, chocolate, bar]"
2,20 breaded chicken nuggets,grocery,home and kitchen,"[breaded, chicken, nuggets]"
3,fingers chocolate biscuit bar 9 pack,grocery,computer and electronics,"[fingers, chocolate, biscuit, bar, pack]"
4,garlic & herb oven baked croutons,grocery,grocery,"[garlic, herb, oven, baked, croutons]"


#### Step 3: Compute Term Frequency (TF)

In [20]:
from collections import defaultdict

def compute_tf(tokens):
    tf_dict = defaultdict(int)
    for token in tokens:
        tf_dict[token] += 1
    for token in tf_dict:
        tf_dict[token] = tf_dict[token] / len(tokens)
    return tf_dict

df['tf'] = df['tokens'].apply(compute_tf)
df.head()

Unnamed: 0,product_name,true_label,predicted_label,tokens,tf
0,frozen for freshness button sprouts,grocery,clothing,"[frozen, for, freshness, button, sprouts]","{'frozen': 0.2, 'for': 0.2, 'freshness': 0.2, ..."
1,chomp chocolate bar,grocery,clothing,"[chomp, chocolate, bar]","{'chomp': 0.3333333333333333, 'chocolate': 0.3..."
2,20 breaded chicken nuggets,grocery,home and kitchen,"[breaded, chicken, nuggets]","{'breaded': 0.3333333333333333, 'chicken': 0.3..."
3,fingers chocolate biscuit bar 9 pack,grocery,computer and electronics,"[fingers, chocolate, biscuit, bar, pack]","{'fingers': 0.2, 'chocolate': 0.2, 'biscuit': ..."
4,garlic & herb oven baked croutons,grocery,grocery,"[garlic, herb, oven, baked, croutons]","{'garlic': 0.2, 'herb': 0.2, 'oven': 0.2, 'bak..."


#### Step 4: Compute Inverse Document Frequency (IDF)

In [21]:
def compute_idf(df):
    N = len(df)
    idf_dict = defaultdict(int)
    for tokens in df['tokens']:
        for token in set(tokens):
            idf_dict[token] += 1
    for token in idf_dict:
        idf_dict[token] = log(N / idf_dict[token])
    return idf_dict

idf_dict = compute_idf(df)
idf_dict

defaultdict(int,
            {'sprouts': 8.517193191416238,
             'button': 4.733003557497976,
             'freshness': 8.517193191416238,
             'for': 2.0039630805039303,
             'frozen': 5.572754212249797,
             'chomp': 8.517193191416238,
             'chocolate': 4.226733750267846,
             'bar': 5.426150738057921,
             'nuggets': 7.824046010856292,
             'breaded': 6.437751649736401,
             'chicken': 4.803621124711929,
             'fingers': 7.1308988302963465,
             'biscuit': 6.725433722188183,
             'pack': 2.8134107167600364,
             'garlic': 5.683979847360021,
             'baked': 6.907755278982137,
             'croutons': 7.824046010856292,
             'herb': 6.437751649736401,
             'oven': 6.214608098422191,
             'swirl': 7.418580902748128,
             'cheese': 4.733003557497976,
             'soft': 4.147745338949216,
             'tablets': 6.437751649736401,
             'or

#### Step 5: Compute TF-IDF

In [22]:
def compute_tf_idf(tf_dict, idf_dict):
    tf_idf_dict = {}
    for token, tf_value in tf_dict.items():
        tf_idf_dict[token] = tf_value * idf_dict[token]
    return tf_idf_dict

df['tf_idf'] = df['tf'].apply(lambda tf: compute_tf_idf(tf, idf_dict))
df.head()

Unnamed: 0,product_name,true_label,predicted_label,tokens,tf,tf_idf
0,frozen for freshness button sprouts,grocery,clothing,"[frozen, for, freshness, button, sprouts]","{'frozen': 0.2, 'for': 0.2, 'freshness': 0.2, ...","{'frozen': 1.1145508424499595, 'for': 0.400792..."
1,chomp chocolate bar,grocery,clothing,"[chomp, chocolate, bar]","{'chomp': 0.3333333333333333, 'chocolate': 0.3...","{'chomp': 2.839064397138746, 'chocolate': 1.40..."
2,20 breaded chicken nuggets,grocery,home and kitchen,"[breaded, chicken, nuggets]","{'breaded': 0.3333333333333333, 'chicken': 0.3...","{'breaded': 2.1459172165788, 'chicken': 1.6012..."
3,fingers chocolate biscuit bar 9 pack,grocery,computer and electronics,"[fingers, chocolate, biscuit, bar, pack]","{'fingers': 0.2, 'chocolate': 0.2, 'biscuit': ...","{'fingers': 1.4261797660592694, 'chocolate': 0..."
4,garlic & herb oven baked croutons,grocery,grocery,"[garlic, herb, oven, baked, croutons]","{'garlic': 0.2, 'herb': 0.2, 'oven': 0.2, 'bak...","{'garlic': 1.1367959694720042, 'herb': 1.28755..."


#### Step 6: Display the TF, IDF, and TF-IDF Matrices

In [23]:
# Get unique terms
unique_terms = set()
for tokens in df['tokens']:
    unique_terms.update(tokens)
unique_terms = sorted(unique_terms)

# TF Matrix
tf_matrix = []
for tf_dict in df['tf']:
    tf_vector = [tf_dict.get(term, 0) for term in unique_terms]
    tf_matrix.append(tf_vector)
tf_df = pd.DataFrame(tf_matrix, columns=unique_terms)

# IDF Vector
idf_vector = [idf_dict[term] for term in unique_terms]
idf_df = pd.DataFrame([idf_vector], columns=unique_terms)

# TF-IDF Matrix
tf_idf_matrix = []
for tf_idf_dict in df['tf_idf']:
    tf_idf_vector = [tf_idf_dict.get(term, 0) for term in unique_terms]
    tf_idf_matrix.append(tf_idf_vector)
tf_idf_df = pd.DataFrame(tf_idf_matrix, columns=unique_terms)

# Display matrices
print("TF Matrix")
print(tf_df)
print("\nIDF Vector")
print(idf_df)
print("\nTF-IDF Matrix")
print(tf_idf_df)


TF Matrix
      _years  aaa  aab  aar  abba  abbey  abdomen  abercombie  abercrombie  \
0        0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
1        0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
2        0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
3        0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
4        0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
...      ...  ...  ...  ...   ...    ...      ...         ...          ...   
4995     0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
4996     0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
4997     0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
4998     0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   
4999     0.0  0.0  0.0  0.0   0.0    0.0      0.0         0.0          0.0   

      aberdeen  ...  zolische  zombicide  zombies  zo

# 2. Compute the global accuracy

In [26]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score

df = pd.read_csv('products_categorization_to_assess.csv')

# Preprocess labels
df['true_label'] = df['true_label'].str.lower().str.split(',')
df['predicted_label'] = df['predicted_label'].str.lower().str.split(',')

# Convert multilabel to binary format
mlb = MultiLabelBinarizer()
true_labels = mlb.fit_transform(df['true_label'])
predicted_labels = mlb.transform(df['predicted_label'])

# Compute accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Global Accuracy: {:.2f}%".format(accuracy * 100))


Global Accuracy: 26.66%


# 3. For each category define precision / recall / f1 score

In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Load data
df = pd.read_csv('products_categorization_to_assess.csv')

# Preprocess labels
df['true_label'] = df['true_label'].str.lower().str.split(',')
df['predicted_label'] = df['predicted_label'].str.lower().str.split(',')

# Convert multilabel to binary format
mlb = MultiLabelBinarizer()
true_labels = mlb.fit_transform(df['true_label'])
predicted_labels = mlb.transform(df['predicted_label'])

# Calculate precision, recall, and f1 score for each category
precision = precision_score(true_labels, predicted_labels, average=None)
recall = recall_score(true_labels, predicted_labels, average=None)
f1 = f1_score(true_labels, predicted_labels, average=None)

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'Category': mlb.classes_,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
})

print(results_df)


                   Category  Precision    Recall  F1 Score
0                  clothing   0.374256  0.368477  0.371344
1  computer and electronics   0.179742  0.185261  0.182460
2                   grocery   0.227273  0.228672  0.227970
3          home and kitchen   0.200000  0.198413  0.199203


# 4. Build a confusion matrix

In [28]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Load data
df = pd.read_csv('products_categorization_to_assess.csv')

# Preprocess labels
df['true_label'] = df['true_label'].str.lower().str.split(',')
df['predicted_label'] = df['predicted_label'].str.lower().str.split(',')

# Convert multilabel to binary format
mlb = MultiLabelBinarizer()
true_labels = mlb.fit_transform(df['true_label'])
predicted_labels = mlb.transform(df['predicted_label'])

# Calculate precision, recall, and f1 score for each category
precision = precision_score(true_labels, predicted_labels, average=None)
recall = recall_score(true_labels, predicted_labels, average=None)
f1 = f1_score(true_labels, predicted_labels, average=None)

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'Category': mlb.classes_,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
})

print(results_df)

# Calculate confusion matrix for each category
confusion_matrices = {}

for i, category in enumerate(mlb.classes_):
    cm = confusion_matrix(true_labels[:, i], predicted_labels[:, i])
    confusion_matrices[category] = cm

# Display the confusion matrices
for category, cm in confusion_matrices.items():
    print(f"Confusion Matrix for {category}:")
    print(cm)
    print("\n")


                   Category  Precision    Recall  F1 Score
0                  clothing   0.374256  0.368477  0.371344
1  computer and electronics   0.179742  0.185261  0.182460
2                   grocery   0.227273  0.228672  0.227970
3          home and kitchen   0.200000  0.198413  0.199203
Confusion Matrix for clothing:
[[1965 1157]
 [1186  692]]


Confusion Matrix for computer and electronics:
[[3197  826]
 [ 796  181]]


Confusion Matrix for grocery:
[[2979  884]
 [ 877  260]]


Confusion Matrix for home and kitchen:
[[3192  800]
 [ 808  200]]




# 5. Do you think the classification system is reliable enough ?

##### For Global Accuracy

Global Accuracy = 26.66%, which indicates that the model correctly classifiers only about 27% of the instances. A global accuracy below 50% generally suggests that the model is not performing well.

##### For Precision, Recall and F1 Score

Clothing: Precision = 0.374256; Recall = 0.368477; F1 Score = 0.371344

Computer and Electronics: Precision = 0.179742; Recall = 0.185261; F1 Score = 0.182460

Grocery: Precision = 0.227273; Recall = 0.228672; F1 Score = 0.227970

Home and Kitchen: Precision = 0.200000; Recall = 0.198413; F1 Score = 0.199203

##### Confusion Matrix

Clothing:

True Positive (TP): 692
False Positive (FP): 1157
True Negative (TN): 1965
False Negative (FN): 1186

High number of false positives and false negatives, indicating misclassifications.


Computer and Electronics:

TP: 181
FP: 826
TN: 3197
FN: 796

Extremely high false positives and false negatives.


Grocery:

TP: 260
FP: 884
TN: 2979
FN: 877

High misclassification rates.


Home and Kitchen:

TP: 200
FP: 800
TN: 3192
FN: 808

Similar issues with high misclassification.


##### The low global accuracy, poor precision, recall, and F1 scores across all categories, combined with the high misclassification rates indicated by the confusion matrices, suggest that the current classification system is not reliable enough for practica use.