# Description SVM

## Libraries

In [1]:
import nltk
import time
import math
import tqdm
import json
import spacy
import requests
import numpy as np
import pandas as pd
import seaborn as sns


from sklearn.svm import SVC
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from wikidata.client import Client
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from sklearn.metrics import classification_report, accuracy_score, f1_score

nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/epicmusk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/epicmusk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Collecting the dataset

In [2]:
dir_path   =  "/mnt/c/Users/fede6/Desktop/AI_R/Formazione/MNLP/HW1/"
train_path =  dir_path + "train.csv"
dev_path   =  dir_path + "valid.csv"

train_df = pd.read_csv(train_path, encoding='utf-8')
dev_df = pd.read_csv(dev_path, encoding='utf-8')

## Extraction of the descriptions

In [3]:
train_docs = train_df['description']
dev_docs = dev_df['description']

## TF_IDF computation

### Methods

In [4]:
STOPWORDS = set(stopwords.words('english'))

def tokenize(doc):
    tokens = word_tokenize(doc.lower())
    tokens = [w for w in tokens if w.isalpha() and w not in STOPWORDS]
    return tokens

def tf(doc):
    tf_ = {}
    words = tokenize(doc)
    cnt = Counter(words)
    normalization_factor = len(words)
    for word, occs in cnt.items():
        tf_[word] = occs / normalization_factor
        
    return tf_

def idf(texts):
    idf_ = {}
    N = len(texts)
    tokenized_docs = [set(tokenize(doc)) for doc in texts]
    all_words = set(word for doc in tokenized_docs for word in doc)
    
    for word in tqdm.tqdm(all_words):
        doc_cnt = sum(1 for doc in tokenized_docs if word in doc)
        idf_[word] = math.log((1 + N) / (1 + doc_cnt)) + 1
    return idf_

def idf_by_category(df):
    categorical_idf = {}
    grouped = df.groupby('category')['description'].apply(list)

    for cat, docs in grouped.items():
        categorical_idf[cat] = idf(docs)

    return categorical_idf

def tf_idf(tf_, idf_):
    return {word: tf_val * idf_.get(word, 0.0) for word, tf_val in tf_.items()}

def vectorize(tfidf_dict, word_index):
    vec = np.zeros(len(word_index))
    for word, value in tfidf_dict.items():
        if word in word_index:
            vec[word_index[word]] = value
    return vec

### Test TF_IDF

### Classic

In [5]:
idfs = idf(train_docs)
classic_tfidf = []

for doc in tqdm.tqdm(train_docs):
    tf_ = tf(doc)
    tfidf = tf_idf(tf_, idfs)
    classic_tfidf.append(tfidf)

print(classic_tfidf)

100%|████████████████████████████████████████████████████████████████████| 7168/7168 [00:01<00:00, 4325.53it/s]
100%|███████████████████████████████████████████████████████████████████| 6251/6251 [00:00<00:00, 12231.98it/s]

[{'film': 1.9861678478737976, 'mohanan': 4.523754755490711}, {'american': 1.3758428617177656, 'band': 1.882104043106422, 'california': 2.479357199515774}, {'mort': 2.598248847495351, 'phil': 2.553738383287177, 'comic': 1.8167324167976586}, {'american': 2.0637642925766486, 'band': 2.823156064659633}, {'building': 1.7324539697571213, 'monmouth': 2.880681467624419, 'wales': 2.391902444693277}, {'term': 2.0256983484705735, 'city': 1.6892759523598202, 'center': 2.4475871395809987}, {'category': 1.9486866356215136, 'south': 1.4115780323298166, 'pacific': 2.2618773777453556, 'cyclone': 2.2618773777453556}, {'novel': 2.232044751272648, 'james': 2.6496324074377706, 'patterson': 3.015836503660474}, {'attempt': 2.160511100718314, 'assassinate': 2.2618773777453556, 'adolf': 2.032804694776817, 'hitler': 2.032804694776817}, {'american': 0.5896469407361853, 'film': 0.5674765279639421, 'studio': 0.7785996071989965, 'owned': 1.02510104772569, 'walt': 1.1616026827296098, 'disney': 1.1355567460447589, 'c




### Enhanced

In [6]:
idf_cat = idf_by_category(train_df)
enhanced_tfidf = []

for _, row in tqdm.tqdm(train_df.iterrows(), total=len(train_df)):
    doc = row['description']
    cat = row['category']
    enh_tf_ = tf(doc)
    enh_idf_ = idf_cat.get(cat, {})
    enh_tfidf = tf_idf(enh_tf_, enh_idf_)
    enhanced_tfidf.append(enh_tfidf)

print(enhanced_tfidf)

100%|███████████████████████████████████████████████████████████████████| 1001/1001 [00:00<00:00, 84958.89it/s]
100%|█████████████████████████████████████████████████████████████████████| 674/674 [00:00<00:00, 68844.48it/s]
100%|████████████████████████████████████████████████████████████████████| 442/442 [00:00<00:00, 136757.33it/s]
100%|████████████████████████████████████████████████████████████████████| 421/421 [00:00<00:00, 101916.31it/s]
100%|█████████████████████████████████████████████████████████████████████| 846/846 [00:00<00:00, 87545.18it/s]
100%|████████████████████████████████████████████████████████████████████| 469/469 [00:00<00:00, 128823.09it/s]
100%|█████████████████████████████████████████████████████████████████████| 720/720 [00:00<00:00, 86191.71it/s]
100%|█████████████████████████████████████████████████████████████████████| 885/885 [00:00<00:00, 69814.35it/s]
100%|████████████████████████████████████████████████████████████████████| 796/796 [00:00<00:00, 118623.

[{'film': 0.6500522962251691, 'mohanan': 2.952637389219215}, {'american': 1.0905176612487724, 'band': 0.9315429705520026, 'california': 1.8229258536941786}, {'mort': 1.6000976804949467, 'phil': 1.5555872162867725, 'comic': 0.8373655619809646}, {'american': 1.6357764918731585, 'band': 1.397314455828004}, {'building': 0.8470950080656247, 'monmouth': 1.9617746640063602, 'wales': 1.8658806398557666}, {'term': 1.7430879600404903, 'city': 0.8755247315590291, 'center': 1.803861812305142}, {'category': 1.5067911490118666, 'south': 1.2321380768448391, 'pacific': 1.5067911490118666, 'cyclone': 1.5067911490118666}, {'novel': 1.1060837365794816, 'james': 1.7213593000789251, 'patterson': 1.85651433611498}, {'attempt': 1.5067911490118666, 'assassinate': 1.5067911490118666, 'adolf': 1.4054248719848255, 'hitler': 1.4054248719848255}, {'american': 0.41354167073232717, 'film': 0.3919955436273866, 'studio': 0.43892469605196316, 'owned': 0.694890334513114, 'walt': 0.7528139213857088, 'disney': 0.752813921




## Test SVM

## Classic vocabulary

In [7]:
classic_dict = [tf_idf(tf(doc), idfs) for doc in train_df['description']]

In [8]:
classic_vocab = sorted(set(word for doc in train_df['description'] for word in doc.split()))
classic_word_index = {word: idx for idx, word in enumerate(classic_vocab)}

val_docs = [tf_idf(tf(doc), idfs) for doc in dev_df['description']]

classic_X = np.array([vectorize(doc_tfidf, classic_word_index) for doc_tfidf in classic_dict])
classic_X_val = np.array([vectorize(doc_tfidf, classic_word_index) for doc_tfidf in val_docs])
classic_y = np.array(train_df['label'])
classic_y_val = np.array(dev_df['label'])

## Enhanced Vocabulary

In [9]:
enh_dict = []
for _, row in train_df.iterrows():
    enh_dict.append(tf_idf(tf(row['description']), idf_cat.get(row['category'], {})))
    
enhanced_vocab = sorted(set(word for doc in train_df['description'] for word in tokenize(doc)))
enhanced_word_index = {word: idx for idx, word in enumerate(enhanced_vocab)}

val_docs = []
for doc, cat in zip(dev_df['description'], dev_df['category']):
    val_docs.append(tf_idf(tf(doc), idf_cat.get(cat, {})))

enhanced_X = np.array([vectorize(doc_tfidf, enhanced_word_index) for doc_tfidf in enh_dict])
enhanced_X_val = np.array([vectorize(doc_tfidf, enhanced_word_index) for doc_tfidf in val_docs])
enhanced_y = np.array(train_df['label'])
enhanced_y_val = np.array(dev_df['label'])

### Class weights computation

In [10]:
weights_dict = {
    "cultural agnostic": 0.2995,
    "cultural representative": 0.2700,
    "cultural exclusive": 0.4305
}

def weights_computation(freq_dict):
    weights = {k: 1/v for k, v in freq_dict.items()}
    norm_factor = sum(weights.values())
    norm_weights = {k: v/norm_factor for k, v in weights.items()}
    return norm_weights

class_weights = weights_computation(freq_dict=weights_dict)
print(f"{class_weights}")

{'cultural agnostic': 0.3565110774234955, 'cultural representative': 0.395463213660507, 'cultural exclusive': 0.24802570891599746}


## Test models

### Classic TF-IDF

In [11]:
from itertools import product

C = 10
gamma_values = ['scale', 'auto']
kernels = ['linear', 'rbf', 'sigmoid']

classic_best_score = 0
classic_best_params = {}
classic_best_model = None

for gamma, kernel in product(gamma_values, kernels):
    print(f"Testing: C={C}, gamma={gamma}, kernel={kernel}")
    
    classic_svc = SVC(C=C, gamma=gamma, kernel=kernel, class_weight=class_weights)
    
    try:
        classic_svc.fit(classic_X, classic_y)

        y_pred = classic_svc.predict(classic_X_val)
        score = accuracy_score(classic_y_val, y_pred)

        print(f"Accuracy: {score:.4f}")
        print(f"F1 macro score: {f1_score(classic_y_val, y_pred, average='macro'):.4f}")

        if score > classic_best_score:
            classic_best_score = score
            classic_best_params = {'C': C, 'gamma': gamma, 'kernel': kernel}
            classic_best_model = classic_svc

    except Exception as e:
        print(f"ERROR! Params:\tC={C}, gamma={gamma}, kernel={kernel}: {e}")

Testing: C=10, gamma=scale, kernel=linear
Accuracy: 0.5167
F1 macro score: 0.5146
Testing: C=10, gamma=scale, kernel=rbf
Accuracy: 0.5800
F1 macro score: 0.5702
Testing: C=10, gamma=scale, kernel=sigmoid
Accuracy: 0.5700
F1 macro score: 0.5583
Testing: C=10, gamma=auto, kernel=linear
Accuracy: 0.5167
F1 macro score: 0.5146
Testing: C=10, gamma=auto, kernel=rbf
Accuracy: 0.3567
F1 macro score: 0.1753
Testing: C=10, gamma=auto, kernel=sigmoid
Accuracy: 0.3567
F1 macro score: 0.1753


In [13]:
print("Best parameters found:", classic_best_params)
y_pred = classic_best_model.predict(classic_X_val)
print(classification_report(classic_y_val, y_pred))

Best parameters found: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
                         precision    recall  f1-score   support

      cultural agnostic       0.78      0.70      0.74       117
     cultural exclusive       0.42      0.66      0.51        76
cultural representative       0.56      0.39      0.46       107

               accuracy                           0.58       300
              macro avg       0.59      0.58      0.57       300
           weighted avg       0.61      0.58      0.58       300



### Enhanced TF-IDF

In [21]:
C_vals = [1, 10, 100]
gamma = 'scale'
kernels = ['linear', 'rbf', 'sigmoid']

for C, kernel in product(C_vals, kernels):
    print(f"Testing: C={C}, gamma={gamma}, kernel={kernel}")
    
    enhanced_svc = SVC(C=C, gamma=gamma, kernel=kernel, class_weight=class_weights)
    
    try:
        enhanced_svc.fit(enhanced_X, enhanced_y)

        y_pred = enhanced_svc.predict(enhanced_X_val)
        score = accuracy_score(enhanced_y_val, y_pred)

        print(f"Accuracy: {score:.4f}")
        print(f"F1 macro score: {f1_score(enhanced_y_val, y_pred, average='macro'):.4f}")

        if score > enhanced_best_score:
            enhanced_best_score = score
            enhanced_best_params = {'C': C, 'gamma': gamma, 'kernel': kernel}
            enhanced_best_model = enhanced_svc

    except Exception as e:
        print(f"ERROR! Params:\tC={C}, gamma={gamma}, kernel={kernel}: {e}")

Testing: C=1, gamma=scale, kernel=linear
Accuracy: 0.5933
F1 macro score: 0.5700
Testing: C=1, gamma=scale, kernel=rbf
Accuracy: 0.5833
F1 macro score: 0.5376
Testing: C=1, gamma=scale, kernel=sigmoid
Accuracy: 0.6067
F1 macro score: 0.5719
Testing: C=10, gamma=scale, kernel=linear
Accuracy: 0.5800
F1 macro score: 0.5749
Testing: C=10, gamma=scale, kernel=rbf
Accuracy: 0.6133
F1 macro score: 0.5879
Testing: C=10, gamma=scale, kernel=sigmoid
Accuracy: 0.5733
F1 macro score: 0.5530
Testing: C=100, gamma=scale, kernel=linear
Accuracy: 0.5233
F1 macro score: 0.5227
Testing: C=100, gamma=scale, kernel=rbf
Accuracy: 0.5833
F1 macro score: 0.5612
Testing: C=100, gamma=scale, kernel=sigmoid
Accuracy: 0.4867
F1 macro score: 0.4821


In [22]:
print("Best parameters found:", enhanced_best_params)
y_pred = enhanced_best_model.predict(enhanced_X_val)
print(classification_report(enhanced_y_val, y_pred))

Best parameters found: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
                         precision    recall  f1-score   support

      cultural agnostic       0.72      0.84      0.77       117
     cultural exclusive       0.46      0.50      0.48        76
cultural representative       0.59      0.45      0.51       107

               accuracy                           0.61       300
              macro avg       0.59      0.60      0.59       300
           weighted avg       0.61      0.61      0.61       300

