## Subcategory SVM

In [1]:
from  sklearn import svm as sksvm
import pandas as pd
import numpy as np
import datasets
import sklearn
import joblib
import dotenv
import os


In [2]:
dotenv.load_dotenv()
HF_TOKEN = os.environ['HF_TOKEN']
SEED = 42
TRAINING = True
datasetTotal = datasets.load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset', token=HF_TOKEN)

In [31]:
valSet = pd.DataFrame(datasetTotal['validation'])[['subcategory', 'label', 'category','type']]
trainSet = pd.DataFrame(datasetTotal['train'])[['subcategory', 'label', 'category', 'type']]

In [46]:
subcatTrain = set(trainSet['subcategory'])
subcatVal = set(valSet['subcategory'])

subcat = subcatTrain.union(subcatVal)
print(len(subcat))
aliasesSub = {k:i for i,k in enumerate(subcat)}
print(aliasesSub)

catTrain = set(trainSet['category'])
catVal = set(valSet['category'])

cat = catTrain.union(catVal)
print(len(cat))

aliasesCat = {k:i for i,k in enumerate(cat)}
print(aliasesCat)

typeTrain = set(trainSet['type'])
typeVal = set(valSet['type'])

typeTot = typeTrain.union(typeVal)
aliasesType = {k:i for i,k in enumerate(typeTot)}
print(len(typeTot))
aliasLabel = {
    'cultural exclusive':0,
    'cultural agnostic':1,
    'cultural representative':2
}


112
{'book': 0, 'cooking technique': 1, 'river': 2, 'religion': 3, 'archive': 4, 'comics artist': 5, 'station': 6, 'art gallery': 7, 'television': 8, 'environment': 9, 'sports equipment': 10, 'choreographer': 11, 'dance': 12, 'mountain': 13, 'comics': 14, 'city': 15, 'biologist': 16, 'theatrical genre': 17, 'photographer': 18, 'cook': 19, 'construction': 20, 'museum': 21, 'painting': 22, 'clothing': 23, 'happening': 24, 'film producer': 25, 'tradition': 26, 'traditional costume': 27, 'building material': 28, 'automobile manufacturer': 29, 'food': 30, 'model': 31, 'ingredient': 32, 'film genre': 33, 'non-fiction writer': 34, 'folk dance': 35, 'record label': 36, 'historical event': 37, 'artist': 38, 'writing style': 39, 'media company': 40, 'athlete': 41, 'architect': 42, 'transport company': 43, 'government': 44, 'transport': 45, 'film': 46, 'animal': 47, 'tree': 48, 'philosopher': 49, 'poet': 50, 'sports team': 51, 'film studio': 52, 'mores': 53, 'magazine': 54, 'neighborhood': 55, 'g

In [58]:
subCategoryEncodingTrain = np.zeros((trainSet.shape[0], len(subcat)), dtype=int)
categoryEncodingTrain = np.zeros((trainSet.shape[0], len(cat)), dtype=int)
typeEncodingTrain = np.zeros((trainSet.shape[0], len(typeTot)), dtype=int)

finalTrainLabel = trainSet['label'].apply(lambda x: aliasLabel[x])

for i in range(trainSet.shape[0]):
    subCategoryEncodingTrain[i][aliasesSub[trainSet['subcategory'].iloc[i]]] = 1
    categoryEncodingTrain[i][aliasesCat[trainSet['category'].iloc[i]]] = 1
    typeEncodingTrain[i][aliasesType[trainSet['type'].iloc[i]]] = 1
    
finalTrainData = np.concatenate((subCategoryEncodingTrain, categoryEncodingTrain, typeEncodingTrain), axis=1)
print(finalTrainData.shape)

(6251, 133)


In [59]:
subCategoryEncodingVal = np.zeros((valSet.shape[0], len(subcat)), dtype=int)
categoryEncodingVal = np.zeros((valSet.shape[0], len(cat)), dtype=int)
typeEncodingVal = np.zeros((valSet.shape[0], len(typeTot)), dtype=int)

finalValLabel = valSet['label'].apply(lambda x: aliasLabel[x])

for i in range(valSet.shape[0]):
    subCategoryEncodingVal[i][aliasesSub[valSet['subcategory'].iloc[i]]] = 1
    categoryEncodingVal[i][aliasesCat[valSet['category'].iloc[i]]] = 1
    typeEncodingVal[i][aliasesType[valSet['type'].iloc[i]]] = 1
    
finalValData = np.concatenate((subCategoryEncodingVal, categoryEncodingVal, typeEncodingVal), axis=1)
print(finalValData.shape)

(300, 133)


In [60]:
def trainPass(scale, gamma, kernel, features, labels, validation_features, validation_labels):
    modelSubcategory = sksvm.SVC(kernel=kernel, 
                                C=scale, 
                                gamma=gamma,
                                class_weight={
                                        2:0.2700,
                                        1:0.2994,
                                        0:0.4305
                                    }, 
                                probability=True, 
                                random_state=SEED,
                                max_iter=10000)
    modelSubcategory.fit(features, labels)
    accuracy_score = modelSubcategory.score(validation_features, validation_labels)
    return accuracy_score, modelSubcategory

In [70]:
if TRAINING:
    bestScore = 0
    bestModel = None
    for k in ['rbf', 'sigmoid']:
        for s in np.linspace(1,1,1):
            for c in np.linspace(10,10,1):
                scoreValidation, model =  trainPass(c,s,k,
                                          finalTrainData,
                                          finalTrainLabel,
                                          finalValData,
                                          finalValLabel)
                if scoreValidation > bestScore:
                    bestModel = model
                    bestScore = scoreValidation
                print(f'Kernel: {k}, C: {c:.4f}, Gamma: {s:.4f} -> {scoreValidation}')
    print(f"Accurecy: {bestScore*100:.4f}")
    print(bestModel)            

Kernel: rbf, C: 10.0000, Gamma: 1.0000 -> 0.5933333333333334
Kernel: sigmoid, C: 10.0000, Gamma: 1.0000 -> 0.4633333333333333
Accurecy: 59.3333
SVC(C=10.0, class_weight={0: 0.4305, 1: 0.2994, 2: 0.27}, gamma=1.0,
    max_iter=10000, probability=True, random_state=42)


Best model: kernel='rbf', C=10, gamma=1

In [71]:
if TRAINING:
    joblib.dump(bestModel, 'categorySVCWeights.pkl')

In [72]:
if not TRAINING:
    svm = joblib.load('categorySVCWeights.pkl')
else:
    svm = bestModel
    
results = svm.predict_proba(finalValData)
confidence = results[np.arange(results.shape[0]), np.argmax(results, axis=1)]*100

print(f'Max confidence: {confidence.max():.4f}%')
print(f'Min confidence: {confidence.min():.4f}%')
print(f'Mean confidence: {confidence.mean():.4f}%')
print(f'Std confidence: {confidence.std():.4f}%')

Max confidence: 71.5174%
Min confidence: 38.2382%
Mean confidence: 64.8531%
Std confidence: 6.3313%
