## Subcategory SVM

In [12]:
from  sklearn import svm as sksvm
import pandas as pd
import numpy as np
import datasets
import sklearn
import joblib
import dotenv
import os


In [13]:
dotenv.load_dotenv()
HF_TOKEN = os.environ['HF_TOKEN']
SEED = 42
TRAINING = False
datasetTotal = datasets.load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset', token=HF_TOKEN)

In [14]:
valSet = pd.DataFrame(datasetTotal['validation'])[['subcategory', 'label', 'category','type']]
trainSet = pd.DataFrame(datasetTotal['train'])[['subcategory', 'label', 'category', 'type']]

In [15]:
subcatTrain = set(trainSet['subcategory'].unique())
subcatVal = set(valSet['subcategory'].unique())

subcat = subcatTrain.union(subcatVal)

aliasesSub = {k:i for i,k in enumerate(subcat)}
print(aliasesSub)

catTrain = set(trainSet['category'].unique())
catVal = set(valSet['category'].unique())

cat = catTrain.union(catVal)

aliasesCat = {k:i for i,k in enumerate(cat)}
print(aliasesCat)

typeTrain = set(trainSet['type'].unique())
typeVal = set(valSet['type'].unique())

typeTot = typeTrain.union(typeVal)
aliasesType = {k:i for i,k in enumerate(typeTot)}

aliasLabel = {
    'cultural exclusive':0,
    'cultural agnostic':1,
    'cultural representative':2
}


{'archive': 0, 'fish': 1, 'folk dance': 2, 'literary genre': 3, 'religious book': 4, 'religious leader': 5, 'sports equipment': 6, 'publisher': 7, 'biologist': 8, 'production company': 9, 'musician': 10, 'musical profession': 11, 'visual arts': 12, 'food': 13, 'political party': 14, 'plant': 15, 'construction': 16, 'religious movement': 17, 'environment': 18, 'drink': 19, 'mores': 20, 'mode of transport': 21, 'manga': 22, 'athlete': 23, 'poetry': 24, 'mountain': 25, 'music genre': 26, 'traditional costume': 27, 'bookstore': 28, 'ritual': 29, 'body language': 30, 'philosophy': 31, 'designer': 32, 'poet': 33, 'philosopher': 34, 'animation studio': 35, 'recurring sporting event': 36, 'painting': 37, 'comics': 38, 'fashion trend': 39, 'museum': 40, 'writer': 41, 'architect': 42, 'musical group': 43, 'tradition': 44, 'building': 45, 'architectural structure': 46, 'film studio': 47, 'cooking technique': 48, 'theatrical genre': 49, 'gesture': 50, 'neighborhood': 51, 'media company': 52, 'clot

In [16]:
Ytrain = trainSet['label'].apply(lambda x: aliasLabel[x])
Xtrain1 = trainSet['subcategory'].apply(lambda x: aliasesSub[x])
Xtrain2 = trainSet['category'].apply(lambda x: aliasesCat[x])
Xtrain3 = trainSet['type'].apply(lambda x: aliasesType[x])
Xtrain = pd.DataFrame({'subcategory':Xtrain1, 'category':Xtrain2, 'type': Xtrain3})

In [17]:
Yval = valSet['label'].apply(lambda x: aliasLabel[x])
Xval1 = valSet['subcategory'].apply(lambda x: aliasesSub[x])
Xval2 = valSet['category'].apply(lambda x: aliasesCat[x])
Xval3 = valSet['type'].apply(lambda x: aliasesType[x])
Xval = pd.DataFrame({'subcategory':Xval1, 'category':Xval2, 'type': Xval3})

In [18]:
Xval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   subcategory  300 non-null    int64
 1   category     300 non-null    int64
 2   type         300 non-null    int64
dtypes: int64(3)
memory usage: 7.2 KB


In [None]:
def trainPass(scale, gamma, kernel, features, labels, validation_features, validation_labels):
    modelSubcategory = sksvm.SVC(kernel=kernel, 
                                C=scale, 
                                gamma=gamma,
                                degree=2 if kernel == 'poly' else 0,
                                class_weight={
                                        2:0.2700,
                                        1:0.2994,
                                        0:0.4305
                                    }, 
                                probability=True, 
                                random_state=SEED,
                                max_iter=10000)
    modelSubcategory.fit(features, labels)
    accuracy_score = modelSubcategory.score(validation_features, validation_labels)
    return accuracy_score, modelSubcategory

In [None]:
if TRAINING:
    bestScore = 0
    bestModel = None
    for k in ['rbf']:
        for s in np.linspace(1,1.5,10):
            for c in np.linspace(10,10,1):
                score, model =  trainPass(c,s,k,Xtrain,Ytrain,Xval,Yval)
                if score > bestScore:
                    bestModel = model
                    bestScore = score
                print(f'Kernel: {k}, C: {c:.4f}, Gamma: {s:.4f} -> {score}')
    print(f"Accurecy: {bestScore*100:.4f}")
    print(bestModel)            

Best model: kernel='rbf', C=10, gamma=1

In [21]:
if TRAINING:
    joblib.dump(bestModel, 'categorySVCWeights.pkl')

In [49]:
if not TRAINING:
    svm = joblib.load('categorySVCWeights.pkl')
    
results = svm.predict_proba(Xval)
confidence = results[np.arange(results.shape[0]), np.argmax(results, axis=1)]*100

print(f'Max: {confidence.max():.4f}%')
print(f'Min: {confidence.min():.4f}%')
print(f'Mean: {confidence.mean():.4f}%')
print(f'Std: {confidence.std():.4f}%')

Max: 71.4936%
Min: 34.6141%
Mean: 41.0117%
Std: 6.2554%
