# SVM baseline for MetaHate

In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, f1_score

from tqdm import tqdm

## Reading the data

In [15]:
data = pd.read_csv('../../../data/processed/within-dataset/metahate_train.tsv', sep='\t', names=['label', 'text'])

data = data.dropna(subset=['text'])

data['text'] = data['text'].astype(str)

texts_train = data['text'].tolist()
labels_train = data['label'].tolist()

data = pd.read_csv('../../../data/processed/within-dataset/metahate_test.tsv', sep='\t', names=['label', 'text'])

data = data.dropna(subset=['text'])

data['text'] = data['text'].astype(str)

texts_test = data['text'].tolist()
labels_test = data['label'].tolist()

  data = pd.read_csv('../../../data/processed/within-dataset/metahate_train.tsv', sep='\t', names=['label', 'text'])


## Creating a TF-IDF vectorizer for text data

In [16]:
vectorizer = TfidfVectorizer(
    max_features=10000000,     # Maximum number of features to consider
    ngram_range=(1, 1),        # Considering unigrams (single words)
    stop_words='english',      # Ignoring common English stop words
    sublinear_tf=True,         # Applying sublinear scaling to term frequency
    use_idf=True               # Using Inverse Document Frequency (IDF)
)

## Fitting and transforming the training and testing data

In [17]:
X_train_vectorized = vectorizer.fit_transform(tqdm(texts_train, desc='Fitting and transforming training data'))
X_test_vectorized = vectorizer.transform(tqdm(texts_test, desc='Transforming testing data'))

Fitting and transforming training data:   0%|          | 0/880932 [00:00<?, ?it/s]

Fitting and transforming training data: 100%|██████████| 880932/880932 [01:17<00:00, 11354.14it/s]
Transforming testing data: 100%|██████████| 220234/220234 [00:22<00:00, 9752.83it/s] 


## Creating and training the SVM model

In [18]:
svm_model = LinearSVC(random_state=0, tol=1e-5)
svm_model.fit(X_train_vectorized, labels_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,1e-05
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


## Making predictions on the test set

In [19]:
predictions = svm_model.predict(X_test_vectorized)

## Calculating the metrics

In [20]:
accuracy = accuracy_score(labels_test, predictions)
report = classification_report(labels_test, predictions)
weighted_f1 = f1_score(labels_test, predictions, average='weighted')
micro_f1 = f1_score(labels_test, predictions, average='micro')
macro_f1 = f1_score(labels_test, predictions, average='macro')

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print(f"Weighted F1 Score: {weighted_f1}")
print(f"Micro F1 Score: {micro_f1}")
print(f"Macro F1 Score: {macro_f1}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Accuracy: 0.8656565289646467
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92    173537
           1       0.73      0.59      0.65     46696
       label       0.00      0.00      0.00         1

    accuracy                           0.87    220234
   macro avg       0.54      0.51      0.52    220234
weighted avg       0.86      0.87      0.86    220234

Weighted F1 Score: 0.8601903455941442
Micro F1 Score: 0.8656565289646467
Macro F1 Score: 0.522112514113509
