In [1]:
import torch
from transformers import pipeline
from sklearn import svm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training CSV. The path is relative, so the notebook has to be run
# from the directory that contains `datasets/`. Later cells read the "tweet"
# and "subtask_a" columns of this frame.
train_csv_path = 'datasets/train_level_a.csv'
train_dataset = pd.read_csv(train_csv_path)
# Bare trailing expression: Jupyter renders the frame as a rich table.
train_dataset

Unnamed: 0,id,tweet,subtask_a
0,86426,@USER She should ask a few native Americans wh...,OFF
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,16820,Amazon is investigating Chinese employees who ...,NOT
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT
...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,OFF
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT
13237,82921,@USER And why report this garbage. We don't g...,OFF
13238,27429,@USER Pussy,OFF


In [8]:
# Turn every training tweet into token-level features with a pretrained
# checkpoint, then right-pad all sequences with zeros to a common length so
# they can be stacked into a single tensor.
#
# NOTE(review): the recorded output of this cell covers 100 tweets, while the
# frame displayed by the loading cell has ~13k rows. The subset was presumably
# taken in a cell that is no longer in the notebook -- confirm before relying
# on Restart & Run All, since embedding the full frame is far more expensive.
checkpoint = "facebook/bart-base"
feature_extractor = pipeline("feature-extraction", framework="pt", model=checkpoint)

train_tweets = train_dataset["tweet"].tolist()
features = []
for tweet in tqdm(train_tweets):
    # One raw (un-padded) tweet per call; `[0].squeeze(0)` strips the leading
    # batch axis. Judging by the recorded output shape, this leaves a 2-D
    # (tokens, hidden) tensor per tweet.
    token_features = feature_extractor(tweet, return_tensors="pt")
    features.append(token_features[0].squeeze(0))

# batch_first=True -> result is laid out as (tweet, token, hidden).
padded_features = torch.nn.utils.rnn.pad_sequence(
    features,
    batch_first=True,
    padding_value=0,
)
padded_features.shape

100%|██████████| 100/100 [00:11<00:00,  8.76it/s]


torch.Size([100, 127, 768])

In [9]:
# Build the classifier inputs: one flat feature vector per tweet and one
# integer label per tweet.
train_matrix = padded_features.numpy()
# Collapse the (token, hidden) axes into a single axis: row i is tweet i.
X_train = train_matrix.reshape(len(train_matrix), -1)

# Fit the encoder on the training labels; `le` is reused by later cells to
# encode the test labels and to name the classes in the report.
le = LabelEncoder()
train_labels = train_dataset["subtask_a"].tolist()
y_train = np.array(le.fit_transform(train_labels))

# Show the label -> integer id mapping the encoder learned, then the shapes.
label_to_id = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_to_id)
print(X_train.shape, y_train.shape)

{'NOT': 0, 'OFF': 1}
(100, 97536) (100,)


In [10]:
# Train a linear-kernel SVM on the flattened training features.
#
# The previous version looped over mini-batches and called `fit` on each one.
# `SVC.fit` is not incremental: every call discards the previous model and
# refits from scratch, so the classifier ended up trained on the LAST batch
# only (and `fit` raises if a batch happens to contain a single class).
# Fitting once on the full training set is what was intended, so the unused
# `batch_size` setting is gone.
#
# If the data ever becomes too large for a single fit, switch to an estimator
# that supports `partial_fit` (for example
# `sklearn.linear_model.SGDClassifier(loss="hinge")`) rather than looping
# `SVC.fit`.
classifier = svm.SVC(kernel='linear')
# The trailing semicolon keeps Jupyter from echoing the estimator repr.
classifier.fit(X_train, y_train);

In [28]:
# Read the test CSV. The path is relative, so the notebook has to be run from
# the directory that contains `datasets/`. Later cells read the "tweet" and
# "label" columns of this frame.
test_csv_path = 'datasets/test_level_a.csv'
test_dataset = pd.read_csv(test_csv_path)
# Bare trailing expression: Jupyter renders the frame as a rich table.
test_dataset

Unnamed: 0,id,tweet,label
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF
1,27014,"#ConstitutionDay is revered by Conservatives, ...",NOT
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT
3,13876,#Watching #Boomer getting the news that she is...,NOT
4,60133,#NoPasaran: Unity demo to oppose the far-right...,OFF
...,...,...,...
855,73439,#DespicableDems lie again about rifles. Dem Di...,OFF
856,25657,#MeetTheSpeakers 🙌 @USER will present in our e...,NOT
857,67018,3 people just unfollowed me for talking about ...,OFF
858,50665,#WednesdayWisdom Antifa calls the right fascis...,NOT


In [48]:
# Embed the test tweets with the same extractor used for training, then force
# every feature matrix to the training layout so that the flattened vectors
# have exactly the columns the classifier was fitted on.
max_len, hidden_size = padded_features.shape[1:]

test_features_raw = [
    feature_extractor(text, return_tensors="pt")[0].squeeze(0)
    for text in tqdm(test_dataset["tweet"].tolist())
]

test_features = []
for feature in test_features_raw:
    # Slicing is a no-op for sequences that already fit, so no length check
    # is needed: longer sequences are truncated, shorter ones pass through.
    truncated = feature[:max_len]
    # `pad` lists (left, right) amounts starting from the LAST dimension:
    # first the hidden axis, then the token axis. Only trailing zeros are
    # added, mirroring how the training sequences were padded.
    pad_hidden = hidden_size - truncated.shape[1]
    pad_tokens = max_len - truncated.shape[0]
    test_features.append(
        torch.nn.functional.pad(
            truncated,
            pad=(0, pad_hidden, 0, pad_tokens),
            mode="constant",
            value=0,
        )
    )

X_test = torch.stack(test_features).numpy()
X_test = X_test.reshape(X_test.shape[0], -1)

# Reuse the encoder fitted on the training labels. Calling `fit_transform`
# here would refit it on the test labels and could silently change the
# label -> id mapping (e.g. if a class is missing from the test split), making
# `Y_test` incomparable with the classifier's predictions. `transform` raises
# on a label that was never seen in training, which is the failure we want.
Y_test = le.transform(test_dataset["label"].tolist())

# Report the TEST matrix shape (the training shape was printed here before).
print(X_test.shape, Y_test.shape)
Y_predicted = classifier.predict(X_test)

100%|██████████| 10/10 [00:01<00:00,  6.52it/s]

(10, 32256) (10,)





(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0], dtype=int64))

In [51]:
# Score the predictions against the encoded test labels: overall accuracy
# first, then the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(Y_test, Y_predicted)
print(test_accuracy)

# NOTE(review): `zero_division=1` reports a score of 1.0 wherever the metric
# is undefined, so a class that is never predicted shows precision 1.00 (as in
# the recorded output for NOT) and lifts the averaged precision. Consider
# `zero_division=0` if that is not intended.
report = classification_report(
    Y_test,
    Y_predicted,
    target_names=le.classes_,
    zero_division=1,
)
print(report)

0.4
              precision    recall  f1-score   support

         NOT       1.00      0.00      0.00         6
         OFF       0.40      1.00      0.57         4

    accuracy                           0.40        10
   macro avg       0.70      0.50      0.29        10
weighted avg       0.76      0.40      0.23        10

