In [1]:
import fasttext
import fasttext.util
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Pretrained English fastText vectors: fetch the model file, then load it.
# (`if_exists='ignore'` is meant to skip the download when the file is already present.)
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# Training corpus (OLID): read the TSV and shuffle all rows with a fixed seed.
df = pd.read_csv('datasets/cleaned_OLID.tsv', sep="\t").sample(frac=1, random_state=42)

# External evaluation corpus, shuffled the same way.
# Alternative test set that can be swapped in:
# df_test = pd.read_csv('datasets/cleaned_SOLIDtest6K_trainer.tsv', sep="\t")
df_test = pd.read_csv('datasets/cleaned_hatespeech_offensive.tsv', sep="\t").sample(frac=1, random_state=42)

# Pull the text and label columns out as NumPy arrays.
tweets, labels = df['tweet'].values, df['label'].values
test_tweets, test_labels = df_test['tweet'].values, df_test['label'].values

# Hold out 20% of the training corpus for validation (seeded, so the split is repeatable).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweets, labels, test_size=0.2, random_state=42
)

# Tokenize and encode the training and validation texts using FastText
def embed_text(text, model):
    """Embed a piece of text as the average of its word vectors.

    The text is split on whitespace, each token is looked up with
    ``model.get_word_vector`` and the resulting vectors are averaged
    element-wise.

    Args:
        text: The text to embed. ``None`` and float NaN (what pandas yields for
            an empty TSV cell) are treated like an empty string.
        model: Object exposing ``get_word_vector(word)`` and
            ``get_dimension()`` (e.g. a loaded fastText model).

    Returns:
        A 1-D NumPy array. For text without any tokens this is an all-zero
        vector of length ``model.get_dimension()``.
    """
    # Missing values would otherwise fail on `.split()` with AttributeError.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""

    words = text.split()
    if not words:
        # np.mean([]) returns a scalar NaN (plus a RuntimeWarning), which would
        # make the stacked feature matrix ragged / NaN-polluted. Return a
        # neutral vector of the right size instead.
        # float32 is assumed to match the dtype of fastText word vectors — TODO confirm.
        return np.zeros(model.get_dimension(), dtype=np.float32)

    word_vectors = [model.get_word_vector(word) for word in words]
    return np.mean(word_vectors, axis=0)  # Average word vectors

# Embed every tweet of each split and stack the per-tweet vectors into one array per split.
train_encodings, val_encodings, test_encodings = (
    np.array([embed_text(text, ft) for text in split])
    for split in (train_texts, val_texts, test_tweets)
)

# Number of words known to the model (kept for reference; not used by the cells shown here)
vocab_size = len(ft.words)

# The embeddings are materialised, so drop the reference to the model object.
del ft



In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class TweetDataset(Dataset):
    """Index-addressable pairing of precomputed tweet embeddings with their labels."""

    def __init__(self, encodings, labels):
        """Store the embeddings and labels as given (no copy, no conversion).

        Args:
            encodings: Indexable collection; ``encodings[idx]`` is the embedding of sample ``idx``.
            labels: Indexable, sized collection; ``labels[idx]`` is the label of sample ``idx``.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the labels collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors with keys 'embedding' and 'labels'."""
        return {
            'embedding': torch.tensor(self.encodings[idx]),
            'labels': torch.tensor(self.labels[idx]),
        }
    
def collate_fn(batch):
    """Merge a list of dataset items into one batch dictionary.

    Args:
        batch: Sequence of dicts, each with an 'embedding' tensor and a
            scalar 'labels' entry.

    Returns:
        Dict with:
          'embedding': the embeddings zero-padded along their first dimension
              to the longest one in the batch, batch dimension first;
          'labels': 1-D tensor built from the per-item labels;
          'lengths': 1-D tensor holding ``len()`` of each original embedding.
    """
    vectors, targets = [], []
    for sample in batch:
        vectors.append(sample['embedding'])
        targets.append(sample['labels'])

    label_tensor = torch.tensor(targets)
    size_tensor = torch.tensor([len(vec) for vec in vectors])
    padded = pad_sequence(vectors, batch_first=True, padding_value=0.0)
    return {'embedding': padded, 'labels': label_tensor, 'lengths': size_tensor}

# Wrap each (embeddings, labels) pair in a TweetDataset: train, validation, external test.
train_dataset, val_dataset, test_dataset = (
    TweetDataset(features, targets)
    for features, targets in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, collate_fn=collate_fn)
# val_loader = DataLoader(val_dataset, batch_size=12, shuffle=False, collate_fn=collate_fn)
# test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False, collate_fn=collate_fn)


In [3]:
# Sanity check: one row per test tweet, one column per embedding dimension
test_encodings.shape

(24783, 300)

In [4]:
from sklearn import svm

# Support-vector classifier with scikit-learn's default hyperparameters,
# trained on the averaged-embedding features of the training split.
model = svm.SVC()

# Left as the trailing expression so the notebook still echoes the fitted estimator.
model.fit(X=train_dataset.encodings, y=train_dataset.labels)

In [5]:
# Predict labels for the 20% validation split held out from the training corpus
prediction_list = model.predict(val_dataset.encodings)

In [6]:
# Print the estimator's repr to record which SVC configuration produced the results below
print(model)

SVC()


In [7]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the validation split:
# `val_labels` are the true labels, `prediction_list` the model's validation predictions.
report = classification_report(y_true=val_labels, y_pred=prediction_list)

print(report)

              precision    recall  f1-score   support

           0       0.74      0.96      0.84      1764
           1       0.82      0.32      0.46       884

    accuracy                           0.75      2648
   macro avg       0.78      0.64      0.65      2648
weighted avg       0.77      0.75      0.71      2648



In [8]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the validation split (rows: true label, columns: predicted label).
cm = confusion_matrix(y_true=val_labels, y_pred=prediction_list)

# Read the four cells of the binary matrix, treating label 1 as the positive class.
TP, TN, FP, FN = cm[1, 1], cm[0, 0], cm[0, 1], cm[1, 0]

print(
    f"True Positives (TP): {TP}",
    f"True Negatives (TN): {TN}",
    f"False Positives (FP): {FP}",
    f"False Negatives (FN): {FN}",
    sep="\n",
)


True Positives (TP): 287
True Negatives (TN): 1700
False Positives (FP): 64
False Negatives (FN): 597


In [9]:
# Predict labels for the external test corpus.
# NOTE: this rebinds `prediction_list`, so the validation predictions computed earlier are
# no longer available under that name after this cell runs.
prediction_list = model.predict(test_dataset.encodings)


In [10]:
# Number of test examples (TweetDataset.__len__ reports the length of its labels)
len(test_dataset)

24783

In [11]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the external test corpus:
# `test_labels` are the true labels, `prediction_list` now holds the test-set predictions.
# NOTE(review): assumes the 0/1 label encoding of the test corpus means the same as in the
# training corpus — confirm against the dataset preparation.
report = classification_report(y_true=test_labels, y_pred=prediction_list)

print(report)

              precision    recall  f1-score   support

           0       0.41      0.87      0.56      4163
           1       0.97      0.75      0.84     20620

    accuracy                           0.77     24783
   macro avg       0.69      0.81      0.70     24783
weighted avg       0.87      0.77      0.79     24783



In [12]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the external test corpus (rows: true label, columns: predicted label),
# computed from `test_labels` and the test-set predictions in `prediction_list`.
cm = confusion_matrix(y_true=test_labels, y_pred=prediction_list)

# Read the four cells of the binary matrix, treating label 1 as the positive class.
TP, TN, FP, FN = cm[1, 1], cm[0, 0], cm[0, 1], cm[1, 0]

print(
    f"True Positives (TP): {TP}",
    f"True Negatives (TN): {TN}",
    f"False Positives (FP): {FP}",
    f"False Negatives (FN): {FN}",
    sep="\n",
)


True Positives (TP): 15410
True Negatives (TN): 3614
False Positives (FP): 549
False Negatives (FN): 5210


In [13]:
# import pandas as pd

# # Create a DataFrame with the validation texts and labels
# validation_df = pd.DataFrame({'text': val_texts, 'label': val_labels})

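# # Add the prediction list as a new column to the DataFrame
# # NOTE: `prediction_list` must hold the *validation* predictions at this point; In [9]
# # rebinds it to the test-set predictions, whose length does not match `val_texts`.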
# validation_df['prediction'] = prediction_list

# # Save the DataFrame as a CSV file
# validation_df.to_csv('OLID_validation_with_predictions.csv', index=False)
