In [1]:
import fasttext
import fasttext.util
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Fetch the pretrained English FastText vectors (an existing local copy is kept), then load them
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# Training corpus: read the OLID TSV and shuffle its rows with a fixed seed
df = pd.read_csv('datasets/cleaned_OLID.tsv', sep="\t").sample(frac=1, random_state=42)

# Evaluation sets. df_test also carries a `prediction` column
# (presumably written by an earlier model run -- confirm against the producer of that file)
df_test = pd.read_csv('outputs/wr-SOLID_BERT_colearning_4-SOLID_test.tsv', sep="\t")
df_solid_test = pd.read_csv('datasets/cleaned_SOLIDtest6K_trainer.tsv', sep='\t')

# Pull the text / label columns out as arrays
tweets, labels = df['tweet'].values, df['label'].values
test_tweets, test_labels, test_prev_labels = (
    df_test[column].values for column in ('tweet', 'label', 'prediction')
)
solid_test_tweets, solid_test_labels = (
    df_solid_test['tweet'].values,
    df_solid_test['label'].values,
)

# Hold out 20% of the shuffled training corpus for validation (seeded split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweets, labels, test_size=0.2, random_state=42
)

# Embed every text (train, validation and both test sets) as the average of its FastText word vectors
def embed_text(text, model):
    """Embed a text as the mean of the word vectors of its whitespace-separated tokens.

    Args:
        text: The raw text. A non-string value (for example the float NaN that
            pandas yields for an empty cell) is treated as a text with no tokens.
        model: Object exposing ``get_word_vector(word)`` and ``get_dimension()``,
            such as a loaded FastText model.

    Returns:
        A 1-D numpy array of length ``model.get_dimension()``. When the text has
        no tokens the result is all zeros, so every call yields the same shape
        and the per-text vectors can always be stacked into one 2-D array.
    """
    words = text.split() if isinstance(text, str) else []
    if not words:
        # np.mean over an empty list would return a scalar NaN (plus a warning)
        # and make the stacked encodings ragged; use a neutral vector instead.
        return np.zeros(model.get_dimension(), dtype=np.float32)
    word_vectors = [model.get_word_vector(word) for word in words]
    return np.mean(word_vectors, axis=0)  # Average word vectors

# Embed all four corpora with the same averaging routine; each result stacks one vector per text
train_encodings, val_encodings, test_encodings, solid_test_encodings = (
    np.array([embed_text(text, ft) for text in corpus])
    for corpus in (train_texts, val_texts, test_tweets, solid_test_tweets)
)

# Number of words known to the model (kept for reference; unused in the cells shown here)
vocab_size = len(ft.words)

# Drop the reference to the FastText model now that every text has been embedded
del ft



In [2]:
from sklearn.ensemble import RandomForestClassifier 

# Random forest over the averaged embeddings: 100 trees, seeded so reruns build the same forest
forest_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_params)

# Left as the last expression so the notebook shows the fitted estimator
clf.fit(train_encodings, train_labels)

In [3]:
from sklearn.metrics import accuracy_score
# Predict the held-out validation split and score it (the value is stored, not displayed)
prediction_list = clf.predict(val_encodings)
accuracy = accuracy_score(y_true=val_labels, y_pred=prediction_list)

In [4]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the validation split
# (true labels: `val_labels`, predictions: `prediction_list`)
report = classification_report(y_true=val_labels, y_pred=prediction_list)

print(report)

              precision    recall  f1-score   support

           0       0.71      0.97      0.82      1764
           1       0.77      0.20      0.32       884

    accuracy                           0.71      2648
   macro avg       0.74      0.59      0.57      2648
weighted avg       0.73      0.71      0.65      2648



In [5]:
from sklearn.metrics import confusion_matrix

# Confusion counts for the validation split (`val_labels` vs `prediction_list`).
# labels=[0, 1] pins the matrix to 2x2 even if one class never occurs in either
# array, so the unpacking below cannot fail on a degenerate split.
cm = confusion_matrix(val_labels, prediction_list, labels=[0, 1])

# Rows are true classes and columns are predicted classes, so the flattened
# 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")


True Positives (TP): 177
True Negatives (TN): 1711
False Positives (FP): 53
False Negatives (FN): 707


In [6]:
# Evaluate the fitted forest on the SOLID test set.
# NOTE: this reuses (overwrites) `prediction_list` and `accuracy` from the
# validation cells, so re-run those cells before re-running validation metrics.
prediction_list = clf.predict(solid_test_encodings)

accuracy = accuracy_score(solid_test_labels, prediction_list)

report = classification_report(solid_test_labels, prediction_list)

print(report)

# Confusion counts for `solid_test_labels` vs `prediction_list`.
# labels=[0, 1] pins the matrix to 2x2 even if one class never occurs,
# so the unpacking below cannot fail.
cm = confusion_matrix(solid_test_labels, prediction_list, labels=[0, 1])

# Rows are true classes and columns are predicted classes, so the flattened
# 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       0.64      0.97      0.77      2991
           1       0.93      0.46      0.62      3002

    accuracy                           0.71      5993
   macro avg       0.79      0.71      0.69      5993
weighted avg       0.79      0.71      0.69      5993

True Positives (TP): 1385
True Negatives (TN): 2887
False Positives (FP): 104
False Negatives (FN): 1617


In [7]:
# Random-forest predictions for the test file that also stores another model's predictions
prediction_test = clf.predict(test_encodings)

# Reports are printed for the stored predictions first, then for the random forest
report_test_prev = classification_report(test_labels, test_prev_labels)

print(report_test_prev)

report_test = classification_report(test_labels, prediction_test)

print(report_test)

# NOTE(review): in the recorded run the stored predictions score 0 on every
# metric. That would be expected if this file only holds rows the earlier model
# misclassified (the `wr-` file-name prefix?) -- confirm before comparing models.

# Confusion counts are printed for the random forest first and for the stored
# predictions second, i.e. in the reverse order of the two reports above.
# labels=[0, 1] pins each matrix to 2x2 so the unpacking cannot fail when a
# class is absent. After the loop, cm/TN/FP/FN/TP hold the stored-prediction
# values.
for predicted in (prediction_test, test_prev_labels):
    cm = confusion_matrix(test_labels, predicted, labels=[0, 1])

    # Rows are true classes and columns are predicted classes, so the
    # flattened 2x2 matrix reads TN, FP, FN, TP.
    TN, FP, FN, TP = cm.ravel()

    print(f"True Positives (TP): {TP}")
    print(f"True Negatives (TN): {TN}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     276.0
           1       0.00      0.00      0.00     202.0

    accuracy                           0.00     478.0
   macro avg       0.00      0.00      0.00     478.0
weighted avg       0.00      0.00      0.00     478.0

              precision    recall  f1-score   support

           0       0.52      0.71      0.60       276
           1       0.22      0.11      0.15       202

    accuracy                           0.46       478
   macro avg       0.37      0.41      0.38       478
weighted avg       0.40      0.46      0.41       478

True Positives (TP): 23
True Negatives (TN): 196
False Positives (FP): 80
False Negatives (FN): 179
True Positives (TP): 0
True Negatives (TN): 0
False Positives (FP): 276
False Negatives (FN): 202
