In [1]:
import spacy
import numpy as np
from spacytextblob.spacytextblob import SpacyTextBlob
nlp = spacy.load("en_core_web_sm") 
nlp.add_pipe('spacytextblob')
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.svm import SVC 
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression

In [2]:
STOP_WORDS = {
    'i', 'me', 'my', 'myself', 'it', 'its', 'itself', 'what', 'which', 'who', 'whom', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only',
    'own', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
}

hate_speech_words = [
    'nigger', 'nigga', 'coon', 'spic', 'chink', 'kike', 'gook', 'wop', 'towelhead', 'slope',
    'kafir', 'infidel', 'heathen', 'idolater', 'heretic', 'blasphemer',
    'bitch', 'slut', 'whore', 'cunt', 'feminazi', 'tranny', 'dyke', 'faggot', 'fag', 'pussy', 'mansplainer',
    'faggot', 'fag', 'dyke', 'homo', 'queer', 'sissy', 'tranny', 'butch',
    'tranny', 'shemale', 'it', 'transvestite', 'gender bender', 'freak',
    'retard', 'spaz', 'cripple', 'invalid', 'lame', 'imbecile', 'moron', 'idiot',
    'fucktard', 'asshat', 'retard', 'douchebag', 'assclown', 'twat', 'twatwaffle', 'dickwad'
]

swear_words = [
    'fuck', 'fucking', 'fucked', 'fuckwit', 'fucktard', 'fuckface', 'fuckhead', 'motherfucker', 'shit', 'shitty', 'shite', 'shitting', 
    'shithead', 'shitstorm', 'ass', 'asshole', 'asshat', 'asswipe', 'dumbass', 'bitch', 'bitchy', 'son of a bitch',
    'bastard', 'damn', 'damned', 'dick', 'dickhead', 'dickwad', 'pussy', 'pussies', 'cock', 'cocksucker', 'cockhead',
    'cunt', 'cunty', 'cuntface', 'twat', 'twatwaffle', 'wanker', 'wank', 'wankstain', 'bollocks', 'douchebag', 'douche', 'douchecanoe',
    'arse', 'arsehole', 'arsehat', 'arsewipe', 'shitfuck', 'fuckshit', 'clusterfuck', 'fucknugget', 'fuckery',
    'cockwomble', 'cockgobbler', 'cockjockey', 'dickcheese', 'dicknose', 'shitlord', 'assclown', 'assjacket'
]



In [3]:
def preprocessing(sentences):
      rets=[]
      polarity=[]
      subjectivity=[]
      hate_count=[]
      for sent in sentences:
            sent = sent.replace("n't", " not")
            sent = sent.lower().strip()
            temp_arr= sent.split(" ")
            count=0
            for word in temp_arr:
                  if(word in hate_speech_words or word in swear_words):
                        count+=1
            hate_count.append(count/len(temp_arr))
            doc = nlp(sent)
            arr = [token.lemma_ for token in doc if (token.lemma_.lower() not in STOP_WORDS) and not token.is_punct and not token.like_num 
                  and not token.is_currency and not token.is_digit]
            arr= " ".join(arr)
            rets.append(arr)
            polarity.append(doc._.blob.polarity)
            subjectivity.append(doc._.blob.subjectivity)
      return rets, polarity, subjectivity, hate_count

In [4]:
#using caption generating model- BLIP, and using the provided jsonl files, the following table has been created
#it contains caption of image, description of the image generated by BLIP
caption_description_df= pd.read_pickle("../commons/caption_description.pkl")
caption_description_df.head(5)

Unnamed: 0,label,text,file,description,sets
0,0,berserk 2016 is a good adaptation you're kidd...,71094.png,woman with a monkey mask and a fake monkey,[test_unseen]
1,0,my life goal? make somebody this fucking trig...,91724.png,woman holding a cigarette in her hand,[train]
2,0,""" i don't wanna, just get it, get it, get it, ...",64280.png,man wearing a hat and a tie,[train]
3,0,"""1st day of 4th grade sandy hook elementary sc...",67082.png,group of children standing in front of a schoo...,"[dev_seen, dev_unseen]"
4,0,"""a blow job a day will keep his side chicks aw...",46380.png,woman with a black top and a blue background,[train]


In [5]:
# #Uncomment if necessary

# caption_df= caption_description_df[["label", "text", "file", "sets"]]
# rets, polarity, subjectivity, hate_count= preprocessing(caption_df["text"])
# caption_df["preprocessed text"]=rets
# caption_df["polarity"]=polarity
# caption_df["subjectivity"]=subjectivity
# caption_df["hate count"]= hate_count
# pd.to_pickle(caption_df, "../commons/caption_preprocessed.pkl")

caption_df= pd.read_pickle("../commons/caption_preprocessed.pkl")
caption_df.head()

Unnamed: 0,label,text,file,sets,preprocessed text,polarity,subjectivity,hate count
0,0,berserk 2016 is a good adaptation you're kidd...,71094.png,[test_unseen],berserk good adaptation you kid right,0.492857,0.567857,0.0
1,0,my life goal? make somebody this fucking trig...,91724.png,[train],life goal make somebody this fucking trigger,-0.6,0.8,0.125
2,0,""" i don't wanna, just get it, get it, get it, ...",64280.png,[train],not wanna get get get get that shit hard chanc...,-0.163889,0.480556,0.08
3,0,"""1st day of 4th grade sandy hook elementary sc...",67082.png,"[dev_seen, dev_unseen]",day grade sandy hook elementary school,0.3,0.9,0.0
4,0,"""a blow job a day will keep his side chicks aw...",46380.png,[train],blow job day keep his side chick away -sasha grey,-0.05,0.1,0.0


In [6]:
train_df= caption_df[["train" in val for val in caption_df["sets"]]]
dev_seen_df= caption_df[["dev_seen" in val for val in caption_df["sets"]]]
test_seen_df= caption_df[["test_seen" in val for val in caption_df["sets"]]]

In [7]:
# Create a Vectorizer Object
def vectorizer(dataframe, column, vocab=None):
    document= list(dataframe[column])
    if(vocab==None):
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(vocabulary=vocab)
    vectorizer.fit(document)
    vocab= vectorizer.vocabulary_
    vector = vectorizer.transform(document)
    return vector, vocab

def make_numpy_matrix(vector, dataframe):
    vectorised= vector.toarray()
    tempdf=[]
    for i in range(len(dataframe)):
        temp= np.append(vectorised[i], [dataframe.iloc[i]["polarity"], dataframe.iloc[i]["subjectivity"], dataframe.iloc[i]["hate count"], 
                                        dataframe.iloc[i]["label"]])
        tempdf.append(temp)
    tempdf= np.array(tempdf)
    return tempdf[:,:-1], tempdf[:,-1]

def print_metrics(y_pred2, y_dev):
    print(f"accuracy: {metrics.accuracy_score(y_pred2, y_dev)}")
    print(f"f1 score: {metrics.f1_score(y_pred2, y_dev)}")
    print(f"AUROC: {metrics.roc_auc_score(y_pred2, y_dev)}")
    print(f"Recall: {metrics.recall_score(y_pred2, y_dev)}") 
    print(f"Precision: {metrics.precision_score(y_pred2, y_dev)}")

In [8]:
vec, voc = vectorizer(train_df, "text")
vec_dev, _ = vectorizer(dev_seen_df, "text", voc)
vec_test, _ = vectorizer(test_seen_df, "text", voc)

X_train, y_train = make_numpy_matrix(vec, train_df)
X_test, y_test = make_numpy_matrix(vec_test, test_seen_df)
X_dev, y_dev = make_numpy_matrix(vec_dev, dev_seen_df)

All the data below is on the seen test set alone:

| Model               | accuracy_score | f1_score | roc_auc_score | recall_score | precision_score |
|---------------------|----------------|----------|---------------|--------------|-----------------|
| SVM rbf             | 0.549          | 0.259    | 0.598         | 0.664        | 0.161           |
| SVM linear          | 0.560          | 0.382    | 0.579         | 0.613        | 0.278           |
| SVM poly deg 5      | 0.513          | 0.072    | 0.527         | 0.543        | 0.039           |
| Random Forest n=1000| 0.557          | 0.287    | 0.609         | 0.679        | 0.182           |
| Complement Naive Bayes |       0.582 | 0.517    |  0.585        | 0.596        |            0.457|


No matter if we choose less of features or change hyperparameters, it is still seen that model metrics mainly precision, does not improve, indicating something is not correct with underlying data itself. To be noted though, that we have tried out SVM (and Random Forests) so far, which tends to perform poorly on imbalanced datasets. 
We also observe, that Complement Naive Bayes tends to give the most balanced results in terms of precision and recall, both being close to 0.5 because complement Naive Bayes is designed to deal with imbalanced datasets. This might be an indication that other models were performing poorly because the imbalance in the dataset- 35% being offensive. We shall now try to perform oversampling and undersampling to see if performance can be improved. 

In [9]:
#Note that here we only take the words into account because by definiton, naive bayes models are designed to consider word collections. 
# Having additional features would not logically make sense. 
mnb = ComplementNB()
mnb.fit(X_train[:,:-3], y_train)
print("Test Seen")
y_pred= mnb.predict(X_test[:,:-3])
print_metrics(y_pred, y_test)

print("\nDev Seen")
y_pred2= mnb.predict(X_dev[:,:-3])
print_metrics(y_pred2, y_dev)

Test Seen
accuracy: 0.582
f1 score: 0.5173210161662818
AUROC: 0.5847313147845062
Recall: 0.5957446808510638
Precision: 0.45714285714285713

Dev Seen
accuracy: 0.56
f1 score: 0.5175438596491229
AUROC: 0.5606471661816209
Recall: 0.5645933014354066
Precision: 0.4777327935222672


In [10]:
#We see a very low value of precision in an unbalanced dataset
mnb = MultinomialNB()
mnb.fit(X_train[:,:-3], y_train)
print("Test Seen")
y_pred= mnb.predict(X_test[:,:-3])
print_metrics(y_pred, y_test)

print("\nDev Seen")
y_pred2= mnb.predict(X_dev[:,:-3])
print_metrics(y_pred2, y_dev)

Test Seen
accuracy: 0.578
f1 score: 0.44327176781002636
AUROC: 0.5934874806296387
Recall: 0.6268656716417911
Precision: 0.34285714285714286

Dev Seen
accuracy: 0.564
f1 score: 0.4381443298969072
AUROC: 0.5757916987692367
Recall: 0.6028368794326241
Precision: 0.3441295546558704


The following three cells are commented because they take around 10 minutes to run each, uncomment if necessary, but the findings have been reported above

In [11]:
# svm_classifier = SVC()
# svm_classifier.fit(X_train, y_train)
# y_pred= svm_classifier.predict(X_test)
# print_metrics(y_pred, y_test)

In [12]:
# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# importances = rf.feature_importances_
# importances
# y_pred= rf.predict(X_test)
# print_metrics(y_pred, y_test)

In [13]:
# selfr=2000
# indices = np.argsort(importances)[::-1]
# selected_features = X_train[:, indices[:selfr]]
# rf2 = RandomForestClassifier(n_estimators=1000)
# rf2.fit(selected_features, y_train)
# temp2= X_test[:, indices[:selfr]]
# y_pred2= rf2.predict(temp2)
# print_metrics(y_pred2, y_test)

We shall now undersample the majority class, ie the non hateful memes, and then check our results

In [14]:
class_0_data = train_df[train_df['label'] == 0]
class_1_data = train_df[train_df['label'] == 1]
undersampled_class_0 = class_0_data.sample(n=3019, random_state=42) #there are 3019 label 1 rows
balanced_df = pd.concat([undersampled_class_0, class_1_data])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
vec2, voc2 = vectorizer(balanced_df, "text")
vec_dev2, _ = vectorizer(dev_seen_df, "text", voc2)
vec_test2, _ = vectorizer(test_seen_df, "text", voc2)

X_train2, y_train2 = make_numpy_matrix(vec2, balanced_df)
X_test2, y_test2 = make_numpy_matrix(vec_test2, test_seen_df)
X_dev2, y_dev2 = make_numpy_matrix(vec_dev2, dev_seen_df)

In [16]:
#We dont see a very significant difference from before because Complement Naive Bayes has been designed for imbalanced datasets
mnb2 = ComplementNB()
mnb2.fit(X_train2[:,:-3], y_train2)
print("Test Seen")
y_pred2= mnb2.predict(X_test2[:,:-3])
print_metrics(y_pred2, y_test2)

print("\nDev Seen")
y_pred_t= mnb2.predict(X_dev2[:,:-3])
print_metrics(y_pred_t, y_dev2)

Test Seen
accuracy: 0.58
f1 score: 0.573170731707317
AUROC: 0.5798915043766302
Recall: 0.5708502024291497
Precision: 0.5755102040816327

Dev Seen
accuracy: 0.584
f1 score: 0.5953307392996109
AUROC: 0.5848001157351593
Recall: 0.5730337078651685
Precision: 0.6194331983805668


In [17]:
#As opposed to earlier though, we see a great improvement in precision
mnb2 = MultinomialNB()
mnb2.fit(X_train2[:,:-3], y_train2)
print("Test Seen")
y_pred2= mnb2.predict(X_test2[:,:-3])
print_metrics(y_pred2, y_test2)

print("\nDev Seen")
y_pred_t= mnb2.predict(X_dev2[:,:-3])
print_metrics(y_pred_t, y_dev2)

Test Seen
accuracy: 0.58
f1 score: 0.573170731707317
AUROC: 0.5798915043766302
Recall: 0.5708502024291497
Precision: 0.5755102040816327

Dev Seen
accuracy: 0.584
f1 score: 0.5953307392996109
AUROC: 0.5848001157351593
Recall: 0.5730337078651685
Precision: 0.6194331983805668


In [18]:
#Although the precision is still low, we see a large improvement from previous case. We however know that SVMs do not perfrom
# well on larger datasets, and our feature space is particularly large
svm_classifier2 = SVC()
svm_classifier2.fit(X_train2, y_train2)
y_pred2= svm_classifier2.predict(X_test2)
print_metrics(y_pred2, y_test2)

accuracy: 0.58
f1 score: 0.44297082228116713
AUROC: 0.5968585309617919
Recall: 0.6325757575757576
Precision: 0.3408163265306122


In [19]:
lrcf = LogisticRegression()
lrcf.fit(X_train2, y_train2)
print("Test Seen")
y_pred2= lrcf.predict(X_test2)
print_metrics(y_pred2, y_test2)

print("\nDev Seen")
y_pred2= lrcf.predict(X_dev2)
print_metrics(y_pred2, y_dev2)

Test Seen
accuracy: 0.594
f1 score: 0.5072815533980582
AUROC: 0.601913290536045
Recall: 0.625748502994012
Precision: 0.42653061224489797

Dev Seen
accuracy: 0.586
f1 score: 0.5174825174825175
AUROC: 0.5911085769576336
Recall: 0.6098901098901099
Precision: 0.4493927125506073


In [20]:
#As we can see, when we use the original imbalanced dataset is giving poor results
lrcf = LogisticRegression()
lrcf.fit(X_train, y_train)
print("Test Seen")
y_pred= lrcf.predict(X_test)
print_metrics(y_pred, y_test)

print("\nDev Seen")
y_pred= lrcf.predict(X_dev)
print_metrics(y_pred, y_dev)

Test Seen
accuracy: 0.571
f1 score: 0.362555720653789
AUROC: 0.6081191350469195
Recall: 0.6666666666666666
Precision: 0.24897959183673468

Dev Seen
accuracy: 0.544
f1 score: 0.3522727272727273
AUROC: 0.5610608800482217
Recall: 0.5904761904761905
Precision: 0.25101214574898784


We can now also try oversampling the minority class to see if there are any changes

In [21]:
class_0_data = train_df[train_df['label'] == 0]
class_1_data = train_df[train_df['label'] == 1]
difference= len(class_0_data)-len(class_1_data)
oversampled_class_1 = class_1_data.sample(n=difference, random_state=42) 
balanced_df2 = pd.concat([class_0_data, class_1_data, oversampled_class_1])
balanced_df2 = balanced_df2.sample(frac=1, random_state=42).reset_index(drop=True)

In [22]:
vec3, voc3 = vectorizer(balanced_df2, "text")
vec_dev3, _ = vectorizer(dev_seen_df, "text", voc3)
vec_test3, _ = vectorizer(test_seen_df, "text", voc3)

X_train3, y_train3 = make_numpy_matrix(vec3, balanced_df2)
X_test3, y_test3 = make_numpy_matrix(vec_test3, test_seen_df)
X_dev3, y_dev3 = make_numpy_matrix(vec_dev3, dev_seen_df)

In [23]:
#There is a slight decrease in performance, but that is expected, because we are repeating information, bringing in redundancy
#Yet, even in this case, all the metrics are above 0.5 indicating some information or pattern has been learnt
mnb3 = MultinomialNB()
mnb3.fit(X_train3[:,:-3], y_train3)
print("Test Seen")
y_pred3= mnb3.predict(X_test3[:,:-3])
print_metrics(y_pred3, y_test3)

print("\nDev Seen")
y_pred_t3= mnb3.predict(X_dev3[:,:-3])
print_metrics(y_pred_t3, y_dev3)

Test Seen
accuracy: 0.587
f1 score: 0.5525460455037919
AUROC: 0.5872262342624159
Recall: 0.5889145496535797
Precision: 0.5204081632653061

Dev Seen
accuracy: 0.57
f1 score: 0.5548654244306418
AUROC: 0.5698831535695943
Recall: 0.5677966101694916
Precision: 0.5425101214574899


In [24]:
mnb3 = ComplementNB()
mnb3.fit(X_train3[:,:-3], y_train3)
print("Test Seen")
y_pred3= mnb3.predict(X_test3[:,:-3])
print_metrics(y_pred3, y_test3)

print("\nDev Seen")
y_pred_t3= mnb3.predict(X_dev3[:,:-3])
print_metrics(y_pred_t3, y_dev3)

Test Seen
accuracy: 0.587
f1 score: 0.5525460455037919
AUROC: 0.5872262342624159
Recall: 0.5889145496535797
Precision: 0.5204081632653061

Dev Seen
accuracy: 0.57
f1 score: 0.5548654244306418
AUROC: 0.5698831535695943
Recall: 0.5677966101694916
Precision: 0.5425101214574899


In [25]:
#We clearly see a drop in performance here as well, reinforcing our idea that undersampling is more suited in our problem
clf_lr2 = LogisticRegression()
clf_lr2.fit(X_train3, y_train3)
print("Test Seen")
y_pred3= clf_lr2.predict(X_test3)
print_metrics(y_pred3, y_test3)

print("\nDev Seen")
y_pred_t3= clf_lr2.predict(X_dev3)
print_metrics(y_pred_t3, y_dev3)

### Original

| Model | Dataset | Accuracy | F1 Score | AUROC | Recall | Precision |
|-------|---------|----------|----------|-------|--------|-----------|
| Complement Naive Bayes | Test Seen | 0.582 | 0.517 | 0.585 | 0.596 | 0.457 |
|                        | Dev Seen  | 0.560 | 0.518 | 0.561 | 0.565 | 0.478 |
| Multinomial Naive Bayes | Test Seen | 0.578 | 0.443 | 0.593 | 0.627 | 0.343 |
|                          | Dev Seen  | 0.564 | 0.438 | 0.576 | 0.603 | 0.344 |
| Logistic Regression     | Test Seen | 0.571 | 0.363 | 0.608 | 0.667 | 0.249 |
|                          | Dev Seen  | 0.544 | 0.352 | 0.561 | 0.590 | 0.251 |



### Undersampling 

| Model | Dataset | Accuracy | F1 Score | AUROC | Recall | Precision |
|-------|---------|----------|----------|-------|--------|-----------|
| Complement Naive Bayes | Test Seen | 0.58 | 0.573 | 0.580 | 0.571 | 0.576 |
|                        | Dev Seen  | 0.584 | 0.595 | 0.585 | 0.573 | 0.619 |
| Multinomial Naive Bayes | Test Seen | 0.58 | 0.573 | 0.580 | 0.571 | 0.576 |
|                          | Dev Seen  | 0.584 | 0.595 | 0.585 | 0.573 | 0.619 |
| Logistic Regression     | Test Seen | 0.594 | 0.507 | 0.602 | 0.626 | 0.427 |
|                        | Dev Seen  | 0.586 | 0.517 | 0.591 | 0.610 | 0.449 |


### Oversampling

| Model | Dataset | Accuracy | F1 Score | AUROC | Recall | Precision |
|-------|---------|----------|----------|-------|--------|-----------|
| Complement Naive Bayes | Test Seen | 0.587 | 0.553 | 0.587 | 0.589 | 0.520 |
|                        | Dev Seen  | 0.570 | 0.555 | 0.570 | 0.568 | 0.543 |
| Multinomial Naive Bayes | Test Seen | 0.587 | 0.553 | 0.587 | 0.589 | 0.520 |
|                          | Dev Seen  | 0.570 | 0.555 | 0.570 | 0.568 | 0.543 |
| Logistic Regression     | Test Seen | 0.591 | 0.475 | 0.606 | 0.640 | 0.378 |
|                          | Dev Seen  | 0.564 | 0.468 | 0.570 | 0.589 | 0.389 |


## Conclusion:

1. We see that SVMs do not work as a preliminary model for textual analysis because SVMs perform poorly on large datasets, and additionally, they also do not perform well on imbalanced datasets. We see an improvement in their performance (albeit slightly) with undersampling, yet they do not perfrom sufficiently well so their use was subsequently eliminated

2. We see that there is a significant improvement in the metrics (primarily precision) when we undersample and oversample the dataset as compared to the original dataset. However, we see that undersampling gives better results as opposed to oversampling. This might be because we are repeating the data which results in a repeat of the vocabulary, and as a result, some words that are common to both offensive and non offensive become more likely to be classified as offensive resulting in a greater number of false positives. It is important to note that we are repeating a lot of rows while oversampling due to a huge imbalance in dataset. 

3. Other methods of dataset imbalance removal such as SMOTE are known to not work well for textual data and since our data is primarily from NLP tasks, we do not use SMOTE

4. It is also important to note that the values for undersampling, although better, are still very low compared to human performance as well as baseline and SOTA models. This is because we are solely using textual data for our models, and this completely eliminates the information from the images. The hateful meme classification task is intended to be multimodal in nature and our model does not take into account the image modality.