In [24]:
import pandas as pd 
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [25]:
def print_metrics(y_pred2, y_dev):
    """Print accuracy, F1, AUROC, recall and precision for a pair of label vectors.

    Both arguments are forwarded positionally to each scikit-learn scorer,
    ``y_pred2`` first and ``y_dev`` second. Nothing is returned; one line per
    metric is written to stdout.

    NOTE(review): scikit-learn scorers take ``(y_true, y_pred)``, so despite
    its name ``y_pred2`` fills the ground-truth slot and ``y_dev`` fills the
    prediction slot. Callers should pass the true labels first.

    NOTE(review): ``roc_auc_score`` is given whatever is passed as ``y_dev``;
    if that is hard class labels rather than scores/probabilities, the
    reported AUROC is computed from a single operating point.
    """
    # (label shown in the output, scorer) pairs, in the order they are printed.
    scorers = (
        ("accuracy", metrics.accuracy_score),
        ("f1 score", metrics.f1_score),
        ("AUROC", metrics.roc_auc_score),
        ("Recall", metrics.recall_score),
        ("Precision", metrics.precision_score),
    )
    for caption, scorer in scorers:
        print(f"{caption}: {scorer(y_pred2, y_dev)}")
    
# Load the catalog DataFrame from a pickle file in the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# catalog.pkl that comes from a trusted source.
df= pd.read_pickle("./catalog.pkl")

In [26]:
def _rows_tagged(tag):
    """Return the rows of ``df`` whose ``sets`` entry contains ``tag``.

    The ``file`` and ``sets`` columns are dropped from the result.
    NOTE(review): ``tag in val`` is a substring test if ``sets`` holds strings
    and a membership test if it holds lists/sets; confirm which applies.
    """
    mask = [tag in val for val in df["sets"]]
    return df[mask].drop(["file", "sets"], axis=1)


def _features_and_label(frame):
    """Split ``frame`` into (all columns except ``label``, the ``label`` column)."""
    return frame.drop(["label"], axis=1), frame["label"]


# One frame per split tag found in the ``sets`` column.
df_train = _rows_tagged("train")
df_test = _rows_tagged("test_seen")
df_val = _rows_tagged("dev_seen")
df_test2 = _rows_tagged("test_unseen")
df_val2 = _rows_tagged("dev_unseen")

# Feature matrix / label vector for every evaluation split
# (no suffix = "seen" splits, suffix 2 = "unseen" splits).
X_test, y_test = _features_and_label(df_test)
X_val, y_val = _features_and_label(df_val)
X_test2, y_test2 = _features_and_label(df_test2)
X_val2, y_val2 = _features_and_label(df_val2)

In [27]:
# Split the training frame by class so that each class can be resampled separately.
class_0_data = df_train[df_train['label'] == 0]
class_1_data = df_train[df_train['label'] == 1]

# Derive the sample sizes from the data instead of hard-coding row counts, so
# the balanced sets stay balanced if the catalog changes.
n_class_0, n_class_1 = len(class_0_data), len(class_1_data)
if n_class_1 > n_class_0:
    raise ValueError(
        "Resampling below assumes label 0 is the majority class, "
        f"got {n_class_0} label-0 rows and {n_class_1} label-1 rows"
    )
n_extra_class_1 = n_class_0 - n_class_1  # rows needed to bring class 1 up to class 0

# Random oversampling: append duplicated class-1 rows until both classes have
# the same size. Sampling is without replacement unless the gap is larger than
# class 1 itself, in which case replacement is required.
oversampled_class_1 = class_1_data.sample(
    n=n_extra_class_1,
    replace=n_extra_class_1 > n_class_1,
    random_state=42,
)
oversampled_df = pd.concat([class_0_data, oversampled_class_1, class_1_data])
# Shuffle so the classes are not grouped in blocks.
oversampled_df = oversampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Random undersampling: keep only as many class-0 rows as there are class-1 rows.
undersampled_class_0 = class_0_data.sample(n=n_class_1, random_state=42)
undersampled_df = pd.concat([undersampled_class_0, class_1_data])
undersampled_df = undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# SMOTE: synthesise new minority-class rows from the original training set.
# NOTE(review): the seed here is 43 while the other samplers use 42; confirm this is intentional.
smote = SMOTE(random_state=43)
X_train, y_train = df_train.drop(["label"], axis=1), df_train["label"]
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_oversampled, y_train_oversampled = oversampled_df.drop(["label"], axis=1), oversampled_df["label"]
X_train_undersampled, y_train_undersampled = undersampled_df.drop(["label"], axis=1), undersampled_df["label"]

In [28]:
# Training-set variants, index-aligned: features, labels, and the banner printed for each.
X_set = [X_train, X_train_undersampled, X_train_oversampled, X_train_smote]
y_set = [y_train, y_train_undersampled, y_train_oversampled, y_train_smote]
strv = ["ORIGINAL", "UNDERSAMPLED", "OVERSAMPLED", "SMOTE"]

# Held-out splits in reporting order: (heading to print, features, true labels).
eval_splits = [
    ("Dev Seen: ", X_val, y_val),
    ("\nTest Seen: ", X_test, y_test),
    ("\nDev Unseen: ", X_val2, y_val2),
    ("\nTest Unseen: ", X_test2, y_test2),
]

# Fit a fresh random forest on every training variant and report on all four splits.
for banner, X_fit, y_fit in zip(strv, X_set, y_set):
    print(banner)
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_fit, y_fit)

    for heading, X_eval, y_eval in eval_splits:
        print(heading)
        # True labels go first, predictions second.
        print_metrics(y_eval, rf_classifier.predict(X_eval))
    print("\n\n")

ORIGINAL
Dev Seen: 
accuracy: 0.506
f1 score: 0.2108626198083067
AUROC: 0.5015842281288505
Recall: 0.13360323886639677
Precision: 0.5

Test Seen: 
accuracy: 0.5
f1 score: 0.21630094043887146
AUROC: 0.49295718287314927
Recall: 0.14081632653061224
Precision: 0.46621621621621623

Dev Unseen: 
accuracy: 0.5944444444444444
f1 score: 0.1978021978021978
AUROC: 0.4998529411764706
Recall: 0.135
Precision: 0.3698630136986301

Test Unseen: 
accuracy: 0.5905
f1 score: 0.2222222222222222
AUROC: 0.5035999999999999
Recall: 0.156
Precision: 0.38613861386138615



UNDERSAMPLED
Dev Seen: 
accuracy: 0.48
f1 score: 0.49612403100775193
AUROC: 0.4804531852586772
Recall: 0.5182186234817814
Precision: 0.4758364312267658

Test Seen: 
accuracy: 0.475
f1 score: 0.4956772334293948
AUROC: 0.47601040416166474
Recall: 0.5265306122448979
Precision: 0.46823956442831216

Dev Unseen: 
accuracy: 0.47962962962962963
f1 score: 0.42061855670103093
AUROC: 0.4858823529411765
Recall: 0.51
Precision: 0.35789473684210527

Test U

In [29]:
# Training-set variants, index-aligned: features, labels, and the banner printed for each.
X_set = [X_train, X_train_undersampled, X_train_oversampled, X_train_smote]
y_set = [y_train, y_train_undersampled, y_train_oversampled, y_train_smote]
strv = ["ORIGINAL", "UNDERSAMPLED", "OVERSAMPLED", "SMOTE"]

# Held-out splits in reporting order: (heading to print, features, true labels).
eval_splits = [
    ("Dev Seen: ", X_val, y_val),
    ("\nTest Seen: ", X_test, y_test),
    ("\nDev Unseen: ", X_val2, y_val2),
    ("\nTest Unseen: ", X_test2, y_test2),
]

# Fit a default-parameter SVC on every training variant and report on all four splits.
for banner, X_fit, y_fit in zip(strv, X_set, y_set):
    print(banner)
    svm_classifier = SVC()
    svm_classifier.fit(X_fit, y_fit)

    for heading, X_eval, y_eval in eval_splits:
        print(heading)
        # True labels go first, predictions second.
        print_metrics(y_eval, svm_classifier.predict(X_eval))
    print("\n\n")

ORIGINAL
Dev Seen: 
accuracy: 0.51
f1 score: 0.02390438247011952
AUROC: 0.5040965899089469
Recall: 0.012145748987854251
Precision: 0.75

Test Seen: 
accuracy: 0.509
f1 score: 0.00808080808080808
AUROC: 0.4990996398559424
Recall: 0.004081632653061225
Precision: 0.4

Dev Unseen: 
accuracy: 0.6277777777777778
f1 score: 0.028985507246376812
AUROC: 0.5016176470588235
Recall: 0.015
Precision: 0.42857142857142855

Test Unseen: 
accuracy: 0.6255
f1 score: 0.007947019867549669
AUROC: 0.5012
Recall: 0.004
Precision: 0.6



UNDERSAMPLED
Dev Seen: 
accuracy: 0.512
f1 score: 0.47639484978540775
AUROC: 0.5112576210974381
Recall: 0.4493927125506073
Precision: 0.5068493150684932

Test Seen: 
accuracy: 0.549
f1 score: 0.5247629083245522
AUROC: 0.5481992797118848
Recall: 0.5081632653061224
Precision: 0.5424836601307189

Dev Unseen: 
accuracy: 0.5148148148148148
f1 score: 0.4099099099099099
AUROC: 0.5025000000000001
Recall: 0.455
Precision: 0.3729508196721312

Test Unseen: 
accuracy: 0.5615
f1 score: 0.4

In [30]:
# Training-set variants, index-aligned: features, labels, and the banner printed for each.
X_set = [X_train, X_train_undersampled, X_train_oversampled, X_train_smote]
y_set = [y_train, y_train_undersampled, y_train_oversampled, y_train_smote]
strv = ["ORIGINAL", "UNDERSAMPLED", "OVERSAMPLED", "SMOTE"]

# Held-out splits in reporting order: (heading to print, features, true labels).
eval_splits = [
    ("Dev Seen: ", X_val, y_val),
    ("\nTest Seen: ", X_test, y_test),
    ("\nDev Unseen: ", X_val2, y_val2),
    ("\nTest Unseen: ", X_test2, y_test2),
]

# Fit a logistic regression (iteration cap raised to 1000) on every training
# variant and report on all four splits.
for banner, X_fit, y_fit in zip(strv, X_set, y_set):
    print(banner)
    lr_classifier = LogisticRegression(max_iter=1000)
    lr_classifier.fit(X_fit, y_fit)

    for heading, X_eval, y_eval in eval_splits:
        print(heading)
        # True labels go first, predictions second.
        print_metrics(y_eval, lr_classifier.predict(X_eval))
    print("\n\n")

ORIGINAL
Dev Seen: 
accuracy: 0.508
f1 score: 0.0390625
AUROC: 0.5022163191499576
Recall: 0.020242914979757085
Precision: 0.5555555555555556

Test Seen: 
accuracy: 0.514
f1 score: 0.03571428571428571
AUROC: 0.504281712685074
Recall: 0.018367346938775512
Precision: 0.6428571428571429

Dev Unseen: 
accuracy: 0.6296296296296297
f1 score: 0.047619047619047616
AUROC: 0.5051470588235295
Recall: 0.025
Precision: 0.5

Test Unseen: 
accuracy: 0.6235
f1 score: 0.03088803088803089
AUROC: 0.502
Recall: 0.016
Precision: 0.4444444444444444



UNDERSAMPLED
Dev Seen: 
accuracy: 0.5
f1 score: 0.5454545454545454
AUROC: 0.50127218319438
Recall: 0.6072874493927125
Precision: 0.49504950495049505

Test Seen: 
accuracy: 0.512
f1 score: 0.5555555555555556
AUROC: 0.5141656662665066
Recall: 0.6224489795918368
Precision: 0.5016447368421053

Dev Unseen: 
accuracy: 0.4685185185185185
f1 score: 0.4595103578154426
AUROC: 0.49764705882352933
Recall: 0.61
Precision: 0.3685800604229607

Test Unseen: 
accuracy: 0.491
f1

## Conclusion:

1. Precision = True Positives / (True Positives + False Positives). A fairly high precision therefore implies relatively few false positives: the model is less likely to flag something non-offensive (class 0) as offensive (class 1).
2. Recall = True Positives / (True Positives + False Negatives). A fairly low recall therefore means a larger number of false negatives, i.e. memes that are offensive in reality but that the model predicts as non-offensive.
3. In summary, when my model predicts something to be offensive, it is correct relatively often, but the model also misses a lot of the offensive memes. This is consistent with our hypothesis that using only text or only image is not going to be helpful, because we lose out on critical information. When we classify based on an image catalogue, we end up ignoring half the information: the caption (and in many cases, captions do change the meaning). At the same time, we are also ignoring some information from the image itself, as we have identified what is in the image, but not what the subjects are doing, i.e. the actions.
4. However, when we apply dataset balancing techniques such as SMOTE, undersampling and oversampling, performance improves significantly in terms of recall and F1 score. Conventionally used classification models such as SVMs, logistic regression and random forests are sensitive to unbalanced data.
5. Despite this, our model still does not perform very well and remains below the baselines and human performance. This happens because we are ignoring the text component of our memes completely.