In [1]:
import pandas as pd
import re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
comments_df = pd.read_csv('edos_labelled_data.csv') 
le = preprocessing.LabelEncoder()
comments_df["label"] = le.fit_transform(comments_df["label"])
comments_df.head()



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/miltonrue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,rewire_id,text,label,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",0,train
1,sexism2022_english-16993,"Then, she's a keeper. 😉",0,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,0,train
3,sexism2022_english-13021,woman?,0,train
4,sexism2022_english-966,I bet she wished she had a gun,0,train


In [2]:
comments_train_df = comments_df[comments_df["split"] == "train"]
comments_test_df = comments_df[comments_df["split"] == "test"]
X_train = comments_train_df["text"]
Y_train = comments_train_df["label"]
X_test = comments_test_df["text"]
Y_test = comments_test_df["label"]
X_train.shape

(4193,)

In [3]:
def clean(comments: list[str]) -> list[str]:
    wln = WordNetLemmatizer()
    # remove unicode
    # remove punct
    comments_clean = [comment.encode("ascii", "ignore").decode() for comment in comments]
    comments_clean = list(map(lambda x : x.lower(), comments_clean))
    comments_clean = [re.sub(r'(#\w+|\[user\]|\[url\])', '', comment) for comment in comments_clean]
    translator = str.maketrans('', '', string.punctuation)
    comments_clean = [comment.translate(translator) for comment in comments_clean]
    #comments_clean = [[wln.lemmatize(word.strip()) for word in comment.split()] for comment in comments_clean]

    return comments_clean

In [4]:
def toWordFreqDF(x, y):
    clean_x = clean(x)
    vectorizer = CountVectorizer()
    vec = vectorizer.fit_transform(clean_x)
    frequency_df = pd.DataFrame(vec.toarray(), columns=vectorizer.get_feature_names_out())
    frequency_df['_label'] = y.tolist()
    frequency_df['_label'].tail()
    return frequency_df
train_freq = toWordFreqDF(X_train, Y_train)
test_freq = toWordFreqDF(X_test, Y_test)


In [5]:
sexist_comment_percent = len(Y_train.loc[Y_train== 1]) / len(Y_train) 
common_freq = train_freq.sum().loc[train_freq.sum() >= 5]
sexist = train_freq[train_freq["_label"] == 1].sum().loc[train_freq.sum() >= 5]
ratio  = sexist/common_freq.sort_values()
bad_words = ratio.loc[ratio > 1 - sexist_comment_percent + .05].index

def test_word(sentence : str):
    sexist = 0
    for word in sentence.split():
        if word in bad_words:
            sexist = 1
            
    return sexist

predict = []
print()
for comment in clean(X_test):
    predict.append(test_word(comment))
print(sum(predict))


149


In [6]:
from sklearn.metrics import f1_score 

mid_words = ratio.loc[
    (1 - sexist_comment_percent + .05 > ratio) 
    & (ratio > 0.6)
].index
print(mid_words)
good_words = ratio.loc[sexist_comment_percent - .05 > ratio].index

def test_word2(sentence : str, badVal, midVal, cutoff):
    sexist = 0
    for word in sentence.split():
        if word in bad_words:
            sexist += badVal
        elif word in mid_words:
            sexist += midVal
    sexist = 1 if sexist >= cutoff else 0
    return sexist

predict2 = []
best_F1 = [0]
def find_best_variables():
    global best_F1
    for bad in range(0, 0): # 40, 5 3 5 F1 = 0.7904021016988467
        for mid in range(0, bad):
            for cut in range(5, 100):
                predict2 = []
                for comment in clean(X_test):
                    predict2.append(test_word2(comment, (bad/10), (mid/10), (cut/10)))
                f1 = f1_score(Y_test, predict2, average = "weighted")
                print(str(bad) + " " + str(mid) + " " + str(cut), end = "\r")
                if(f1 > best_F1[0]):
                    best_F1 = []
                    best_F1.append(f1)
                    best_F1.append([bad, mid, cut])
                    print(str(bad) + " " + str(mid) + " " + str(cut) + " F1 = " + str(f1))
find_best_variables()
print(best_F1)

Index(['17', '40', 'above', 'alpha', 'attention', 'british', 'bs', 'chad',
       'charge', 'cheating', 'club', 'completely', 'confident', 'disgusting',
       'equal', 'exist', 'exposed', 'failure', 'fantasy', 'feminism',
       'finally', 'greatest', 'gross', 'hahaha', 'handle', 'higher',
       'immediately', 'learn', 'lifetime', 'loved', 'low', 'lying', 'market',
       'marriage', 'mode', 'modern', 'motherhood', 'movies', 'normies',
       'obese', 'opposite', 'option', 'patriarchy', 'physically', 'rich',
       'screw', 'simp', 'sluts', 'species', 'stick', 'successful', 'sucked',
       'tits', 'tons', 'tranny', 'treat', 'turns', 'typical', 'unfortunately',
       'validation', 'victims', 'west', 'whenever', 'wing'],
      dtype='object')
[0]


In [7]:
#from sklearn.model_selection import train_test_split
#Our_X_Train, Our_X_Test = train_test_split(X_train, test_size=0.2, shuffle=False)
#Our_Y_Train, Our_Y_Test = train_test_split(Y_train, test_size=0.2, shuffle=False)

In [8]:
def find_best_ranges():
    f1_scores = []
    best_f1 = [0]
    for bad in range(0,15):
        for mid in range(6, int((1 - sexist_comment_percent + bad/100)*20)):
            test_range(bad/100, 0.05 * mid, f1_scores, best_f1)
    return f1_scores

def test_range(bad, mid, f1_scores, best_f1):
    bad_words = ratio.loc[ratio >= 1 - sexist_comment_percent + bad].index
    mid_words = ratio.loc[
    (1 - sexist_comment_percent + bad > ratio) 
    & (ratio > mid)].index
    

    for midVal in range(0,6):
        for cutoff in range(5,31):
            print(str(bad) + " " + str(mid) + " " + str(midVal) + " " + str(cutoff) + " ", end = "\r")
            predictions = []
            for comment in clean(X_train):
                predictions.append(test_word3(comment, midVal, cutoff, bad_words, mid_words))
            f1 = f1_score(Y_train, predictions, average = "weighted")
            f1_scores.append([f1, bad, mid, midVal, cutoff])
            if(f1 > best_f1[0]):
                best_f1[0] = f1
                print(str(f1) + " " + str(bad) + " " + str(mid) + " " + str(midVal) + " " + str(cutoff))
                
    
def test_word3(sentence : str, midVal, cutoff, bad_words, mid_words):
    sexist = 0
    for word in sentence.split():
        if word in bad_words:
            sexist += 10
        elif word in mid_words:
            sexist += midVal
    sexist = 1 if sexist >= cutoff else 0
    return sexist

print(find_best_ranges())

0.7789432354510586 0.0 0.30000000000000004 0 5
0.7829160467761055 0.0 0.35000000000000003 1 9
0.792841730559442 0.0 0.35000000000000003 1 10
0.7975281156078042 0.0 0.35000000000000003 1 11
0.8118695898697101 0.0 0.4 1 5
0.8161997121844654 0.0 0.45 2 5
0.8174426501524505 0.01 0.45 2 5
0.8178587193358414 0.02 0.45 2 5
[[0.7789432354510586, 0.0, 0.30000000000000004, 0, 5], [0.7789432354510586, 0.0, 0.30000000000000004, 0, 6], [0.7789432354510586, 0.0, 0.30000000000000004, 0, 7], [0.7789432354510586, 0.0, 0.30000000000000004, 0, 8], [0.7789432354510586, 0.0, 0.30000000000000004, 0, 9], [0.7789432354510586, 0.0, 0.30000000000000004, 0, 10], [0.6336530975270807, 0.0, 0.30000000000000004, 0, 11], [0.6336530975270807, 0.0, 0.30000000000000004, 0, 12], [0.6336530975270807, 0.0, 0.30000000000000004, 0, 13], [0.6336530975270807, 0.0, 0.30000000000000004, 0, 14], [0.6336530975270807, 0.0, 0.30000000000000004, 0, 15], [0.6336530975270807, 0.0, 0.30000000000000004, 0, 16], [0.6336530975270807, 0.0, 

In [None]:
def find_best_ranges_neg():
    f1_scores = []
    best_f1 = [0]
    for bad in range(1,15):
        for mid in range(6, int((1 - sexist_comment_percent + bad/100)*20)):
            test_range(bad/-100, 0.05 * mid, f1_scores, best_f1)
    return f1_scores

neg_f1_scores = find_best_ranges_neg()
print(neg_f1_scores)

0.7789432354510586 -0.01 0.30000000000000004 0 5
0.7829160467761055 -0.01 0.35000000000000003 1 9
0.792841730559442 -0.01 0.35000000000000003 1 10
0.7975281156078042 -0.01 0.35000000000000003 1 11
0.8118695898697101 -0.01 0.4 1 5
0.8161997121844654 -0.01 0.45 2 5
-0.03 0.45 3 15 000000003 5 29 

In [26]:
# 0, 0.45, 2, 5

bad_words = ratio.loc[ratio >= 1 - sexist_comment_percent - 0.01].index
mid_words = ratio.loc[(1 - sexist_comment_percent - 0.01 > ratio) 
    & (ratio > 0.45)].index

predictions = []
for comment in clean(X_test):
    predictions.append(test_word3(comment, 2, 5, bad_words, mid_words))
    
print(f1_score(Y_test, predictions, average = "weighted"))

0.7880027200175773


In [15]:
from sklearn.metrics import classification_report
def eval_predictions(pred):
    print(classification_report(Y_test, pred))

import random
baseline = [0] * len(Y_test)
baseline2 = [int(random.uniform(0, 1) > .8) for i in predict]

print("BASE")
eval_predictions(baseline)
print("RAND")
eval_predictions(baseline2)
print("BAD")
eval_predictions(predict)
print("MID")
eval_predictions(predictions)

BASE
              precision    recall  f1-score   support

           0       0.73      1.00      0.84       789
           1       0.00      0.00      0.00       297

    accuracy                           0.73      1086
   macro avg       0.36      0.50      0.42      1086
weighted avg       0.53      0.73      0.61      1086

RAND
              precision    recall  f1-score   support

           0       0.73      0.83      0.78       789
           1       0.30      0.20      0.24       297

    accuracy                           0.66      1086
   macro avg       0.52      0.51      0.51      1086
weighted avg       0.62      0.66      0.63      1086

BAD
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       789
           1       0.80      0.40      0.53       297

    accuracy                           0.81      1086
   macro avg       0.80      0.68      0.71      1086
weighted avg       0.81      0.81      0.78      1086

MID
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
