In [1]:
import re

In [3]:
import nltk
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [26]:
df = pd.read_csv('labeled_data.csv')

In [27]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [28]:
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ai_ds_a1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ai_ds_a1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ai_ds_a1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ai_ds_a1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
# Tweet Preprocessing

def pre_processing(tweet: str):
    
    # Remove Leading Blank Spaces
    tweet = tweet.strip()
    
    # Lower Case
    tweet = tweet.lower()
    
    # Remove URLS 
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    tweet = re.sub(url_pattern, "", tweet)
    
    # Remove UserName
    username_pattern = re.compile(r"@\w+")
    tweet = re.sub(username_pattern, "", tweet)
    
    # Remove Hashtags
    hashtag_pattern = re.compile(r"#\w+")
    tweet = re.sub(hashtag_pattern, "", tweet)
    
    # Character normalization // todaaaaay -> today
    tweet = re.sub(r"([a-zA-Z])\1{2,}", r'\1', tweet)
    
    # Remove Special Characters
    tweet = re.sub(r'[^a-zA-Z\s]', "", tweet)
    
    # Word Tokenizer
    tweet = nltk.word_tokenize(tweet)
    
    # Remove Stop Words 
    stop_words = set([re.sub(r'[^a-zA-Z\s]', "", word) for word in nltk.corpus.stopwords.words("english")])
    tweet = [word for word in tweet if word not in stop_words and word != "rt"]
    
    # lemmatization
    def get_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"N": "n", "V": "v", "R": "r", "J": "a"}
        return tag_dict.get(tag, "n")
    
    lemma = nltk.stem.WordNetLemmatizer()
    tweet = [lemma.lemmatize(word, pos=get_pos(word)) for word in tweet]
    
    return tweet

df["pre-tweet"] = df["tweet"].apply(pre_processing)

pre_processing("i like to eat pizza @napolean57pizzeriaaa it is simplyyy amaziiiing!!!")

['like', 'eat', 'pizza', 'simply', 'amaze']

In [31]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,pre-tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,"[woman, complain, cleaning, house, amp, man, a..."
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho..."
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,"[dawg, ever, fuck, bitch, start, cry, confuse,..."
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,"[look, like, tranny]"
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,"[shit, hear, might, true, might, faker, bitch,..."


In [32]:
df = df[["pre-tweet","class"]]

In [33]:
df.head(5)

Unnamed: 0,pre-tweet,class
0,"[woman, complain, cleaning, house, amp, man, a...",2
1,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",1
2,"[dawg, ever, fuck, bitch, start, cry, confuse,...",1
3,"[look, like, tranny]",1
4,"[shit, hear, might, true, might, faker, bitch,...",1


In [34]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df["pre-tweet"].values, df["class"].values, train_size=0.8)

In [35]:
x_test

array([list(['hey', 'call', 'bitch']),
       list(['hoe', 'talent', 'da', 'girl', 'put', 'book', 'info', 'bio', 'get', 'k', 'follower']),
       list(['see', 'nig', 'today', 'somebody', 'understands', 'music', 'like']),
       ..., list(['hoe', 'steady', 'come']), list([]),
       list(['wifey', 'bitch', 'money', 'cuz', 'money', 'nothing', 'life', 'blur', 'way', 'go'])],
      dtype=object)

In [38]:
df[df["class"] == 1] 

Unnamed: 0,pre-tweet,class
1,"[boy, dat, coldtyga, dwn, bad, cuffin, dat, ho...",1
2,"[dawg, ever, fuck, bitch, start, cry, confuse,...",1
3,"[look, like, tranny]",1
4,"[shit, hear, might, true, might, faker, bitch,...",1
5,"[shit, blow, meclaim, faithful, somebody, stil...",1
...,...,...
24774,"[really, care, bout, dis, bitch, dick, yo, fee...",1
24775,"[worried, bout, bitch, need]",1
24778,"[yous, muthafin, lie, right, tl, trash, mine, ...",1
24780,"[young, buck, wan, na, eat, dat, nigguh, like,...",1


In [39]:
df[df["class"] == 0]

Unnamed: 0,pre-tweet,class
85,"[queer, gaywad]",0
89,"[alsarabs, he, beaner, smh, tell, he, mexican]",0
110,"[fuck, gay, blacklist, hoe, hold, anyway]",0
184,"[lmfao, hate, black, people, there, black, peo...",0
202,"[least, im, nigger, lmfao]",0
...,...,...
24576,"[guy, big, faggot, omfg]",0
24685,"[one, name, offensive, kike, wop, kraut, wetba...",0
24751,"[pussy, as, nigga, know, nigga]",0
24776,[nigger],0


In [40]:
df[df["class"] == 2]

Unnamed: 0,pre-tweet,class
0,"[woman, complain, cleaning, house, amp, man, a...",2
40,"[momma, say, pussy, cat, inside, doghouse]",2
63,"[simplyaddictedtoguys, woof, woof, hot, scally...",2
66,"[woof, woof, hot, sol]",2
67,"[lemmie, eat, oreo, amp, dish, one, oreo, lol]",2
...,...,...
24736,"[yaya, ho, cute, avi, tho, idea, sleep]",2
24737,"[yea, new, friend, friend, kno, theyre, allow,...",2
24767,"[know, say, early, bird, get, worm, put, gummy...",2
24779,"[go, broke, wrong, heart, baby, drove, redneck...",2


In [41]:
vocab = set()

for words in x_train:
    for word in words:
        vocab.add(word)
        
print("Vocab Size :", len(vocab))

Vocab Size : 14581


In [42]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[K     |████████████████████████████████| 26.6 MB 11.7 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 1.3 MB/s  eta 0:00:01
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-6.4.0
Note: you may need to restart the kernel to use updated packages.


In [43]:
from gensim.models import Word2Vec

g_model = Word2Vec(vector_size=200, window=5, workers=5)
g_model.build_vocab(x_train)
g_model.train(x_train, total_examples=g_model.corpus_count, epochs=500)

(51763321, 73511500)

In [44]:
def in_vocab(word_l):
    for word in word_l:
        if word not in g_model.wv:
            return False
    else:
        return True

train_vec = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in x_train]
test_vec  = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in x_test]

In [46]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_vec, y_train)

from sklearn.metrics import classification_report, accuracy_score

predict = model.predict(test_vec)
print("Accuracy Score :", accuracy_score(y_test, predict), end='\n\n')
print(classification_report(y_true = y_test, y_pred = predict))

Accuracy Score : 0.79382691143837

              precision    recall  f1-score   support

           0       0.38      0.07      0.12       279
           1       0.80      0.98      0.88      3849
           2       0.70      0.17      0.28       829

    accuracy                           0.79      4957
   macro avg       0.63      0.41      0.43      4957
weighted avg       0.76      0.79      0.74      4957

