In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import nltk
import string

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.utils.multiclass import unique_labels
from zipfile import ZipFile

In [2]:
# Unpack the dataset archive into the working directory; Train.csv (read just
# below) is presumably one of the extracted files.
# The handle is called `archive` so the built-in `zip` is not shadowed for the
# rest of the notebook.
with ZipFile("gbv.zip", "r") as archive:
    archive.extractall()

# Load the training table and show the raw tweets plus the distinct class names.
df = pd.read_csv("Train.csv")
print(df["tweet"])
print(unique_labels(df["type"]))

0        Had a dream i got raped last night. By a guy i...
1        he thought the word raped means sex and told m...
2        She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...
3        I was sexually abused for 3 years at age 4 to ...
4        Chessy Prout can do better by telling the trut...
                               ...                        
39645    ENTRY 1299: 21F. 23M, BF’s cousin. Got drunk o...
39646    So you’re telling me Emmanuel Macron was groom...
39647    My wife regularly beats me, I get dirty slaps ...
39648    Me: Hey babe! Police officer boyfriend: is tha...
39649    “I will take accountability if you think it’s ...
Name: tweet, Length: 39650, dtype: object
['Harmful_Traditional_practice' 'Physical_violence' 'economic_violence'
 'emotional_violence' 'sexual_violence']


In [3]:
def clean_text(x):
    """Normalise one raw tweet string for tokenisation.

    Steps, applied in this order:
      1. lowercase the text;
      2. drop every non-ASCII character (emoji, curly quotes, ...);
      3. replace URLs, @mentions and #hashtags with a space;
      4. remove an apostrophe together with the word characters after it
         (e.g. "it's" -> "it");
      5. replace remaining ASCII punctuation with a space;
      6. remove any token that contains a digit;
      7. collapse runs of two or more whitespace characters to one space.

    Args:
        x: the tweet text (a ``str``).

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    x = x.lower()
    x = x.encode("ascii", "ignore").decode()
    # Patterns are raw strings so that \S, \w, \d and \s reach the regex engine
    # without triggering Python's invalid-escape-sequence warning.
    x = re.sub(r"https*\S+", " ", x)
    x = re.sub(r"@\S+", " ", x)
    x = re.sub(r"#\S+", " ", x)
    x = re.sub(r"'\w+", "", x)
    x = re.sub("[%s]" % re.escape(string.punctuation), " ", x)
    x = re.sub(r"\w*\d+\w*", "", x)
    x = re.sub(r"\s{2,}", " ", x)
    return x

# Clean every tweet. The Series values are iterated directly instead of being
# looked up with data_to_list[i]: integer lookup on a Series is label-based and
# would fail (or pick the wrong row) if the frame's index were ever not 0..n-1.
data_to_list = df["tweet"]
temp = [clean_text(tweet) for tweet in data_to_list]

def tokenize(y):
    """Lazily yield the ``word_tokenize`` token list of each item in ``y``.

    Every item is converted with ``str`` before being tokenised, so
    non-string entries are tokenised by their string form.
    """
    yield from (word_tokenize(str(text)) for text in y)

# Materialise the token generator: one list of tokens per cleaned tweet.
data_words = [tokens for tokens in tokenize(temp)]

def detokenize(txt):
    """Join a sequence of tokens back into one string.

    A fresh ``TreebankWordDetokenizer`` is created on every call and its
    ``detokenize`` result is returned unchanged.
    """
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(txt)

# Rebuild one cleaned string per tweet from its token list.
final_data = [detokenize(tokens) for tokens in data_words]

# Quick sanity check of the first few cleaned tweets, then switch to a NumPy
# array for the downstream steps.
print(final_data[:5])
final_data = np.array(final_data)

['had a dream i got raped last night by a guy i work with actually a guy i smoked with once at my house but he was doing too much tryna be sexual and it wasnt even like that for me just wanted to smoke', 'he thought the word raped means sex and told me i saw our dogs raping eachother and i was like wtf', 'she not talking to me i was raped by men molested he in jail on nother charge so she not saying word', 'i was sexually abused for years at age to no one believed me i was raped by my bros friend in a classroom at he was i told no one cause who would believe me my bro found out when i was his friend bragged to the wrong person it is hard to come forward', 'chessy prout can do better by telling the truth by not selling owen labrie out to hide whoever else dna was in her underwear she said i have never said he raped me that changed chessy to i was raped i was violated white female privilege allowed her a platform to lie']


In [4]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

In [5]:
# Vocabulary cap handed to the Tokenizer, and the fixed length every encoded
# tweet is padded/truncated to.
max_words = 16000
max_len = 200

# Fit the word index on the cleaned tweets, then encode and pad them.
# NOTE(review): the vocabulary is fitted on all tweets before the
# train/validation/test split below — confirm this is intended.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(final_data)
sequences = tokenizer.texts_to_sequences(final_data)
tweets = pad_sequences(sequences, maxlen=max_len)

# Persist the fitted tokenizer so the same word index can be reloaded later.
with open("tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
print(tweets)


# Integer class id for each category name found in df["type"]. The mapping has
# its own name so the built-in `dict` stays usable in later cells.
label_to_id = {
    "Harmful_Traditional_practice": 0,
    "Physical_violence": 1,
    "economic_violence": 2,
    "emotional_violence": 3,
    "sexual_violence": 4,
}
df["labels"] = df["type"].map(label_to_id)
labels = df["labels"]

# Series.map yields NaN for any value missing from the mapping; stop here with
# a clear message instead of passing NaN labels on to training.
unmapped = sorted(df.loc[labels.isna(), "type"].astype(str).unique())
if unmapped:
    raise ValueError("Unmapped values in df['type']: {}".format(unmapped))

# Hold out 25% for testing, then 25% of the remainder for validation.
# test_size is spelled out in the first call (0.25 is also scikit-learn's
# default), so the resulting split is the same as before.
# NOTE(review): neither split is stratified; consider stratify=labels if the
# classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    tweets, labels, test_size=0.25, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)

[[   0    0    0 ...  182    4 1921]
 [   0    0    0 ...    9   34  559]
 [   0    0    0 ...   24  108  480]
 ...
 [   0    0    0 ...   32  166  126]
 [   0    0    0 ... 1673  203  190]
 [   0    0    0 ...  117   77  312]]


In [6]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, GRU, Dense

def build_model(inputs, num_classes=5):
    """Build the tweet classifier: Embedding -> GRU -> GRU -> softmax.

    The builder has its own name so that the module-level ``model`` only ever
    refers to the built network; previously the function was overwritten by its
    own return value, so re-running the cell called the network instead of the
    builder.

    Args:
        inputs: Keras ``Input`` tensor of integer token ids.
        num_classes: number of output classes (width of the softmax layer).

    Returns:
        An uncompiled ``Model`` mapping ``inputs`` to class probabilities.
    """
    # Embedding input size is the tokenizer's vocabulary cap (max_words).
    x = Embedding(max_words, 128)(inputs)
    # First GRU returns the full sequence so the second GRU can consume it.
    x = GRU(64, return_sequences=True)(x)
    x = GRU(64)(x)
    outputs = Dense(num_classes, activation="softmax")(x)
    return Model(inputs, outputs)

model = build_model(Input(shape=(None,), dtype="int32"))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         2048000   
                                                                 
 gru (GRU)                   (None, None, 64)          37248     
                                                                 
 gru_1 (GRU)                 (None, 64)                24960     
                                                                 
 dense (Dense)               (None, 5)                 325       
                                                                 
Total params: 2,110,533
Trainable params: 2,110,533
Non-trainable params: 0
_________________________________________________________________


In [7]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

if __name__ == "__main__":
    # Labels are integer class ids, hence the sparse categorical cross-entropy.
    model.compile(
        optimizer=Adam(),
        loss=SparseCategoricalCrossentropy(),
        metrics=["accuracy"],
    )
    # Save the full model (not just weights) whenever validation accuracy improves.
    checkpoint = ModelCheckpoint(
        "gbv.h5",
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=False,
    )
    model.fit(
        x_train,
        y_train,
        batch_size=32,
        epochs=4,
        validation_data=(x_val, y_val),
        callbacks=[checkpoint],
    )
    # Evaluate the best checkpoint rather than the weights from the last epoch.
    best = load_model("gbv.h5")
    loss, acc = best.evaluate(x_test, y_test, verbose=2)
    print("\nTest acc: {:.2f} %".format(100*acc))
    # Cross-entropy loss is not a percentage, so report the raw value.
    print("Test loss: {:.4f}".format(loss))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
310/310 - 2s - loss: 0.0114 - accuracy: 0.9973 - 2s/epoch - 7ms/step

Test acc: 99.73 %
Test loss: 1.14 %
