<h1 style="text-align:center;">Offensive Language Classification</h1>

In [1]:
#import essential modules
import pandas as pd
import collections
import tensorflow as tf
import numpy as np

### We'll begin by loading and cleaning the dataset

In [None]:
# Read the tab-separated train and (unlabeled) test splits into DataFrames
df_train,df_test=(
    pd.read_csv(path,sep='\t')
    for path in (
        'OffensiveLanguage/Task1/tamil_offensive_train.tsv',
        'OffensiveLanguage/Task1/tam_offesive_withoutlabels_test.tsv.csv',
    )
)

In [7]:
# Cast the columns we work with to pandas' nullable 'string' dtype.
# Only columns that are actually present are cast: DataFrame.astype raises
# KeyError for a mapping key that is not a column, and the unlabeled test file
# may not contain a 'category' column -- TODO confirm against the file.
string_cols=('id','text','category')
df_train=df_train.astype({col:'string' for col in string_cols if col in df_train.columns}) #cast values to useable format
df_test=df_test.astype({col:'string' for col in string_cols if col in df_test.columns}) #cast values to useable format

In [12]:
# optional: remove runs of '.' from both ends of every comment.
# Vectorised through the .str accessor (missing values stay missing) and applied
# to both frames so train and test text go through the same cleaning.
df_train['text']=df_train['text'].str.strip('.')
df_test['text']=df_test['text'].str.strip('.')
df_train.head() # the text in head() may still appear to end in ... but that is just pandas' max display width

Unnamed: 0,id,text,category
0,tam1,திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற...,NOT
1,tam2,இந்த ட்ரெய்லர் கூட பார்க்கிற மாதிரி இல்லை.. இத...,OFF
2,tam3,மைசூரு செட்டியார் சமூகத்தின் சார்பாக இப்படம் வ...,NOT
3,tam4,மொத்த சாதியும் ஒரு சாதிக்கு எதிரா நிக்குது.......,OFF
4,tam5,only for விஜய் சேதுபதி and STR,NOT


In [6]:
# add a placeholder 'label' column (position 3) in the train df; it will hold the
# machine-usable class ids derived from 'category'.
# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError when
# the column already exists.
if 'label' not in df_train.columns:
    df_train.insert(3,'label','_')
df_train.head()

Unnamed: 0,id,text,category,label
0,tam1,திருமலை நாயக்கர் பேரவை சார்பாக படம் வெற்றி பெற...,NOT,_
1,tam2,இந்த ட்ரெய்லர் கூட பார்க்கிற மாதிரி இல்லை.. இத...,OFF,_
2,tam3,மைசூரு செட்டியார் சமூகத்தின் சார்பாக இப்படம் வ...,NOT,_
3,tam4,மொத்த சாதியும் ஒரு சாதிக்கு எதிரா நிக்குது.......,OFF,_
4,tam5,only for விஜய் சேதுபதி and STR,NOT,_


In [None]:
# convert category to machine usable labels: 'NOT' -> 0, every other value -> 1
# (vectorised; a missing category compares as "not 'NOT'" and therefore maps to 1)
is_not_offensive=df_train['category'].eq('NOT').fillna(False).astype(bool)
df_train['label']=(~is_not_offensive).astype('int64')
df_train.head()

In [13]:
# function to generate frequency dictionary and return max sentence length
def gen_word_encodings(df,word_freq):
    """Count whitespace-separated tokens of df['text'] into `word_freq`.

    Parameters
    ----------
    df : DataFrame with a 'text' column of strings.
    word_freq : mutable counter-like mapping (e.g. collections.Counter);
        updated in place, one increment per token occurrence.

    Returns
    -------
    int
        Number of tokens in the longest sentence (0 for an empty frame).
    """
    max_len=0
    # iterate over the values directly so the result does not depend on the
    # kind of index the frame carries (RangeIndex, labels, filtered rows, ...)
    for text in df['text']:
        words=text.strip().split()
        max_len=max(max_len,len(words))
        for word in words:
            word_freq[word]+=1
    return max_len

In [14]:
#creating Encodings
# NOTE(review): in file order this cell runs before the emoji-stripping cell, so
# the vocabulary is counted on uncleaned text -- confirm the intended cell order.
word_freq=collections.Counter()
max_len=gen_word_encodings(df_train,word_freq) # fills word_freq in place, returns the longest sentence length in tokens
#word to index maps as{"word1":idx1,"word2":idx2...}
# ids 0 ("PAD") and 1 ("UNK") are reserved; real words get 2, 3, ... by descending frequency.
# Corpus tokens that happen to be spelled "PAD"/"UNK" are skipped so the ids stay
# dense and every id is < len(word2idx) (the size handed to the Embedding layer).
vocab_words=[word for word,_ in word_freq.most_common() if word not in ("PAD","UNK")]
word2idx={"PAD":0,"UNK":1}
word2idx.update({word:idx for idx,word in enumerate(vocab_words,start=2)})
#idx to word mapping
idx2word={v:k for k,v in word2idx.items()}

In [15]:
# number of distinct whitespace-separated tokens counted in the training text
len(word_freq)

24754

In [9]:
# not needed to build the dataset itself; used to discover the unwanted chars and emojis that get stripped from it
def gen_char_encodings(word_freq,char_freq):
    """Count the characters of every distinct word into `char_freq`.

    Each key of `word_freq` contributes its characters once, regardless of how
    often the word occurs. `char_freq` is updated in place.

    Returns the length of the longest word (0 when `word_freq` is empty).
    """
    longest=0
    for token in word_freq:
        longest=max(longest,len(token))
        for symbol in token:
            char_freq[symbol]+=1
    return longest

In [10]:
# character-level vocabulary derived from the distinct words in word_freq
char_freq=collections.Counter()
max_word_len=gen_char_encodings(word_freq,char_freq)
# most frequent character gets id 2, the next one 3, ...; 0 and 1 are the PAD / UNK ids
char2idx={symbol:rank for rank,(symbol,_) in enumerate(char_freq.most_common(),start=2)}
char2idx["PAD"]=0
char2idx["UNK"]=1
#idx to char mapping
idx2char={idx:symbol for symbol,idx in char2idx.items()}

In [None]:
# Display the character vocabulary to spot emojis and invisible control characters.
# The tuple in the trailing comment lists characters singled out from that output;
# it appears to be the basis of the removal list used in the cleaning cell below.
#max_word_len # input length to embedding vector
char2idx #('🤣', '🤔', '🦁', '🤦', '🤝', '🤗', '🤩', '🤪', '🥭', '🤫', '🤘', '🤬', '🤙', '🥳', '🤨', '🧐', '🥰', '🥇', '🥶', '🥊', '🤛', '🤞', '🤕', '🤭', '🤟', '🤐', '🤺', '🧡', '🦸', '⏮', '⏸', '⏭', '🤧', '\U0001f7e0','🦍', '🧟', '🥁', '🤠', '🦌', '🦄', '🤓', '🧨', '🤮', '⏰', '🦅', '\u2066', '\u2069', '\u200b', '\u200d', '\u200c')

In [11]:
# remove emojis and unwanted characters from both dataframes
# Single source of truth for the characters to drop (previously duplicated per frame).
unwanted_chars=('🦂','🤣', '🤔', '🦁', '🤦', '🤝', '🤗', '🤩', '🤪', '🥭', '🤫', '🤘', '🤬', '🤙', '🥳', '🤨', '🧐', '🥰', '🥇', '🥶', '🥊', '🤛', '🤞', '🤕', '🤭', '🤟', '🤐', '🤺', '🧡', '🦸', '⏮', '⏸', '⏭', '🤧', '\U0001f7e0','🦍', '🧟', '🥁', '🤠', '🦌', '🦄', '🤓', '🧨', '🤮', '⏰', '🦅', '\u2066', '\u2069', '\u200b', '\u200d', '\u200c','\U000fe4eb')
# str.translate deletes every code point mapped to None; the table is built once
# instead of once per row, and .str.translate applies it to the whole column
# (missing values are left as missing).
strip_table={ord(ch): None for ch in unwanted_chars}
df_train['text']=df_train['text'].str.translate(strip_table)
df_test['text']=df_test['text'].str.translate(strip_table)

### We can now create the dataset

In [79]:
# function to generate dataset
def create_dataset(df,x,y,vocab=None):
    """Encode every row of `df` into token ids and a binary label, in place.

    Parameters
    ----------
    df : DataFrame with a 'text' and a 'category' column.
    x : indexable container with one slot per row; slot k receives the list of
        token ids of the k-th row (words missing from the vocabulary map to the
        "UNK" id).
    y : indexable container with one slot per row; slot k receives 1 when the
        k-th row's category is 'OFF', otherwise 0.
    vocab : optional word -> id mapping that contains an "UNK" entry. Defaults
        to the notebook-level `word2idx`.

    Rows are addressed by position (0, 1, 2, ...), which coincides with the
    index labels for a default RangeIndex and also works for any other index.
    Returns None; `x` and `y` are filled as a side effect.
    """
    if vocab is None:
        vocab=word2idx
    unk_id=vocab["UNK"]
    for pos,(text,cat) in enumerate(zip(df['text'],df['category'])):
        x[pos]=[vocab.get(word,unk_id) for word in text.strip().split()]
        y[pos]=1 if cat=='OFF' else 0

In [80]:
num_recs=len(df_train) # number of training sentences (works for any index type, not only RangeIndex)
x=np.empty((num_recs,),dtype=list) # x holds sentence vectors (object array, one list of token ids per sentence)
y=np.zeros((num_recs,),dtype="uint8") # y holds category 1 for Offensive 0 for Not-Offensive
create_dataset(df_train,x,y)
# bring every sentence to exactly 50 ids; pad_sequences' defaults pad and truncate
# at the front and fill with 0, which is the "PAD" id
x=tf.keras.preprocessing.sequence.pad_sequences(x,maxlen=50)

In [26]:
# positional 70/30 hold-out split (rows are not shuffled)
lenxt=int(len(x)*0.7)
xt,xtt=x[:lenxt],x[lenxt:] # x-train (first 70%) / x-evaluation (remaining 30%)
yt,ytt=y[:lenxt],y[lenxt:] # y-train (first 70%) / y-evaluation (remaining 30%)

## Define models

In [16]:
# Test 1 LSTM units 64 perf ~80%
vocab_size=len(word2idx)
model=tf.keras.models.Sequential([
    # mask_zero=True lets downstream layers skip the 0 ("PAD") positions
    tf.keras.layers.Embedding(vocab_size,128,mask_zero=True,input_length=50),
    # return_sequences=False: only the final LSTM output is passed on, so the
    # Dense head yields one score per sentence (matching the per-sentence labels in y)
    tf.keras.layers.LSTM(64,dropout=0.25,recurrent_dropout=0.1,return_sequences=False,unroll=True),
    tf.keras.layers.Dense(1,activation="sigmoid"),
])


In [19]:
# Test 2 bidirectional LSTM with 32 units perf ~81%
# Test 3 bidirectional LSTM with 64 units perf ~81% -- increasing the number of units does not affect perf
vocab_size=len(word2idx)
model=tf.keras.models.Sequential()
# token ids -> 128-d vectors, sequences of length 50
model.add(tf.keras.layers.Embedding(vocab_size,128,input_length=50))
# one LSTM run in both directions, the two outputs concatenated
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,dropout=0.2,recurrent_dropout=0.2),merge_mode='concat'))
# single sigmoid unit: one score per sentence
model.add(tf.keras.layers.Dense(1,activation="sigmoid"))


In [122]:
# Test 4 separate forward/backward LSTM layers in bidirectional with 32 units each perf ~82.5%
# Test 5 separate forward/backward LSTM layers in bidirectional with 16 units each perf ~80%
# Test 6 separate forward/backward LSTM layers in bidirectional with 32 units each, dropout increased to 30% perf ~82%
# Test 7 separate forward/backward LSTM layers in bidirectional with 32 units each, dropout decreased to 25% and 10% perf ~82.7%

vocab_size=len(word2idx)
# forward direction keeps the LSTM's default activation
forward_layer = tf.keras.layers.LSTM(32,dropout=0.25,recurrent_dropout=0.1,return_sequences=False)
# backward direction reads the sequence reversed and uses a relu activation
backward_layer = tf.keras.layers.LSTM(32,activation='relu',dropout=0.25,recurrent_dropout=0.1,return_sequences=False,go_backwards=True)
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size,128,input_length=50))
model.add(tf.keras.layers.Bidirectional(forward_layer, backward_layer=backward_layer,merge_mode='concat'))
model.add(tf.keras.layers.Dense(1,activation="sigmoid"))


### Train the model

In [112]:
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
# Fit on the 70% split and score the held-out 30% after every epoch, so that
# hist.history also carries the val_loss / val_accuracy curves the plotting
# cell reads. For a final submission model, refit on the full (x, y) afterwards.
hist=model.fit(xt,yt,batch_size=32,epochs=50,validation_data=(xtt,ytt))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [113]:
# write the trained model to an HDF5 file so it can be reloaded without retraining
model.save(filepath='task1.h5')

In [16]:
# restore the saved model from disk and show its output shape as a sanity check
model=tf.keras.models.load_model(filepath='task1.h5')
model.output_shape

(None, 1)

### Make predictions on test dataset

In [17]:
# encode every test comment with the training vocabulary (unseen words -> "UNK")
unk_id=word2idx["UNK"]
test_seqs=[[word2idx.get(word,unk_id) for word in text.strip().split()] for text in df_test['text']]
# same fixed length of 50 ids as used for the training sequences
test_x=tf.keras.preprocessing.sequence.pad_sequences(test_seqs,maxlen=50)
# one batched inference pass instead of one model call per row
if len(test_x):
    test_probs=model.predict(test_x,batch_size=32).ravel()
else:
    test_probs=np.zeros((0,),dtype="float32") # nothing to score for an empty test frame
# a score above 0.40 is tagged offensive
df_test['category']=np.where(test_probs>0.40,"OFF","NOT")
    
        

In [20]:
# write the test frame (now including the predicted category) as a tab-separated file without the index column
df_test.to_csv(path_or_buf='OffensiveLanguage/Task1/task1_submission.tsv',sep='\t',index=False)

In [None]:
#plot training curves: accuracy and loss per epoch
from matplotlib import pyplot as plt

history=hist.history
fig,(ax_acc,ax_loss)=plt.subplots(2,1,figsize=(10,8))

ax_acc.set_title('Accuracy')
ax_acc.plot(history["accuracy"],color='b',label='Train')
# validation curves exist only when fit() was given validation data
if "val_accuracy" in history:
    ax_acc.plot(history["val_accuracy"],color='g',label='Validation')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(loc="best")

ax_loss.set_title('Loss')
ax_loss.plot(history["loss"],color='#FFA500',label='Train')
if "val_loss" in history:
    ax_loss.plot(history["val_loss"],color='g',label='Validation')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(loc="best")

fig.tight_layout()
plt.show()