In [75]:
import pandas as pd
import collections
import tensorflow as tf
import numpy as np

In [None]:
# Read the labelled Tamil (Tanglish) HASOC split from a tab-separated file,
# assigning the three column names explicitly.
df = pd.read_csv(
    'Task2/Tamil_hasoc_tanglish_test_withlabels.tsv',
    sep='\t',
    names=['id', 'text', 'category'],
)

In [169]:
# Read the Tamil code-mixed training tweets from the Excel workbook using the
# same id/text/category column names as `df`.
df2 = pd.read_excel(
    'OffensiveLanguage/Task2/Tamil-Codemixed_offensive_data_Training-Tweet-HL.xlsx',
    engine='openpyxl',
    names=['id', 'text', 'category'],
)
# Preview the first ten rows.
df2.head(10)

Unnamed: 0,id,text,category
0,TA_HL101,@Asha Apo neenga atha government ku theriya pa...,NOT
1,TA_HL102,@Bala sundar ayyo sorry...antha line ah explai...,NOT
2,TA_HL105,@kalimuthu ne ena lusa...yaaru edhu panaalum e...,NOT
3,TA_HL109,1st baby ku neat ah feed panunga plzz ipdi iru...,NOT
4,TA_HL113,"2012 il vazhgiromnu iruku ithula, apdina?",NOT
5,TA_HL118,30 varusa kadan. 25 age engayo idikuthe,NOT
6,TA_HL124,a vanitha veliya po ethuku thirumpi vantha,NOT
7,TA_HL125,à®•à¯à®´à®¨à¯à®¤à¯ˆ ga taste ah saptanum nu ...,NOT
8,TA_HL127,Aaiiii Jolly Yellam onnah polam onnah polam oa...,NOT
9,TA_HL128,"aaluku etha mathri pesarathu,thurumbi vanthu p...",NOT


In [170]:
# Stack the two labelled frames row-wise into a single training frame; the
# original row labels are dropped in favour of a fresh 0..n-1 index.
df3 = pd.concat((df, df2), ignore_index=True)
# Summarise column dtypes and non-null counts.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4939 non-null   object
 1   text      4939 non-null   object
 2   category  4939 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB


In [172]:
# Cast every column to pandas' string dtype, then drop every row that has a
# missing value in any column.
df3 = df3.astype({'id': 'string', 'text': 'string', 'category': 'string'})
df3.dropna(axis=0, inplace=True)
# Re-label the surviving rows 0..n-1 under an index named 'idx'.
# The length is taken from the frame itself instead of the previously
# hard-coded 4939, so this cell keeps working when the input files change
# (a length mismatch would make `insert` raise).
df3.insert(3, 'idx', pd.RangeIndex(stop=len(df3.index)), True)
df3.set_index('idx', inplace=True)

In [57]:
# Number of rows left in the cleaned training frame.
df3.shape[0]

4939

In [174]:
def gen_word_encodings(df,word_freq):
    """Count whitespace-separated tokens over ``df['text']``.

    Each text is stripped and split on whitespace; every resulting token
    increments its entry in ``word_freq``.

    Parameters
    ----------
    df : object exposing a ``'text'`` column that can be iterated
        (e.g. a pandas DataFrame).
    word_freq : mapping that supports ``word_freq[token] += 1`` for unseen
        tokens (e.g. ``collections.Counter``). Updated in place.

    Returns
    -------
    int
        Token count of the longest text, or 0 when there is no text.

    Notes
    -----
    The column values are iterated directly rather than looked up by index
    label, so frames with repeated or non-integer index labels are handled.
    Entries that are not ``str`` (for example missing values) are skipped.
    """
    max_len = 0
    for text in df['text']:
        if not isinstance(text, str):
            # Missing / non-text cell: nothing to tokenise.
            continue
        words = text.strip().split()
        max_len = max(max_len, len(words))
        for word in words:
            word_freq[word] += 1
    return max_len

In [175]:
# Build the word vocabulary from the training text. Both steps must run
# before any sentence can be encoded: the frequency pass fills `word_freq`
# (and reports the longest sentence), and the mapping below turns frequency
# rank into an integer id.
word_freq = collections.Counter()
max_len = gen_word_encodings(df3, word_freq)
# word -> id, most frequent word first, starting at 2 because ids 0 and 1
# are reserved for the padding and unknown-word markers assigned next.
# Note: a corpus token literally spelled "PAD" or "UNK" would have its id
# overwritten by these reserved entries.
word2idx = {
    word: rank
    for rank, (word, _) in enumerate(word_freq.most_common(), start=2)
}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
# id -> word (inverse of word2idx)
idx2word = dict((index, word) for word, index in word2idx.items())

In [176]:
# Show the token count of the longest training sentence, as returned by
# gen_word_encodings.
max_len

66

In [127]:
def gen_char_encodings(word_freq,char_freq):
    """Count characters across the distinct words of ``word_freq``.

    Every key of ``word_freq`` is visited exactly once, so a character is
    counted once per distinct word it occurs in (per occurrence inside that
    word), irrespective of how frequent the word itself is.

    Parameters
    ----------
    word_freq : iterable of word strings (e.g. the keys of a Counter).
    char_freq : mapping that supports ``char_freq[ch] += 1`` for unseen
        characters (e.g. ``collections.Counter``). Updated in place.

    Returns
    -------
    int
        Length of the longest word, or 0 when there are no words.
    """
    longest = 0
    for token in word_freq:
        longest = max(longest, len(token))
        for symbol in token:
            char_freq[symbol] += 1
    return longest

In [61]:
# Build the character vocabulary from the distinct words collected in
# `word_freq`; `max_word_len` is the length of the longest word.
char_freq = collections.Counter()
max_word_len = gen_char_encodings(word_freq, char_freq)
# char -> id, most frequent character first, starting at 2 because ids 0
# and 1 are reserved for the padding and unknown markers assigned next.
char2idx = {
    char: rank
    for rank, (char, _) in enumerate(char_freq.most_common(), start=2)
}
char2idx["PAD"] = 0
char2idx["UNK"] = 1
# id -> char (inverse of char2idx)
idx2char = dict((index, char) for char, index in char2idx.items())

In [None]:
# Display the full character -> id mapping (including the reserved
# "PAD" = 0 and "UNK" = 1 entries).
char2idx

In [173]:
# Remove unwanted characters from every training text: emojis, a few stray
# control / Latin-1 code points (e.g. '\xad', '\x90') and invisible
# zero-width or directional-isolate marks.
# The translation table is identical for every row, so it is built once and
# applied to the whole column instead of being rebuilt and written back
# row by row.
# NOTE(review): the recorded execution counts (In [173] here, In [174]/[175]
# for the vocabulary cells) suggest this cell was originally run before the
# vocabulary was built, although it appears after it in the notebook —
# confirm the intended order.
_removed_chars = ('🤗','😞','💪','😀','\xad','😭','🙌','\x90','😧','🙏','\x81','™','\x8f','😡','\x8d','🦂','🤣', '🤔', '🦁', '🤦', '🤝', '🤗', '🤩', '🤪', '🥭', '🤫', '🤘', '🤬', '🤙', '🥳', '🤨', '🧐', '🥰', '🥇', '🥶', '🥊', '🤛', '🤞', '🤕', '🤭', '🤟', '🤐', '🤺', '🧡', '🦸', '⏮', '⏸', '⏭', '🤧', '\U0001f7e0','🦍', '🧟', '🥁', '🤠', '🦌', '🦄', '🤓', '🧨', '🤮', '⏰', '🦅', '\u2066', '\u2069', '\u200b', '\u200d', '\u200c','\U000fe4eb')
# Mapping a code point to None makes str.translate delete that character.
_strip_table = {ord(ch): None for ch in _removed_chars}
df3['text'] = df3['text'].str.translate(_strip_table)

In [128]:
# function to generate dataset
def create_dataset(df,x,y):
    """Encode the texts and labels of ``df`` into ``x`` and ``y`` in place.

    Each text is stripped, split on whitespace and mapped token by token
    through the module-level ``word2idx``; tokens missing from it receive
    the id stored under ``"UNK"``. A label of ``'OFF'`` becomes 1, any other
    label becomes 0.

    Parameters
    ----------
    df : frame with ``'text'`` and ``'category'`` columns.
    x : indexable container with at least ``len(df)`` slots; slot ``k``
        receives the list of token ids of the k-th row.
    y : indexable container with at least ``len(df)`` slots; slot ``k``
        receives the 0/1 label of the k-th row.

    Notes
    -----
    Rows are written by position (0, 1, 2, ...), not by index label, so the
    function also works when ``df`` does not carry a 0..n-1 index. For a
    frame whose labels already are 0..n-1 the result is unchanged.
    """
    unk_id = word2idx["UNK"]
    rows = zip(df['text'], df['category'])
    for pos, (text, cat) in enumerate(rows):
        x[pos] = [word2idx.get(word, unk_id) for word in text.strip().split()]
        y[pos] = 1 if cat == 'OFF' else 0

In [129]:
# Encode the whole training frame into model inputs.
num_recs = df3.shape[0]
# x: one Python list of token ids per sentence (object array);
# y: one label per sentence, 1 for 'OFF' and 0 otherwise.
x = np.empty(num_recs, dtype=object)
y = np.zeros(num_recs, dtype=np.uint8)
create_dataset(df3, x, y)
# Bring every sequence to exactly 128 ids; this length matches the
# input_length given to the Embedding layer of the model.
x = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=128)

In [132]:
# Vocabulary size (includes the reserved "PAD" and "UNK" entries).
vocab_size = len(word2idx)

# The two directions of the bidirectional recurrent layer are built
# explicitly. Both have 32 units and the same dropout settings and return
# only their final output.
# NOTE(review): the backward LSTM uses activation='relu' while the forward
# one keeps the layer default — confirm this asymmetry is intentional.
forward_layer = tf.keras.layers.LSTM(
    32,
    dropout=0.25,
    recurrent_dropout=0.1,
    return_sequences=False,
)
backward_layer = tf.keras.layers.LSTM(
    32,
    activation='relu',
    dropout=0.25,
    recurrent_dropout=0.1,
    return_sequences=False,
    go_backwards=True,
)

# Embedding (128-dim vectors, sequences of 128 ids) -> BiLSTM with
# concatenated directions -> single sigmoid unit.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 128, input_length=128))
model.add(
    tf.keras.layers.Bidirectional(
        forward_layer,
        backward_layer=backward_layer,
        merge_mode='concat',
    )
)
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [133]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy
# reported as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Fit on the full encoded data set for 50 epochs in mini-batches of 32;
# the value returned by fit is kept in `hist`.
hist = model.fit(x, y, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [134]:
# Persist the trained model in HDF5 format.
# The data loaded in this notebook comes from the Tamil files and the cell
# that reloads the model reads 'task2_tamil.h5', so the model is saved under
# that name. The previous name 'task2_mal.h5' meant a fresh run reloaded a
# stale file and overwrote the model stored under the Malayalam name.
model.save('task2_tamil.h5')

In [177]:
# Load a saved Keras model from 'task2_tamil.h5'; this rebinds `model`,
# replacing whatever model object was held in memory before.
model=tf.keras.models.load_model('task2_tamil.h5')

In [178]:
# Load the unlabelled Tamil test set for prediction and cast all three
# columns to pandas' string dtype in one chained expression.
df_preds = pd.read_csv(
    'OffensiveLanguage/Task2/hasoc_tamil_task2_withoutlabels.tsv',
    sep='\t',
    names=['id', 'text', 'category'],
).astype({'id': 'string', 'text': 'string', 'category': 'string'})

In [5]:
# Load the unlabelled Malayalam test set for prediction and cast both
# columns to pandas' string dtype.
# NOTE(review): this rebinds `df_preds` to a frame whose columns are
# 'ID' and 'Tweets', while the prediction loop further down reads a 'text'
# column — confirm which test set is meant to be active.
df_preds = pd.read_excel(
    'Task2/mal-offensive-withoutlabels.xlsx',
    engine='openpyxl',
).astype({'ID': 'string', 'Tweets': 'string'})
# Summarise column dtypes and non-null counts.
df_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      999 non-null    string
 1   Tweets  999 non-null    string
dtypes: string(2)
memory usage: 15.7 KB


In [8]:
# Inspect one raw tweet: the 'Tweets' value of the row labelled 1.
df_preds.loc[1,'Tweets']

'USER cheruparamadathil than thinnunnath alla pinarayi thinnunnath pinarayikk oru barber venam mudi kalayan jacob thomas vannal aa joli ayale elpikkum'

In [142]:
# Add a 'category' column (placeholder value 'NA') to hold the predictions.
# The column is inserted only when it is missing. The previous
# allow_duplicates=True form silently created a second 'category' column
# whenever the frame already had one or the cell was run twice.
if 'category' not in df_preds.columns:
    df_preds.insert(2, 'category', 'NA')

In [179]:
# Predict a label for every test sentence and store it in 'category'.
#
# Changes compared with the earlier row-by-row loop:
#   * sequences are padded to the length the loaded model declares for its
#     input, instead of a fixed 50 that disagreed with the 128 used when
#     padding the training data; 50 is kept only as a fallback when the
#     model does not declare a fixed length;
#   * all sentences are encoded first and scored in a single batched
#     predict call rather than one model call per row;
#   * the labels are written back as one column assignment.

# Scores strictly above this value are labelled offensive ("OFF").
OFFENSIVE_THRESHOLD = 0.40

_unk_id = word2idx["UNK"]
# Same encoding as for training: strip, split on whitespace, map through
# word2idx, unknown words -> "UNK" id.
_encoded = [
    [word2idx.get(word, _unk_id) for word in text.strip().split()]
    for text in df_preds['text']
]

if _encoded:  # nothing to score for an empty test set
    # input_shape is (batch, sequence_length) for this single-input model;
    # the second entry is None when the length is not fixed.
    _seq_len = model.input_shape[1] or 50
    _padded = tf.keras.preprocessing.sequence.pad_sequences(_encoded, maxlen=_seq_len)
    # One sigmoid output per sentence -> take column 0 as a flat score array.
    _scores = model.predict(_padded, verbose=0)[:, 0]
    df_preds['category'] = np.where(_scores > OFFENSIVE_THRESHOLD, "OFF", "NOT")

In [180]:
# save predictions as a tab-separated file, without the index column
df_preds.to_csv('OffensiveLanguage/Task2/task2_tamil.tsv',sep="\t",index=False)
# Preview the first rows. This is now the last expression of the cell so the
# notebook actually renders it (it was previously followed by the save call
# and therefore never displayed).
df_preds.head()