<a href="https://colab.research.google.com/github/ask-1710/FIRE2021-OffensiveLanguageDetection/blob/main/HASOC_FIRE_2021_BAYEs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install transformers
!pip install demoji
!pip install nltk
import numpy as np
import pandas as pd

import copy
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score 
from tqdm import tqdm
import demoji
import nltk
import string
import pickle
import math
import numpy as np
import sys
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

demoji.download_codes() 
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams.update({'font.size': 16})
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 30.2 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.18-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 46.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 33.9 MB/s 
Collecting ruamel.yaml==0.17.16
  Downloading ruamel.yaml-0.17.16-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 64.4 MB/s 
Collecting 



In [2]:

class Tokenizer():
    def __init__(self):
        self.index = {}
        self.tf_idf_index = {}
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))

    def remove_punc(self, text):
        return ''.join([ch for ch in text if str(ch).isalpha() or ch == ' '])
    
    def remove_stop(self, text):
        return ' '.join([word for word in text.lower().split() if word not in self.stopwords])
    
    def get_wordnet_pos(self, word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)

    def lemmatize(self, text):
        # return [self.wordnet_lemmatizer.lemmatize(w, self.get_wordnet_pos(w)) for w in nltk.word_tokenize(text)]
        return [self.wordnet_lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text)]

    def build_index(self, article_id, tokenized):
        for (idx, token) in enumerate(tokenized):
            if token not in self.index.keys():
                self.index[token] = {}
            if article_id not in self.index[token].keys():
                self.index[token][article_id] = []
            self.index[token][article_id].append(idx+1)

class Dataset():
    def __init__(self, train_data, val_data, tokenizer, batch_size = 32):
        # self.train_data = train_data
        # self.val_data = val_data
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.label_dict = {'OFF':0,'NOT':1}
        
        self.sentences_train = []
        self.sentences_test = []

        self.y_train = []
        self.y_test = []

        self.process_train(train_data)
        self.process_test(val_data)
        
        vectorizer = CountVectorizer()
        self.vec = vectorizer.fit(self.sentences_train)

        self.X_train = self.vec.transform(self.sentences_train)
        self.X_test = self.vec.transform(self.sentences_test)

    def process_train(self, data):  
        tokens = []

        for article_id, line in enumerate(data):
            sentence = line.strip().split('\t')
            label = sentence.pop()
            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)
            sentence = ' '.join(sentence)
            emoji_dict = demoji.findall(sentence)
            if len(emoji_dict): 
                for emoji, text in emoji_dict.items():
                    sentence = sentence.replace(emoji, ' '+text+' ')
                    sentence = ' '.join(sentence.split())
            cleaned_text = tokenizer.remove_punc(sentence)
            removed_stop = tokenizer.remove_stop(cleaned_text)
            tokenized = tokenizer.lemmatize(removed_stop)
            self.sentences_train.append(' '.join(tokenized))
            self.y_train.append(label)
        
    def process_test(self, data):
        tokens = []
        
        for article_id, line in enumerate(data):
            sentence = line.strip().split('\t')
            label = sentence.pop()
            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)
            sentence = ' '.join(sentence)
            emoji_dict = demoji.findall(sentence)
            if len(emoji_dict): 
                for emoji, text in emoji_dict.items():
                    sentence = sentence.replace(emoji, ' '+text+' ')
                    sentence = ' '.join(sentence.split())
            cleaned_text = tokenizer.remove_punc(sentence)
            removed_stop = tokenizer.remove_stop(cleaned_text)
            tokenized = tokenizer.lemmatize(removed_stop)
            self.sentences_test.append(' '.join(tokenized))
            self.y_test.append(label)


In [4]:
data_train = pd.read_excel('/content/drive/MyDrive/Hasdoc/Malayalam_offensive_data_Training-YT.xlsx', names=["Name","Text","result"], index_col=False)
data_train.to_csv('/content/drive/MyDrive/Hasdoc/BUFFER.csv',sep='\t',header = False)

tokenizer = Tokenizer()
with open('/content/drive/MyDrive/Hasdoc/BUFFER.csv', 'r',) as f:
    train_data =f.readlines() 
print(train_data)
with open('/content/drive/MyDrive/Hasdoc/malayalam_hasoc_tanglish_test_withlabels.tsv', 'r') as f:
    val_data = f.readlines()
data = Dataset(train_data, val_data, tokenizer)

#Bayes
mult_bayes_results = {}
ber_bayes_results = {}

X_train, y_train = data.X_train, np.array(data.y_train)
X_test, y_test = data.X_test, np.array(data.y_test)
K = [1000, 5000, X_train.shape[0]]
print(y_train[:2])
print(X_train[:2])
print(y_train.shape)
print(X_train.shape)
for k in K:
    X = SelectKBest(mutual_info_classif,k=k).fit(X_train,y_train)
    X_train_new = X.transform(X_train)
    X_test_new = X.transform(X_test)
    print(f'Running Bayes Models on k = {k}............')
    # best_feature_idxs = data.best_features[:k]
    # X_train_new = X_train
    # X_test_new = X_test
    
    clf = MultinomialNB()
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)
    mult_bayes_results[k] = f1_score(y_test, y_pred, average = 'weighted')
    
    clf = BernoulliNB()
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)
    ber_bayes_results[k] = f1_score(y_test, y_pred, average = 'weighted')
    print('Done')

print(mult_bayes_results)
print(ber_bayes_results)

['0\tMA_YT001\tThaankal enthaan cheyyarullath?😛\tNOT\n', '1\tMA_YT002\tEe theetam WCC feminichigalude news aarkk venam...kondupode...\tOFF\n', '2\tMA_YT003\tfukru nem tiktok oolakale vilich charcha nadathiyenekka..bedham aanh\tOFF\n', '3\tMA_YT004\tAashiq abu produce cheytharunnel ee problems undakillarunnu.....\tNOT\n', '4\tMA_YT005\tPennungal oru team aayal ath moonjum ennu epoo mansilayallo\tOFF\n', '5\tMA_YT006\t@USER po oru Rajithakku parayam ennal oru Renjithinu parayan pattila. Karanam athu appol purushadhipathyam ayi pokum\tNOT\n', '6\tMA_YT007\t@USER Ath kazhinju ulla sentence vayik\tNOT\n', '7\tMA_YT008\t@USER TL vere oru ss kandu.\tNOT\n', '8\tMA_YT009\t@USER penninte peru paranj ,,,,soyam bodham polumillathe enthakeyo kaati kootunna kudumbathi pirakaatha poya kure alavalatjikal und ,athil oru teams aanu ivalumar,,,,,neeyum angne aano mole\tOFF\n', '9\tMA_YT010\t@USER ninte chanthik ente vaka orayiram umma\tOFF\n', '10\tMA_YT011\t@USER aaano arinjilla..pavam femichees😣😣\tOFF