# MAIN MODEL FOR PART A

In [341]:
#loading packages

import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm 
import codecs
import random
import pandas as pd 
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn import model_selection

[nltk_data] Downloading package stopwords to /Users/mobby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [342]:
#1. VISUALIZING DATA 

In [343]:
df = pd.read_csv('offenseval-training-v1.tsv',sep='\t', header=0, names = ['ID','Tweet','Task_A','Task_B','Task_C'])
df_test = pd.read_csv('offenseval-trial.txt',sep="\t", header=0, names = ['Tweet','Task_A','Task_B','Task_C'])

In [344]:
df.head(5)
df_test.head(5)

Unnamed: 0,Tweet,Task_A,Task_B,Task_C
0,@LeftyGlenn @jaredeker @BookUniverse @hashtagz...,NOT,,
1,Hot Mom Sucks Off Step Son In Shower 8 min htt...,OFF,UNT,
2,bro these are some cute butt plugs I’m trying ...,OFF,UNT,
3,Arizona Supreme Court strikes down state legis...,NOT,,
4,Arguing gun control is wrong of me whoever has...,NOT,,


# PRE-PROCESSING

In [345]:
#Pre-process functions 

def remove_emoji(sentence):
    
    #processed = sentence.decode('utf-8')
    
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    processed = emoji_pattern.sub(r'', sentence) # no emoji
    
    return(processed)

def replace_users(sentence):
    
    processed = re.sub('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)','@USER', sentence)
    return(processed)

def replace_url(sentence):
    
    processed = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','website', sentence)
    return(processed)
    

def remove_white(sentence):
    
    processed = sentence.replace(r'[^\w\d\s]',' ') #replace punctuation by space
    processed = processed.replace(r'\s+', ' ') #replace whitespaces with a single space
    processed = processed.replace(r'^\s+|\s+?$','') #replace leading and trailing whitespace
    processed = processed.replace('\n','')

    return(processed)


def remove_stop(sentence): #remove stop words
    
    stop_words = set(stopwords.words('english'))
    processed = sentence.apply(lambda x: ' '.join(term for term in x.split() is term not in stop_words))
    
    return(processed)

def remove_stems(sentence):
    
    ps = nltk.PorterStemmer()
    
    processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
    
    return(processed)

def preproc_dset(corpus):
    
    new_corpus = []
    
    for sentence in corpus :
        
        processed = remove_white(sentence)
        processed = remove_stop(processed)
        processed = remove_stems(processed)
        new_corpus.append(processed)
        
    return(new_corpus)


def remove_char(sentence):
    
    chars = ['.',',',':','!','?','%',')','(',';','[',']','{','}','$','@','#','=','^','*','$','/']
    for char in chars:
        sentence= sentence.replace(char,'')
        
        
    return(sentence)

def lower(sentence):
    
    sentence = sentence.lower()
    return(sentence)
        

In [346]:
def get_tokenized_corpus(corpus):
    
    tokenized_corpus = [] # Let us put the tokenized corpus in a list
    
    for sentence in corpus:
        
        tokenized_sentence = []
        for token in sentence.split(' '): # simplest split is 
            tokenized_sentence.append(token)
           
        tokenized_corpus.append(tokenized_sentence)
        
    return tokenized_corpus 

In [347]:
corpus = df['Tweet'].tolist()
labels = df['Task_A'].tolist()

#corpus = df['Tweet']
#labels = df['Task_A']

test_corpus = df_test['Tweet'].tolist()
test_labels = df_test['Task_A'].tolist()

print("corpus size :", len(corpus))
print("labels size :", len(labels))

print("corpus size :", len(test_corpus))
print("labels size :", len(test_labels))

corpus size : 13240
labels size : 13240
corpus size : 319
labels size : 319


In [348]:
#convert class labels to binary values 
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder_test = LabelEncoder()

Y = encoder.fit_transform(labels)
Y_test = encoder_test.fit_transform(test_labels)

print(Y[:10])
print(labels[:10])

[1 1 0 1 0 1 1 1 0 1]
['OFF', 'OFF', 'NOT', 'OFF', 'NOT', 'OFF', 'OFF', 'OFF', 'NOT', 'OFF']


In [349]:
##### PROCESS TRAINING DATA 

stop_words = set(stopwords.words('english'))

processed_corpus = []

for sentence in corpus :
    processed = remove_emoji(sentence)
    processed = remove_white(processed) #remove punctuation + white space
    processed = remove_char(processed)
    processed = lower(processed)
    processed_corpus.append(processed)

tokenized_corpus = get_tokenized_corpus(processed_corpus) #tokenize corpus

print('size of sentence before removing stopwords :',len(tokenized_corpus[0]))

filtered_corpus = []
for sentence in tokenized_corpus :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    filtered_corpus.append(new_sentence)
    
    
print('size of sentence after removing stopwords',len(filtered_corpus[0]))


ps = nltk.PorterStemmer()
stemmed_corpus = []
for sentence in filtered_corpus :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    stemmed_corpus.append(new_sentence)

#print('corpus insight after pre-processing :',stemmed_corpus[:3])

size of sentence before removing stopwords : 14
size of sentence after removing stopwords 5


In [350]:
######## PROCESS TEST DATA

stop_words = set(stopwords.words('english'))

test_processed_corpus = []

for sentence in test_corpus :
    processed = remove_emoji(sentence)
    processed = replace_users(processed)
    processed = replace_url(processed)
    processed = remove_white(processed) #remove punctuation + white space
    processed = remove_char(processed)
    processed = lower(processed)
    test_processed_corpus.append(processed)

test_tokenized_corpus = get_tokenized_corpus(test_processed_corpus) #tokenize corpus

print('size of sentence before removing stopwords :',len(test_tokenized_corpus[0]))

test_filtered_corpus = []
for sentence in test_tokenized_corpus :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    test_filtered_corpus.append(new_sentence)
    
print('size of corpus :', len(filtered_corpus))
    
print('size of sentence after removing stopwords',len(test_filtered_corpus[0]))


ps = nltk.PorterStemmer()
test_stemmed_corpus = []
for sentence in test_filtered_corpus :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    test_stemmed_corpus.append(new_sentence)

#print('\ncorpus insight after pre-processing :',test_stemmed_corpus[:20])
#print('\ncorpus before any pre-processing :', test_corpus[:3])

size of sentence before removing stopwords : 41
size of corpus : 13240
size of sentence after removing stopwords 23


In [351]:
all_words = []

for sentence in stemmed_corpus :
    for word in sentence :
        all_words.append(word)
        
all_words = nltk.FreqDist(all_words)

print('Number of words :',len(all_words))
print('Most common words :', all_words.most_common(10))

Number of words : 18370
Most common words : [('user', 33386), ('url', 2054), ('liber', 1576), ('gun', 1503), ('like', 1186), ('control', 1183), ('antifa', 1112), ('conserv', 1006), ('maga', 992), ('get', 902)]


In [352]:
word_features = list(all_words.keys())[:1500] #10k words as features -------> HYPER-PARAMETER

In [353]:
dset = zip(stemmed_corpus,Y)
d = list(dset)

test_dset = zip(test_stemmed_corpus,Y_test)
test_d = list(test_dset)

seed = 1
np.random.seed = seed
np.random.shuffle(d)

np.random.shuffle(test_d)

print(d[0])
print(test_d[:10])

(['user', 'northern', 'az', 'good', 'look', 'forward', 'see', 'maga'], 0)
[(['user', "he'", 'use', 'administr', 'sell', 'uranium', 'profit', 'spi', 'polit', 'oppon', 'bow', 'enemi', 'probabl', 'best', 'draintheswamp'], 0), (['oncologist', 'discuss', 'use', 'medic', 'marijuana', 'patient', 'new', 'studi', 'find', 'websit', 'websit'], 0), (['user', 'gemini’', 'fav', 'fuck', 'hater', 'tbh'], 1), (['guess', 'ima', 'beat', 'lil', 'kid', 'ass', 'today', 'caus', 'lil', 'becki', 'kid', 'get', 'away', 'websit'], 1), (['user', 'user', 'user', 'user', 'user', 'user', 'found', 'insect', 'egg', '“medic', 'marijuana”', 'curaleaf', 'cost', '55', '18', 'socialjustic', 'websit'], 0), (['user', 'user', 'user', 'user', 'user', 'user', 'fine', 'could', 'afford', 'gun', 'want', 'could', 'fit', 'budget', 'budget', 'fine', 'canada', 'gun', 'insur', 'gun', 'control', 'lotsa', 'p'], 0), (['im', 'think', 'read', 'cathedral"', 'back', 'ap', 'british', 'lit', '"the', 'a&ampp"', 'first', 'year', 'colleg', 'fuckin'

In [354]:
def find_features(sentence):
    
    features = {}
    for word in word_features:
        features[word] = (word in sentence)
        
    return (features)

In [355]:
f = find_features(stemmed_corpus[0])

print(stemmed_corpus[0])
print(f)

['user', 'ask', 'nativ', 'american', 'take']
{'user': True, 'ask': True, 'nativ': True, 'american': True, 'take': True, 'go': False, 'home': False, 'you’r': False, 'drunk': False, 'maga': False, 'trump2020': False, 'url': False, 'amazon': False, 'investig': False, 'chines': False, 'employe': False, 'sell': False, 'intern': False, 'data': False, 'third-parti': False, 'seller': False, 'look': False, 'edg': False, 'competit': False, 'marketplac': False, 'kag': False, 'china': False, 'tcot': False, 'someon': False, 'should\'vetaken"': False, 'piec': False, 'shit': False, 'volcano': False, '"': False, 'obama': False, 'want': False, 'liber': False, '&amp': False, 'illeg': False, 'move': False, 'red': False, 'state': False, 'kookoo': False, 'oh': False, 'noe': False, 'tough': False, 'liter': False, 'talk': False, 'lol': False, 'mass': False, 'shoot': False, 'like': False, 'set': False, 'up': False, 'it’': False, 'propaganda': False, 'use': False, 'divid': False, 'us': False, 'major': False, '

# SPLIT DATA SET 

In [356]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC

features_set = [(find_features(sentence),label) for (sentence,label) in d]
test_features_set = [(find_features(sentence),label) for (sentence,label) in test_d]

#print(features_set[0])

In [357]:
print('feature set size :', np.shape(features_set))

training, testing = model_selection.train_test_split(features_set, test_size = 0.25, random_state = seed)

print('training set size :',np.shape(training))
print('test set size :', np.shape(testing))

feature set size : (13240, 2)
training set size : (9930, 2)
test set size : (3310, 2)


# Training models

In [358]:
## DEFINE SEVERAL MODELS

names = ['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = [
    
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100), 
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names,classifiers)
m = list(models)

In [360]:
## TEST ON TESTINT SET

from sklearn.ensemble import VotingClassifier

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = m, voting ='hard', n_jobs =-1))
nltk_ensemble.train(features_set)
acc2 = nltk.classify.accuracy(nltk_ensemble, test_features_set) *100 
print('Ensemble accuracy :',acc2)

Ensemble accuracy : 84.63949843260188


# Compute predictions for model

In [361]:
predictions = []
true_preds = []

print(np.shape(test_features_set))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = m, voting ='hard', n_jobs =-1))
nltk_ensemble.train(features_set)

error = 0

for (features, label) in test_features_set:
    pred = nltk_ensemble.classify(features)
    predictions.append(pred)
    true_preds.append(label)
    
    if pred != label:
        error += 1
    
#print(len(predictions))
#print((len(predictions) - error)/len(predictions))

(319, 2)


In [362]:
from nltk.metrics import ConfusionMatrix
from sklearn.metrics import f1_score


cm = ConfusionMatrix(true_preds, predictions)
print(cm)

  |   0   1 |
--+---------+
0 |<226> 16 |
1 |  31 <46>|
--+---------+
(row = reference; col = test)



In [363]:
print(predictions[:10])
print(Y_test[:10])

print(type(Y_test))
print(type(predictions))

#list(Y_test)

[0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
[0 1 1 0 0 0 0 1 1 0]
<class 'numpy.ndarray'>
<class 'list'>


# Compute F1 score and Confusion matrix 

In [364]:
def confusion_matrix(y_true, y_pred):
    conf_matrix = np.zeros((2,2))
    for i in range(len(y_true)):
        conf_matrix[y_true[i],y_pred[i]] += 1
    return conf_matrix

In [365]:
def compute_recall_precision_f1(confusion_matrix):
    true_pred = 0
    total_pred = 0
    for i in range(len(confusion_matrix)):
        true_pos = confusion_matrix[i][i]
        true_pred += true_pos
        total_pred += sum(confusion_matrix[i])
        false_neg = 0
        false_pos = 0
        for j in range(len(confusion_matrix)):
            if j!=i:
                false_neg += confusion_matrix[i][j]
                false_pos += confusion_matrix[j][i]
        recall = true_pos / (true_pos + false_neg)
        precision = true_pos / (true_pos + false_pos)
        f1 = 2*(precision * recall)/(precision + recall)
        print("==========================")
        print("For class ", i, " : ")
        print()
        print("Precision : ", precision)
        print("Recall : ", recall)
        print("F1 : ", f1)
        print("==========================")
    print("Classification Rate : ", true_pred/total_pred)

In [366]:
conf_mat = confusion_matrix(true_preds,predictions)
print(conf_mat)

[[226.  16.]
 [ 31.  46.]]


In [367]:
compute_recall_precision_f1(conf_mat)

For class  0  : 

Precision :  0.8793774319066148
Recall :  0.9338842975206612
F1 :  0.905811623246493
For class  1  : 

Precision :  0.7419354838709677
Recall :  0.5974025974025974
F1 :  0.6618705035971223
Classification Rate :  0.8526645768025078


# COMPUTE RESULTS CODATLAB

In [368]:
data_submission = pd.read_csv('testset-taska.tsv', sep='\t')

In [372]:
submit_corpus = data_submission['tweet'].tolist()

print("corpus size :", len(submit_corpus))

corpus size : 860


In [373]:
######## PROCESS TEST DATA

stop_words = set(stopwords.words('english'))

submit_processed_corpus = []

for sentence in submit_corpus :
    processed = remove_emoji(sentence)
    processed = replace_users(processed)
    processed = replace_url(processed)
    processed = remove_white(processed) #remove punctuation + white space
    processed = remove_char(processed)
    processed = lower(processed)
    submit_processed_corpus.append(processed)

submit_tokenized_corpus = get_tokenized_corpus(submit_processed_corpus) #tokenize corpus

print('size of sentence before removing stopwords :',len(submit_tokenized_corpus[0]))

submit_filtered_corpus = []
for sentence in submit_tokenized_corpus :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    submit_filtered_corpus.append(new_sentence)
    
print('size of corpus :', len(submit_filtered_corpus))
    
print('size of sentence after removing stopwords',len(submit_filtered_corpus[0]))


ps = nltk.PorterStemmer()
submit_stemmed_corpus = []
for sentence in submit_filtered_corpus :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    submit_stemmed_corpus.append(new_sentence)

#print('\ncorpus insight after pre-processing :',test_stemmed_corpus[:20])
#print('\ncorpus before any pre-processing :', test_corpus[:3])

size of sentence before removing stopwords : 27
size of corpus : 860
size of sentence after removing stopwords 26


In [374]:
type(submit_stemmed_corpus)

list

In [375]:
submit_features_set = [find_features(sentence) for sentence in submit_stemmed_corpus]

In [376]:
predictions = []
true_preds = []

error = 0

for feature in submit_features_set:
    pred = nltk_ensemble.classify(feature)
    predictions.append(pred)

In [377]:
len(predictions)

860

In [379]:
data_submission['prediction'] = predictions

In [383]:
dico_target = {0:'NOT', 1:'OFF'}
datalength = data_submission.shape[0]
for i in range(datalength):
    data_submission.at[i, 'target'] = dico_target[data_submission.at[i, 'prediction']]

In [385]:
data_submission = data_submission.drop(['prediction'], axis=1)
data_submission = data_submission.drop(['tweet'], axis=1)

In [388]:
data_submission.to_csv('subpart_a_submit.csv', index=False, header=False)

In [369]:
#######################################################################################################################
#######################################################################################################################
#############################################################################################################################BITE##
#######################################################################################################################