# ALTERNATIVE MODEL FOR PART A

In [1]:
#loading packages

import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm 
import codecs
import random
import pandas as pd 
import re
import gensim

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn import model_selection

[nltk_data] Downloading package stopwords to /Users/mobby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#1. VISUALIZING DATA 

In [3]:
df = pd.read_csv('offenseval-training-v1.tsv',sep='\t', header=0, names = ['ID','Tweet','Task_A','Task_B','Task_C'])
df_test = pd.read_csv('offenseval-trial.txt',sep="\t", header=0, names = ['Tweet','Task_A','Task_B','Task_C'])

In [42]:
df.head(5)
#df_test.head(5)

Unnamed: 0,ID,Tweet,Task_A,Task_B,Task_C
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [5]:
#Pre-process functions 

def remove_emoji(sentence):
    
    #processed = sentence.decode('utf-8')
    
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    processed = emoji_pattern.sub(r'', sentence) # no emoji
    
    return(processed)

def replace_users(sentence):
    
    processed = re.sub('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)','@USER', sentence)
    return(processed)

def replace_url(sentence):
    
    processed = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','website', sentence)
    return(processed)
    

def remove_white(sentence):
    
    processed = sentence.replace(r'[^\w\d\s]',' ') #replace punctuation by space
    processed = processed.replace(r'\s+', ' ') #replace whitespaces with a single space
    processed = processed.replace(r'^\s+|\s+?$','') #replace leading and trailing whitespace
    processed = processed.replace('\n','')

    return(processed)


def remove_stop(sentence): #remove stop words
    
    stop_words = set(stopwords.words('english'))
    processed = sentence.apply(lambda x: ' '.join(term for term in x.split() is term not in stop_words))
    
    return(processed)

def remove_stems(sentence):
    
    ps = nltk.PorterStemmer()
    
    processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
    
    return(processed)

def preproc_dset(corpus):
    
    new_corpus = []
    
    for sentence in corpus :
        
        processed = remove_white(sentence)
        processed = remove_stop(processed)
        processed = remove_stems(processed)
        new_corpus.append(processed)
        
    return(new_corpus)


def remove_char(sentence):
    
    chars = ['.',',',':','!','?','%',')','(',';','[',']','{','}','$','@','#','=','^','*','$','/']
    for char in chars:
        sentence= sentence.replace(char,'')
        
        
    return(sentence)

def lower(sentence):
    
    sentence = sentence.lower()
    return(sentence)
        

In [6]:
def get_tokenized_corpus(corpus):
    
    tokenized_corpus = [] # Let us put the tokenized corpus in a list
    
    for sentence in corpus:
        
        tokenized_sentence = []
        for token in sentence.split(' '): # simplest split is 
            tokenized_sentence.append(token)
           
        tokenized_corpus.append(tokenized_sentence)
        
    return tokenized_corpus 

In [7]:
corpus = df['Tweet'].tolist()
labels = df['Task_A'].tolist()

#corpus = df['Tweet']
#labels = df['Task_A']

test_corpus = df_test['Tweet'].tolist()
test_labels = df_test['Task_A'].tolist()

print("corpus size :", len(corpus))
print("labels size :", len(labels))

print("corpus size :", len(test_corpus))
print("labels size :", len(test_labels))

corpus size : 13240
labels size : 13240
corpus size : 319
labels size : 319


In [8]:
#convert class labels to binary values 
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder_test = LabelEncoder()

Y = encoder.fit_transform(labels)
Y_test = encoder_test.fit_transform(test_labels)

print(Y[:10])
print(labels[:10])

[1 1 0 1 0 1 1 1 0 1]
['OFF', 'OFF', 'NOT', 'OFF', 'NOT', 'OFF', 'OFF', 'OFF', 'NOT', 'OFF']


In [9]:
##### PROCESS TRAINING DATA 

stop_words = set(stopwords.words('english'))

processed_corpus = []

for sentence in corpus :
    processed = remove_emoji(sentence)
    processed = remove_white(processed) #remove punctuation + white space
    processed = remove_char(processed)
    processed = lower(processed)
    processed_corpus.append(processed)

tokenized_corpus = get_tokenized_corpus(processed_corpus) #tokenize corpus

print('size of sentence before removing stopwords :',len(tokenized_corpus[0]))

filtered_corpus = []
for sentence in tokenized_corpus :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    filtered_corpus.append(new_sentence)
    
    
print('size of sentence after removing stopwords',len(filtered_corpus[0]))


ps = nltk.PorterStemmer()
stemmed_corpus = []
for sentence in filtered_corpus :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    stemmed_corpus.append(new_sentence)

#print('corpus insight after pre-processing :',stemmed_corpus[:3])

size of sentence before removing stopwords : 14
size of sentence after removing stopwords 5


In [10]:
######## PROCESS TEST DATA

stop_words = set(stopwords.words('english'))

test_processed_corpus = []

for sentence in test_corpus :
    processed = remove_emoji(sentence)
    processed = replace_users(processed)
    processed = replace_url(processed)
    processed = remove_white(processed) #remove punctuation + white space
    processed = remove_char(processed)
    processed = lower(processed)
    test_processed_corpus.append(processed)

test_tokenized_corpus = get_tokenized_corpus(test_processed_corpus) #tokenize corpus

print('size of sentence before removing stopwords :',len(test_tokenized_corpus[0]))

test_filtered_corpus = []
for sentence in test_tokenized_corpus :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    test_filtered_corpus.append(new_sentence)
    
print('size of corpus :', len(filtered_corpus))
    
print('size of sentence after removing stopwords',len(test_filtered_corpus[0]))


ps = nltk.PorterStemmer()
test_stemmed_corpus = []
for sentence in test_filtered_corpus :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    test_stemmed_corpus.append(new_sentence)

#print('\ncorpus insight after pre-processing :',test_stemmed_corpus[:20])
#print('\ncorpus before any pre-processing :', test_corpus[:3])

size of sentence before removing stopwords : 41
size of corpus : 13240
size of sentence after removing stopwords 23


# MODEL FOR DOC2VEC

In [24]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

dset = zip(stemmed_corpus,Y)
d = list(dset)

tagged_data = [TaggedDocument(words=sentence, tags=[str(i)]) for i, sentence in enumerate(stemmed_corpus)]

print(len(tagged_data))

#print(tagged_data[:20])

13240


# DOC2VEC TRAINING

In [25]:
max_epochs = 100
vec_size = 50
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha



iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [30]:
model.docvecs.most_similar('0')

[('7598', 0.7240195274353027),
 ('1569', 0.7178003191947937),
 ('12845', 0.7090256810188293),
 ('6559', 0.6986104846000671),
 ('3365', 0.6932913661003113),
 ('13108', 0.6818808913230896),
 ('3648', 0.6804354786872864),
 ('11225', 0.6674841046333313),
 ('11788', 0.6663023829460144),
 ('3045', 0.6596972942352295)]

In [94]:
X = model.docvecs[0].reshape(1,-1)

for i in range(1,len(stemmed_corpus)):
    X = np.concatenate((X,model.docvecs[i].reshape(1,-1)),axis=0)
    
print(np.shape(X))

(13240, 50)


In [112]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
#X_train = X.reshape(-1,1)
#print(np.shape(X_train))
#Y_train = Y.reshape(-1,1)
X_train, Y_train = ros.fit_resample(X, Y.reshape(-1,))

print(np.shape(X_train))
print(np.shape(Y_train))

(17680, 50)
(17680,)


# CLASSIFICATION MODEL

In [113]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(842, 86), random_state=1)

clf.fit(X_train,Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(842, 86), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [122]:
from sklearn.ensemble import RandomForestClassifier



gnb = RandomForestClassifier()

gnb.fit(X_train, Y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# MAKE PREDICTIONS

In [114]:
#compute Doc2Vec for test corpus
test_vec = []

for sentence in test_stemmed_corpus :
    
    labels = []
    probas = 0
    test_vec.append(model.infer_vector(sentence))
    
print(len(test_vec))

319


In [115]:
#make prediction for test vectors
Y_pred = []

for vec in test_vec :
    
    Y_pred.append(clf.predict(vec.reshape(1,-1))[0])

print(len(Y_pred))

319


In [123]:
Y_pred2 = []

for vec in test_vec :
    
    Y_pred2.append(gnb.predict(vec.reshape(1,-1))[0])

print(len(Y_pred2))

319


# RESULTS

In [116]:
conf_mat = confusion_matrix(Y_test,Y_pred)
print(conf_mat)

[[242.   0.]
 [ 77.   0.]]


In [126]:
compute_recall_precision_f1(conf_mat)

For class  0  : 

Precision :  0.7586206896551724
Recall :  1.0
F1 :  0.8627450980392156
For class  1  : 

Precision :  nan
Recall :  0.0
F1 :  nan
Classification Rate :  0.7586206896551724


  from ipykernel import kernelapp as app


In [124]:
conf_mat2 = confusion_matrix(Y_test,Y_pred2)
print(conf_mat2)

[[188.  54.]
 [ 57.  20.]]


In [125]:
compute_recall_precision_f1(conf_mat2)

For class  0  : 

Precision :  0.7673469387755102
Recall :  0.7768595041322314
F1 :  0.7720739219712526
For class  1  : 

Precision :  0.2702702702702703
Recall :  0.2597402597402597
F1 :  0.26490066225165565
Classification Rate :  0.6520376175548589
