## Import Libraries

In [0]:
# Install packages
!pip install xlrd
!pip install gensim
!pip install -U nltk
!pip install spacy

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/8d/5d/825889810b85c303c8559a3fd74d451d80cf3585a851f2103e69576bf583/nltk-3.4.3.zip (1.4MB)
[K     |████████████████████████████████| 1.5MB 30.3MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/54/40/b7/c56ad418e6cd4d9e1e594b5e138d1ca6eec11a6ee3d464e5bb
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.4.3


## Access to GDrive

# Data Cleaning and Preprocessing

In [0]:
# Load stop words and more (will be added later)
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Cleaning dataset functions

In [0]:
# Cleaning data
import unicodedata
import re
import gensim
from nltk import tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

del stop_words[stop_words.index('not')]
del stop_words[stop_words.index('your')]

# Remove unwanted noise
stop_words.append('rt')
stop_words.append('wow')
stop_words.append('ok')
stop_words.append('mo')
stop_words.append('dm')
stop_words.append('idgaf')

CONTRACTION_MAP = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "hehe will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "hey would",
"they'd've": "they would have",
"they'll": "tthey will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
"fvcking": "fucking",
"fking": "fucking",
# Correction
    # "ill": "I will",
    "seriois": "serious",
    #"mo": "my",
    #"ass": "asss",
    "lmaoooooooo": "lmao",
    "uncomf": "uncomfortable",
    "pls": "please",
    "lowlife": "low life",
    "puss": "pussy",
    # Hashtash expand
    "ihatefemales": "I hate females",
    "yesallwomen": "yes all women",
    "sendthemhome": "send them home",
    "stoptheinvasion": "stop the invation",
    "buildthewall": "build the wall",
    "womensuck": "women suck",
    "stopimmigration": "stop immigration",
    "sendthemback": "send them back"
}

# Expand the contractions
def expand_contractions(s, contractions_dict=CONTRACTION_MAP):
    contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)

# Sentences to words
def sent_to_words(sentences):
    for sentence in sentences:
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
# tokenize
def sent_tokenize(sents):
    return tokenize.sent_tokenize(sents)
  
# Function to remove some unwanted info
def remove_info(data, remove_list):
    for i, sent in enumerate(data):
        for com in remove_list:
            if com.strip() != '':
                sent = sent.lower().replace(com.lower(), '')
        data[i] = sent
    return data

# Remove special characters; only get ASCII
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', str(input_str))
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    only_ascii = str(only_ascii)[2:-1]
    return only_ascii

# Remove emails, link, tweeter account, ..
def clean(texts):
    # Remove Emails
    data = [re.sub('\S*@\S*\s?', '', sent.lower().strip()) for sent in texts]
    # Remove @username
    data = [re.sub('@\S*\s?', '', sent) for sent in data]
    # Remove link
    data = [re.sub(r'http\S+', '', sent) for sent in data]
    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]
    # Remove multi dots
    data = [re.sub('\.\.', '', sent) for sent in data]
    # Remove distracting quotes
    # data = [re.sub("\"", "", sent) for sent in data]
    # Remove #
    data = [re.sub("#", "", sent) for sent in data]
    # Remove number
    data = [re.sub(r"\d+", '', sent) for sent in data]
    return data

# Func to remove stop words
def remove_stopwords(texts, stop_words=stop_words):
    return [' '.join([word.replace(' ', '') 
                for word in gensim.utils.simple_preprocess(str(doc)) \
                if word.replace(' ', '') not in stop_words]) for doc in texts]

# Func to lemmatize words using spacy
def lemmatization(texts, stop_words=None, 
                  allowed_postags=('NOUN', 'VERB', 'ADV')):
    nlp = en_core_web_md.load()
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc 
                            if token.pos_ in allowed_postags \
                              and token.lemma_ not in stop_words])
    return texts_out

# Func to lemmatize words using nltk
# Refer to https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
def lemmatization_nltk(sents):
    # input: sents -> list of sentences
    # output: list of lemmatized words
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)

    lemmatized_words = []
    
    lemmatizer = WordNetLemmatizer()
    for sentence in sents:
        lemmatized_words.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) 
                                  for w in nltk.word_tokenize(sentence)]))
    return lemmatized_words

# Normalized pipeline
def normalization_pipeline(texts):
    normalized_texts = []
    # lower text
    normalized_texts = [text.lower() for text in texts]
    
    # Clean text: special words, numbers, ..
    normalized_texts = clean(normalized_texts)
    # Expand contraction 
    normalized_texts = [expand_contractions(text) for text in normalized_texts]
    normailzed_texts = [remove_accents(text) for text in normalized_texts]
    # Break paragraph into sentents
    normalized_texts = [sent_tokenize(text) for text in normalized_texts]
    
    # Remove stopwords
    normalized_texts = remove_stopwords(normalized_texts)
    
    normalized_texts = lemmatization_nltk(normalized_texts)
    return normalized_texts
    

In [0]:
# Read data from file
import pandas as pd

# Point to the file in Google Drive
filename='/content/gdrive/My Drive/trial_en.tsv'
df = pd.read_csv(filename, sep='\t')
corpus = df['text'].values

normalized_texts = normalization_pipeline(corpus)

df['normalized_text'] = normalized_texts
print(df.head())

   id  ...                                    normalized_text
0   1  ...                           shut fuck come suck dick
1   2  ...  fuck say leave block first gon na kick your as...
2   3  ...  cock get hard want pull your panty push door l...
3   4  ...  ill kill bitch chloe your not home kid bitch r...
4   5  ...         get rape beautiful woman like work project

[5 rows x 6 columns]


In [0]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2,
                           norm='l2',
                           smooth_idf=True,
                           use_idf=True,
                           ngram_range=(1, 1))
features = vectorizer.fit_transform(corpus)
print(features.todense())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.26331868 0.18521249]
 [0.         0.         0.         ... 0.27264559 0.23875348 0.33586775]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


# Feature enginerring
Bow, TF-IDF and Word2Vec

In [0]:
# Feature extraction
# Use TF and TF-IDF from scikit-learn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd

def bow_extractor(corpus, ngram_range=(2,2)):
    vectorizer = CountVectorizer(min_df=2, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def tfidf_extractor(corpus, ngram_range=(2,2)):
    vectorizer = TfidfVectorizer(min_df=2,
                               norm='l2',
                               smooth_idf=True,
                               use_idf=True,
                               ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
  
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                 smooth_idf=True,
                                 use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

def display_features(features, feature_names):
    df = pd.DataFrame(data=features,
                    columns=feature_names)
    print(df)
  

In [0]:
# Demo Feature extraction with count vector and tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = CountVectorizer()
train_data_features = vectorizer.fit_transform(df['normalized_text'].values.copy())
vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features.toarray(), axis=0)

# Print Word Count
# for tag, count in zip(vocab, dist):
#    print(count, tag)

corpus = df['normalized_text'].values.copy()
bow_vect, bow_features = bow_extractor(corpus)
features = bow_features.todense()
feature_names = bow_vect.get_feature_names()
# display_features(features, feature_names)

# TF-IDF
tfidf_vectorizer, tdidf_features = tfidf_extractor(corpus.copy())
features = tdidf_features.todense()
feature_names = tfidf_vectorizer.get_feature_names()

#display_features(np.round(tdidf_features.todense(), 2), feature_names)

In [0]:
# word2vec feature extraction
from gensim.models import Word2Vec
import nltk
import numpy as np
np.random.seed(1)

# define function to average word vectors for a text document    
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

# generalize above function for a corpus of documents  
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, 
                                   vocabulary, num_features)
                  for tokenized_sentence in corpus]
    return np.array(features)

# define function to compute tfidf weighted averaged word vector for a document
def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    word_tfidfs = [tfidf_vector[0, tfidf_vocabulary.get(word)] 
                 if tfidf_vocabulary.get(word) else 0 for word in words]
    word_tfidf_map = {word:tfidf_val for word, tfidf_val in zip(words, word_tfidfs)}
    feature_vector = np.zeros((num_features,),dtype="float32")
    vocabulary = set(model.wv.index2word)
    wts = 0.
    for word in words:
        if word in vocabulary:
            word_vector = model[word]
            weighted_word_vector = word_tfidf_map[word] * word_vector
            wts = wts + word_tfidf_map[word]
        feature_vector = np.add(feature_vector, weighted_word_vector)
    if wts:
        feature_vector = np.divide(feature_vector, wts)
    return feature_vector

# generalize above function for a corpus of documents    
def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors, tfidf_vocabulary, model, num_features):
    docs_tfidfs = [(doc, doc_tfidf)
                 for doc, doc_tfidf
                 in zip(corpus, tfidf_vectors)]
    features = [tfidf_wtd_avg_word_vectors(tokenized_sentence, tfidf, tfidf_vocabulary,
                                 model, num_features)
                  for tokenized_sentence, tfidf in docs_tfidfs]
    return np.array(features)


corpus = df['normalized_text'].values.copy()
tokenized_corpus = [nltk.word_tokenize(sent) for sent in corpus]


model = Word2Vec(tokenized_corpus, min_count=2, workers=4)
avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_corpus,
                                                 model=model,
                                                 num_features=100)

print(np.round(avg_word_vec_features, 5))

W0608 11:31:23.912069 140007620867968 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


[[-1.92e-03  7.80e-04  2.00e-04 ... -1.05e-03  2.90e-04 -7.30e-04]
 [ 9.00e-05  1.11e-03 -1.55e-03 ... -9.40e-04 -1.07e-03 -8.00e-04]
 [ 4.60e-04  8.50e-04 -2.00e-03 ... -8.00e-05  1.10e-03 -9.80e-04]
 ...
 [-2.40e-04 -3.40e-04  7.80e-04 ...  1.47e-03 -8.60e-04 -7.10e-04]
 [-4.34e-03  1.08e-03 -2.20e-04 ... -2.68e-03 -1.55e-03 -2.19e-03]
 [ 0.00e+00  0.00e+00  0.00e+00 ...  0.00e+00  0.00e+00  0.00e+00]]


  del sys.path[0]


## Preparing Data before using models

In [0]:
# Prepare data for modeling

from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
np.random.seed(1)

def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                      test_size=test_data_proportion, 
                                                      random_state=42,
                                                      shuffle=True)
    return train_X, test_X, train_Y, test_Y

def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', 
        np.round(metrics.accuracy_score(true_labels, predicted_labels), 2))
    print('Precision:', 
        np.round(metrics.precision_score(true_labels,
                                         predicted_labels,
                                         average='weighted'),2))
    print('Recall:', 
        np.round(metrics.recall_score(true_labels,
                                      predicted_labels,
                                      average='weighted'),2))
    print('F1 Score:', 
        np.round(metrics.f1_score(true_labels,
                                  predicted_labels,
                                  average='weighted'),2))
    return

def train_predict_evaluate_model(classifier, train_features, 
                                 train_labels, test_features, test_labels):
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)

    return predictions




## Defined Accuracy Report

In [0]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

def accuracy_report(yvalid, prediction):
    cm1 = confusion_matrix(yvalid, prediction)
    print('Confusion Matrix :', cm1)

    total1=sum(sum(cm1))

    accuracy1=(cm1[0,0]+cm1[1,1])/total1

    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])

    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    
    print('                 ')
    print('Accuracy:', accuracy_score(yvalid, prediction))
    print('Precision:', precision_score(yvalid, prediction))
    print('Recall:', recall_score(yvalid, prediction))
    print('F1:', f1_score(yvalid, prediction))
    return accuracy_score(yvalid, prediction), f1_score(yvalid, prediction), \
            recall_score(yvalid, prediction), precision_score(yvalid, prediction)
            

# Cross-validation with TFIDF and Word2Vec

Support vector machine, Logistic Regression with TFIDF and Word2Vec

## SVM

### SVM Cross validation with Word2Vec

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
np.random.seed(1)

params = {
        'C': [0.01, 0.1, 1., 10, 50, 100],
        'kernel': ['rbf'],
        'gamma': [0.01, 0.1, 1., 10., 100.]
}

corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels

#### New results

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
np.random.seed(1)

corpus = df['normalized_text'].values
labels = df['HS'].values

X = [nltk.word_tokenize(text) for text in corpus]
X = np.array(X)
y = labels
# averaged word vector features
kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

w2vmodel = gensim.models.Word2Vec(X,
                               size=500, 
                               min_count=2,
                               window=1, 
                               workers=4)
best_params = None

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    avg_wv_train_features = averaged_word_vectorizer(corpus=x_train, 
                                                     model=w2vmodel, 
                                                     num_features=500)
    avg_wv_test_features = averaged_word_vectorizer(corpus=x_test, 
                                                    model=w2vmodel, 
                                                    num_features=500)
    if best_params is None:
        svm = SVC(random_state=42)
        random_search = RandomizedSearchCV(svm, 
                                           param_distributions=params, 
                                           cv=4, 
                                           random_state=42)
        random_search.fit(avg_wv_train_features, y_train)
        best_params = random_search.best_params_
        print(best_params)
    
    svm = SVC(random_state=42, C=best_params['C'], 
             kernel=best_params['kernel'],
             gamma=best_params['gamma'])
    print('SVM results with AVG word vector')
    svm_predictions = train_predict_evaluate_model(classifier=svm, 
                                                 train_features=avg_wv_train_features, 
                                                 train_labels=y_train,
                                                 test_features=avg_wv_test_features, 
                                                 test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, svm_predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('SVM Overall with Word2Vec')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

W0608 11:32:19.446745 140007620867968 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
  del sys.path[0]


{'kernel': 'rbf', 'gamma': 100.0, 'C': 50}
SVM results with AVG word vector
Confusion Matrix : [[4 7]
 [1 8]]
                 
Accuracy: 0.6
Precision: 0.5333333333333333
Recall: 0.8888888888888888
F1: 0.6666666666666667
SVM results with AVG word vector
Confusion Matrix : [[5 5]
 [2 8]]
                 
Accuracy: 0.65
Precision: 0.6153846153846154
Recall: 0.8
F1: 0.6956521739130435
SVM results with AVG word vector
Confusion Matrix : [[5 5]
 [1 9]]
                 
Accuracy: 0.7
Precision: 0.6428571428571429
Recall: 0.9
F1: 0.75
SVM results with AVG word vector
Confusion Matrix : [[4 7]
 [0 9]]
                 
Accuracy: 0.65
Precision: 0.5625
Recall: 1.0
F1: 0.72
SVM results with AVG word vector
Confusion Matrix : [[5 3]
 [5 7]]
                 
Accuracy: 0.6
Precision: 0.7
Recall: 0.5833333333333334
F1: 0.6363636363636365
SVM Overall with Word2Vec
Acc= 0.64 , std= 0.03741657386773941
f1 score= 0.6937364953886693 , std= 0.03970754092001219
Precision= 0.6108150183150183 , std= 0.05

  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


#### Old results

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
np.random.seed(1)

corpus = df['normalized_text'].values
labels = df['HS'].values

X = [nltk.word_tokenize(text) for text in corpus]
X = np.array(X)
y = labels
# averaged word vector features
kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

w2vmodel = gensim.models.Word2Vec(X,
                               size=500, 
                               min_count=2,
                               window=1, 
                               workers=4)
best_params = None

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    avg_wv_train_features = averaged_word_vectorizer(corpus=x_train, 
                                                     model=w2vmodel, 
                                                     num_features=500)
    avg_wv_test_features = averaged_word_vectorizer(corpus=x_test, 
                                                    model=w2vmodel, 
                                                    num_features=500)
    svm = SVC(random_state=42)
    random_search = RandomizedSearchCV(svm, 
                                       param_distributions=params, 
                                       cv=4, 
                                       random_state=42)
    random_search.fit(avg_wv_train_features, y_train)
    best_params = random_search.best_params_
    svm = SVC(random_state=42, C=best_params['C'], 
             kernel=best_params['kernel'],
             gamma=best_params['gamma'])
    print('SVM results with AVG word vector')
    svm_predictions = train_predict_evaluate_model(classifier=svm, 
                                                 train_features=avg_wv_train_features, 
                                                 train_labels=y_train,
                                                 test_features=avg_wv_test_features, 
                                                 test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, svm_predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('SVM Overall with Word2Vec')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

W0608 11:32:28.522340 140007620867968 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
  del sys.path[0]
  del sys.path[0]


SVM results with AVG word vector
Confusion Matrix : [[4 7]
 [1 8]]
                 
Accuracy: 0.6
Precision: 0.5333333333333333
Recall: 0.8888888888888888
F1: 0.6666666666666667
SVM results with AVG word vector
Confusion Matrix : [[5 5]
 [2 8]]
                 
Accuracy: 0.65
Precision: 0.6153846153846154
Recall: 0.8
F1: 0.6956521739130435
SVM results with AVG word vector
Confusion Matrix : [[5 5]
 [1 9]]
                 
Accuracy: 0.7
Precision: 0.6428571428571429
Recall: 0.9
F1: 0.75


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


SVM results with AVG word vector
Confusion Matrix : [[4 7]
 [0 9]]
                 
Accuracy: 0.65
Precision: 0.5625
Recall: 1.0
F1: 0.72
SVM results with AVG word vector
Confusion Matrix : [[5 3]
 [5 7]]
                 
Accuracy: 0.6
Precision: 0.7
Recall: 0.5833333333333334
F1: 0.6363636363636365
SVM Overall with Word2Vec
Acc= 0.64 , std= 0.03741657386773941
f1 score= 0.6937364953886693 , std= 0.03970754092001219
Precision= 0.6108150183150183 , std= 0.0588885259164867
Recalls= 0.8344444444444445 , std= 0.14065104354174807


### SVM cross validation with TFIDF

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
np.random.seed(1)
params = {
        'C': [0.01, 0.1, 1., 10, 50, 100],
        'kernel': ['rbf'],
        'gamma': [0.01, 0.1, 1., 10., 100.]
}

corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels

#### New results

In [0]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
best_params = None
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=300)
    
    tfidf_train_features = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test_features = tfidf_vectorizer.transform(x_test)
    if best_params is None:
        svm = SVC()
        random_search = RandomizedSearchCV(svm, param_distributions=params, 
                                           cv=4, random_state=42)
        random_search.fit(tfidf_train_features, y_train)
        best_params = random_search.best_params_
    
    svm = SVC(random_state=42, C=best_params['C'], 
             kernel=best_params['kernel'],
             gamma=best_params['gamma'])
    predictions = train_predict_evaluate_model(classifier=svm,
                                               train_features=tfidf_train_features, 
                                               train_labels=y_train,
                                               test_features=tfidf_test_features, 
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('SVM Overall with TFIDF')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

Confusion Matrix : [[6 5]
 [3 6]]
                 
Accuracy: 0.6
Precision: 0.5454545454545454
Recall: 0.6666666666666666
F1: 0.6
Confusion Matrix : [[7 3]
 [4 6]]
                 
Accuracy: 0.65
Precision: 0.6666666666666666
Recall: 0.6
F1: 0.631578947368421
Confusion Matrix : [[6 4]
 [4 6]]
                 
Accuracy: 0.6
Precision: 0.6
Recall: 0.6
F1: 0.6
Confusion Matrix : [[5 6]
 [3 6]]
                 
Accuracy: 0.55
Precision: 0.5
Recall: 0.6666666666666666
F1: 0.5714285714285715
Confusion Matrix : [[6 2]
 [3 9]]
                 
Accuracy: 0.75
Precision: 0.8181818181818182
Recall: 0.75
F1: 0.7826086956521738
SVM Overall with TFIDF
Acc= 0.6300000000000001 , std= 0.06782329983125268
f1 score= 0.6371232428898332 , std= 0.07519151064051076
Precision= 0.6260606060606061 , std= 0.11101998099433111
Recalls= 0.6566666666666666 , std= 0.05537749241945384




#### Old results

In [0]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
best_params = None
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=300)
    
    tfidf_train_features = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test_features = tfidf_vectorizer.transform(x_test)
    
    svm = SVC()
    random_search = RandomizedSearchCV(svm, param_distributions=params, 
                                       cv=4, random_state=42)
    random_search.fit(tfidf_train_features, y_train)
    best_params = random_search.best_params_
    
    svm = SVC(random_state=42, C=best_params['C'], 
             kernel=best_params['kernel'],
             gamma=best_params['gamma'])
    predictions = train_predict_evaluate_model(classifier=svm,
                                               train_features=tfidf_train_features, 
                                               train_labels=y_train,
                                               test_features=tfidf_test_features, 
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('SVM Overall with TFIDF')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))




Confusion Matrix : [[6 5]
 [3 6]]
                 
Accuracy: 0.6
Precision: 0.5454545454545454
Recall: 0.6666666666666666
F1: 0.6
Confusion Matrix : [[7 3]
 [4 6]]
                 
Accuracy: 0.65
Precision: 0.6666666666666666
Recall: 0.6
F1: 0.631578947368421
Confusion Matrix : [[6 4]
 [3 7]]
                 
Accuracy: 0.65
Precision: 0.6363636363636364
Recall: 0.7
F1: 0.6666666666666666
Confusion Matrix : [[5 6]
 [3 6]]
                 
Accuracy: 0.55
Precision: 0.5
Recall: 0.6666666666666666
F1: 0.5714285714285715
Confusion Matrix : [[5 3]
 [3 9]]
                 
Accuracy: 0.7
Precision: 0.75
Recall: 0.75
F1: 0.75
SVM Overall with TFIDF
Acc= 0.6300000000000001 , std= 0.05099019513592783
f1 score= 0.6439348370927318 , std= 0.061816909240319934
Precision= 0.6196969696969696 , std= 0.08865902326259269
Recalls= 0.6766666666666666 , std= 0.04898979485566357




## Logistic Regression

In [0]:
# Defining model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

params = {
        'C': [0.01, 0.1, 1., 10, 20, 100],
        'solver': ['liblinear', 'lbfgs'],
        'penalty': ['l2']
}

corpus = df['normalized_text'].values
labels = df['HS'].values

X = [nltk.word_tokenize(text) for text in corpus]
X = np.array(X)
y = labels

### Losgistic Regression cross validation with Word2Vec

#### New results

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

model = gensim.models.Word2Vec(X,
                               size=500, 
                               min_count=2,
                               window=1, 
                               workers=4)
best_params = None
for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    avg_wv_train_features = averaged_word_vectorizer(corpus=x_train, 
                                                     model=model, 
                                                     num_features=500)
    avg_wv_test_features = averaged_word_vectorizer(corpus=x_test, 
                                                    model=model, 
                                                    num_features=500)
    if best_params is None:
        logistic = LogisticRegression(random_state=42)
        random_search = RandomizedSearchCV(logistic, random_state=42,
                                           param_distributions=params, cv=4)
        random_search.fit(avg_wv_train_features, y_train)
        best_params = random_search.best_params_
        
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42)
    # Averaged word vector
    print('Logistic Regression results with AVG word vector')
    # Support Vector Machine with averaged word vector features use word2vec
    predictions = train_predict_evaluate_model(classifier=logistic, 
                                               train_features=avg_wv_train_features, 
                                               train_labels=y_train,
                                               test_features=avg_wv_test_features, 
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('Logistic Overall with Word2Vec')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

W0608 11:35:43.863175 140007620867968 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


Logistic Regression results with AVG word vector
Confusion Matrix : [[ 0 11]
 [ 0  9]]
                 
Accuracy: 0.45
Precision: 0.45
Recall: 1.0
F1: 0.6206896551724138
Logistic Regression results with AVG word vector
Confusion Matrix : [[7 3]
 [3 7]]
                 
Accuracy: 0.7
Precision: 0.7
Recall: 0.7
F1: 0.7
Logistic Regression results with AVG word vector
Confusion Matrix : [[6 4]
 [1 9]]
                 
Accuracy: 0.75
Precision: 0.6923076923076923
Recall: 0.9
F1: 0.7826086956521738
Logistic Regression results with AVG word vector
Confusion Matrix : [[ 0 11]
 [ 0  9]]
                 
Accuracy: 0.45
Precision: 0.45
Recall: 1.0
F1: 0.6206896551724138
Logistic Regression results with AVG word vector
Confusion Matrix : [[ 8  0]
 [12  0]]
                 
Accuracy: 0.4
Precision: 0.0
Recall: 0.0
F1: 0.0
Logistic Overall with Word2Vec
Acc= 0.55 , std= 0.14491376746189435
f1 score= 0.5447976011994002 , std= 0.2789141571248734
Precision= 0.4584615384615384 , std= 0.25430494670

  del sys.path[0]
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Old results

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

model = gensim.models.Word2Vec(X,
                               size=500, 
                               min_count=2,
                               window=1, 
                               workers=4)
for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    avg_wv_train_features = averaged_word_vectorizer(corpus=x_train, 
                                                     model=model, 
                                                     num_features=500)
    avg_wv_test_features = averaged_word_vectorizer(corpus=x_test, 
                                                    model=model, 
                                                    num_features=500)
    
    logistic = LogisticRegression(random_state=42)
    random_search = RandomizedSearchCV(logistic, random_state=42,
                                       param_distributions=params, cv=4)
    random_search.fit(avg_wv_train_features, y_train)
    best_params = random_search.best_params_
        
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42)
    # Averaged word vector
    print('Logistic Regression results with AVG word vector')
    # Support Vector Machine with averaged word vector features use word2vec
    predictions = train_predict_evaluate_model(classifier=logistic, 
                                               train_features=avg_wv_train_features, 
                                               train_labels=y_train,
                                               test_features=avg_wv_test_features, 
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('Logistic Overall with Word2Vec')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

  del sys.path[0]


Logistic Regression results with AVG word vector
Confusion Matrix : [[ 0 11]
 [ 0  9]]
                 
Accuracy: 0.45
Precision: 0.45
Recall: 1.0
F1: 0.6206896551724138
Logistic Regression results with AVG word vector
Confusion Matrix : [[8 2]
 [3 7]]
                 
Accuracy: 0.75
Precision: 0.7777777777777778
Recall: 0.7
F1: 0.7368421052631577


  del sys.path[0]
  del sys.path[0]


Logistic Regression results with AVG word vector
Confusion Matrix : [[4 6]
 [1 9]]
                 
Accuracy: 0.65
Precision: 0.6
Recall: 0.9
F1: 0.7200000000000001
Logistic Regression results with AVG word vector


  del sys.path[0]


Confusion Matrix : [[ 0 11]
 [ 0  9]]
                 
Accuracy: 0.45
Precision: 0.45
Recall: 1.0
F1: 0.6206896551724138
Logistic Regression results with AVG word vector
Confusion Matrix : [[ 8  0]
 [12  0]]
                 
Accuracy: 0.4
Precision: 0.0
Recall: 0.0
F1: 0.0
Logistic Overall with Word2Vec
Acc= 0.54 , std= 0.13564659966250536
f1 score= 0.5396442831215971 , std= 0.27414150456858233
Precision= 0.4555555555555556 , std= 0.257792145193481
Recalls= 0.72 , std= 0.3762977544445356


  del sys.path[0]
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Logistic cross validation with TFIDF

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
np.random.seed(1)
params = {
        'C': [0.01, 0.1, 1., 10, 20, 100],
        'solver': ['liblinear', 'lbfgs'],
        'penalty': ['l2']
}
corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels

#### New results

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

best_params = None
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=300)
    
    tfidf_train_features = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test_features = tfidf_vectorizer.transform(x_test)
    
    if best_params is None:
        logistic = LogisticRegression(random_state=42)
        random_search = RandomizedSearchCV(logistic, param_distributions=params, 
                                           cv=4, random_state=42)
        random_search.fit(tfidf_train_features, y_train)
        best_params = random_search.best_params_
        print(best_params)
    
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42)
    predictions = train_predict_evaluate_model(classifier=logistic, 
                                               train_features=tfidf_train_features, 
                                               train_labels=y_train,
                                               test_features=tfidf_test_features, 
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('Ligistic Regression Overall with TFIDF')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))


{'solver': 'liblinear', 'penalty': 'l2', 'C': 100}
Confusion Matrix : [[6 5]
 [2 7]]
                 
Accuracy: 0.65
Precision: 0.5833333333333334
Recall: 0.7777777777777778
F1: 0.6666666666666666
Confusion Matrix : [[8 2]
 [3 7]]
                 
Accuracy: 0.75
Precision: 0.7777777777777778
Recall: 0.7
F1: 0.7368421052631577
Confusion Matrix : [[6 4]
 [2 8]]
                 
Accuracy: 0.7
Precision: 0.6666666666666666
Recall: 0.8
F1: 0.7272727272727272
Confusion Matrix : [[7 4]
 [2 7]]
                 
Accuracy: 0.7
Precision: 0.6363636363636364
Recall: 0.7777777777777778
F1: 0.7000000000000001
Confusion Matrix : [[ 4  4]
 [ 2 10]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.8333333333333334
F1: 0.7692307692307692
Ligistic Regression Overall with TFIDF
Acc= 0.7 , std= 0.031622776601683784
f1 score= 0.7200024536866642 , std= 0.0346544719338603
Precision= 0.6756854256854257 , std= 0.0664368583352485
Recalls= 0.7777777777777778 , std= 0.04388537257362558




#### Old results

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

best_params = None
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=300)
    
    tfidf_train_features = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test_features = tfidf_vectorizer.transform(x_test)
    
    logistic = LogisticRegression(random_state=42)
    random_search = RandomizedSearchCV(logistic, param_distributions=params, 
                                       cv=4, random_state=42)
    random_search.fit(tfidf_train_features, y_train)
    best_params = random_search.best_params_
    print(best_params)
    
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42)
    predictions = train_predict_evaluate_model(classifier=logistic, 
                                               train_features=tfidf_train_features, 
                                               train_labels=y_train,
                                               test_features=tfidf_test_features, 
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('Ligistic Regression Overall with TFIDF')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))




{'solver': 'liblinear', 'C': 100}
Confusion Matrix : [[6 5]
 [2 7]]
                 
Accuracy: 0.65
Precision: 0.5833333333333334
Recall: 0.7777777777777778
F1: 0.6666666666666666
{'solver': 'lbfgs', 'C': 10}
Confusion Matrix : [[7 3]
 [3 7]]
                 
Accuracy: 0.7
Precision: 0.7
Recall: 0.7
F1: 0.7
{'solver': 'liblinear', 'C': 10}
Confusion Matrix : [[6 4]
 [2 8]]
                 
Accuracy: 0.7
Precision: 0.6666666666666666
Recall: 0.8
F1: 0.7272727272727272




{'solver': 'lbfgs', 'C': 10}
Confusion Matrix : [[7 4]
 [3 6]]
                 
Accuracy: 0.65
Precision: 0.6
Recall: 0.6666666666666666
F1: 0.631578947368421
{'solver': 'lbfgs', 'C': 100}
Confusion Matrix : [[ 4  4]
 [ 2 10]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.8333333333333334
F1: 0.7692307692307692
Ligistic Regression Overall with TFIDF
Acc= 0.6799999999999999 , std= 0.024494897427831747
f1 score= 0.6989498221077168 , std= 0.047589364878035056
Precision= 0.6528571428571428 , std= 0.05256245610113001
Recalls= 0.7555555555555555 , std= 0.062459863655800904




## LSTM with TFIDF and Word2Vec

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM
import keras

callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='lstm_w2vmodel_best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_acc', save_best_only=True),
#     keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)
]

LSTM_DIM = 512 # total LSTM units

class LSTM_No_MC():
    def __init__(self, vocab_size=None, embedding_matrix=None):
        self._build_LSTM_model(vocab_size, embedding_matrix)

    def _build_LSTM_model(self, vocab_size=None, embedding_matrix=None):
        self.model = Sequential() 
        if vocab_size is not None:
            self.model.add(Embedding(vocab_size,
                                    LSTM_DIM, 
                                    weights=[embedding_matrix],
                                    trainable=True))
        self.model.add(LSTM(LSTM_DIM,
                       recurrent_dropout=0.2,
                       activation='tanh'))
                           
        self.model.add(Dense(LSTM_DIM, activation='tanh')) 
        self.model.add(Dropout(0.1))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', 
                            optimizer='adam', 
                            metrics=['accuracy'])
        return self
    
    def fit(self, train_X, train_y):
        self.model.fit(train_X, train_y, 
                       epochs=15, 
                       batch_size=10, 
                       shuffle=True, 
                       verbose=0,
#                        callbacks=callbacks_list,
#                        validation_split=0.1
                      )

    def predict(self, x_test):
        predictions = self.model.predict(x_test)
        return predictions
    
    def evaluate(self, x_test, y_test):
        predictions = self.predict(x_test)
        predictions[predictions >= 0.5] = 1
        predictions[predictions < 0.5] = 0
    
        acc, f1, recall, precision = accuracy_report(y_test, predictions)
        return acc, f1, recall, precision

### Word2Vec

In [0]:
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
from keras.preprocessing.sequence import pad_sequences
np.random.seed(1)

corpus = df['normalized_text'].values
labels = df['HS'].values

kf = KFold(n_splits=5, random_state=42, shuffle=True)

MAX_NB_WORDS = 512
MAX_SEQUENCE_LENGTH = 500
EMBEDDING_DIM = 512

texts = corpus
sentences_as_words = []
for sent in texts:
    temp = sent.split()
    sentences_as_words.append(temp)
    
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

w2vmodel = gensim.models.Word2Vec(sentences=sentences_as_words,
                                   size=EMBEDDING_DIM, 
                                   workers=4,
                                   min_count=2,
                                   window=1)

W0329 02:47:05.782345 139832299702144 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


[[48, 5, 9, 29, 20], [5, 15, 157, 76, 77, 158, 159, 78, 3, 36, 48, 5], [25, 11, 160, 16, 161, 3, 162, 163, 164, 49, 79, 25, 165, 3, 166, 167], [168, 30, 8, 169, 3, 1, 4, 50, 8, 80, 3, 36], [11, 6, 170, 10, 13, 37, 171], [172, 173, 174, 175, 176, 177, 51, 178, 10, 81, 82, 179, 6, 6], [31, 180, 181, 6, 8], [52, 3, 36, 8], [2, 6, 3, 38, 83], [49, 53, 3, 182, 183, 184, 185, 186, 187, 188, 13, 84, 189], [39, 1, 85, 10, 190, 86, 191, 1, 8, 17, 54, 31, 55, 192], [5, 21, 8, 193], [5, 21, 194, 8, 29, 20, 17, 195, 3, 196], [197, 10, 87, 198], [15, 56, 56, 10, 199, 3, 200], [40, 88, 201, 11, 202, 86, 21, 13, 57, 17, 54], [1, 203, 2, 17, 54, 10, 10, 29], [1, 6, 58, 5, 204, 205, 206, 59, 207, 208, 209, 60, 210], [5, 211, 212, 38, 213, 214, 215, 61], [10, 216, 217, 38, 10, 89, 90, 218, 1, 17, 62, 79, 219], [8, 76, 220, 221, 91, 222, 62, 6, 223], [1, 6, 224, 13, 225, 226, 1, 6, 58, 38, 58], [22, 91, 227, 63, 228, 1, 229, 10, 29], [6, 230, 231, 92, 232, 233, 93, 10, 234, 235, 5, 236, 32, 237], [238, 2

In [0]:
X = data
y = labels

In [0]:
embedding_matrix = np.zeros((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = w2vmodel[word]
    except:
        pass
    try:
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector
    except:
        pass

  after removing the cwd from sys.path.


In [0]:
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
np.random.seed(1)

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = LSTM_No_MC(vocab_size=MAX_SEQUENCE_LENGTH,
                      embedding_matrix=embedding_matrix)
    model.fit(x_train, y_train)

    print('*'*10)
    print('Evaluation model ', count + 1, '/5')
    count += 1

    acc, f1, recall, precision = model.evaluate(x_test, y_test)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)

print('LSTM Overall with Word2Vec')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

**********
Evaluation model  1 /5
Confusion Matrix : [[9 2]
 [4 5]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.5555555555555556
F1: 0.6250000000000001
**********
Evaluation model  2 /5
Confusion Matrix : [[7 6]
 [3 4]]
                 
Accuracy: 0.55
Precision: 0.4
Recall: 0.5714285714285714
F1: 0.47058823529411764
**********
Evaluation model  3 /5
Confusion Matrix : [[11  3]
 [ 2  4]]
                 
Accuracy: 0.75
Precision: 0.5714285714285714
Recall: 0.6666666666666666
F1: 0.6153846153846153
**********
Evaluation model  4 /5
Confusion Matrix : [[ 4  2]
 [ 4 10]]
                 
Accuracy: 0.7
Precision: 0.8333333333333334
Recall: 0.7142857142857143
F1: 0.7692307692307692
**********
Evaluation model  5 /5
Confusion Matrix : [[4 2]
 [5 9]]
                 
Accuracy: 0.65
Precision: 0.8181818181818182
Recall: 0.6428571428571429
F1: 0.7200000000000001
LSTM Overall with Word2Vec
Acc= 0.67 , std= 0.06782329983125265
f1 score= 0.6400407239819005 , std= 0.1

### TFIDF

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
np.random.seed(1)
corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels
kf = KFold(n_splits=5, random_state=42, shuffle=True)

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=300)
X = tfidf_vectorizer.fit_transform(X)
X = X.todense()
X = np.asarray(X)

In [0]:
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    x_train = np.reshape(x_train,
                        (x_train.shape[0], 1, 
                         x_train.shape[1]))
    x_test = np.reshape(x_test, 
                        (x_test.shape[0], 1, 
                         x_test.shape[1]))

    model = LSTM_No_MC()
    model.fit(x_train, y_train)

    print('*'*10)
    print('Evaluation model ', count + 1, '/5')
    count += 1

    acc, f1, recall, precision = model.evaluate(x_test, y_test)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
print('LSTM Overall with TF-IDF')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

**********
Evaluation model  1 /5
Confusion Matrix : [[5 6]
 [2 7]]
                 
Accuracy: 0.6
Precision: 0.5384615384615384
Recall: 0.7777777777777778
F1: 0.6363636363636364
**********
Evaluation model  2 /5
Confusion Matrix : [[9 1]
 [3 7]]
                 
Accuracy: 0.8
Precision: 0.875
Recall: 0.7
F1: 0.7777777777777777
**********
Evaluation model  3 /5
Confusion Matrix : [[6 4]
 [1 9]]
                 
Accuracy: 0.75
Precision: 0.6923076923076923
Recall: 0.9
F1: 0.7826086956521738
**********
Evaluation model  4 /5
Confusion Matrix : [[9 2]
 [5 4]]
                 
Accuracy: 0.65
Precision: 0.6666666666666666
Recall: 0.4444444444444444
F1: 0.5333333333333333
**********
Evaluation model  5 /5
Confusion Matrix : [[4 4]
 [3 9]]
                 
Accuracy: 0.65
Precision: 0.6923076923076923
Recall: 0.75
F1: 0.7199999999999999
LSTM Overall with TF-IDF
Acc= 0.69 , std= 0.07348469228349536
f1 score= 0.6900166886253841 , std= 0.09444851649234885
Precision= 0.6929487179487179 , std=

## LSTM Dropout

### Word2Vec

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM
from keras.regularizers import l2
import keras.backend as K
np.random.seed(1)
LSTM_DIM = 512 # total LSTM units

callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='Lstm_MC_w2vmodel_best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_acc', save_best_only=True),
#     keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)
]
class LSTM_MC():
    def __init__(self, vocab_size=None, embedding_matrix=None):
        self._build_LSTM_model(vocab_size, embedding_matrix)

    def _build_LSTM_model(self, vocab_size=None, embedding_matrix=None):
        self.model = Sequential()
        if vocab_size is not None:
            self.model.add(Embedding(vocab_size,
                                    LSTM_DIM, weights=[embedding_matrix],
                                    trainable=True))
        self.model.add(LSTM(LSTM_DIM,
                            recurrent_dropout=0.2,
                            kernel_regularizer=l2(1e-4),
                            bias_regularizer=l2(1e-4),
                            dropout=0.5,
                           activation='tanh'))
        self.model.add(Dense(LSTM_DIM, activation='tanh',
                            kernel_regularizer=l2(1e-4),
                            bias_regularizer=l2(1e-4)
                            ))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(1, kernel_regularizer=l2(1e-4),
                             activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', 
                            optimizer='adam', 
                            metrics=['accuracy'])
        return self
    
    def fit(self, train_X, train_y):
        self.model.fit(train_X, train_y, 
                       epochs=25, 
                       batch_size=10, 
                       shuffle=True, 
                       verbose=0,
                      )

    def predict(self, x_test):
        predictions = self.model.predict(x_test)
        return predictions
    
    def evaluate(self, test_X, y_test):
        T = 100
        predict_stochastic = K.function([self.model.layers[0].input,
                                        K.learning_phase()],
                                        [self.model.layers[-1].output])
#         Yt_hat = np.array([predict_stochastic([test_X, 1]) for _ in range(T)])
        Yt_hat = np.array([self.model.predict(test_X) for _ in range(T)])
        MC_pred = np.mean(Yt_hat, axis=0)
        MC_pred = MC_pred.reshape(-1, 1)
        MC_pred[MC_pred >= 0.5] = 1
        MC_pred[MC_pred < 0.5] = 0
    
        acc, f1, recall, precision = accuracy_report(y_test, 
                                                     MC_pred)
        return acc, f1, recall, precision

In [0]:
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
from keras.preprocessing.sequence import pad_sequences
np.random.seed(1)

corpus = df['normalized_text'].values
labels = df['HS'].values

kf = KFold(n_splits=5, random_state=42, shuffle=True)

MAX_NB_WORDS = 512
MAX_SEQUENCE_LENGTH = 512
EMBEDDING_DIM = 512

texts = corpus
sentences_as_words = []
for sent in texts:
    temp = sent.split()
    sentences_as_words.append(temp)
    
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

w2vmodel = gensim.models.Word2Vec(sentences=sentences_as_words,
                                  size=EMBEDDING_DIM, 
                                  workers=4,
                                  min_count=2,
                                  window=1)

W0329 04:53:14.105977 139832299702144 base_any2vec.py:1386] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


Found 670 unique tokens.
Shape of data tensor: (100, 512)
Shape of label tensor: (100,)


In [0]:
X = data
y = labels

In [0]:
embedding_matrix = np.zeros((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
for word, i in word_index.items():
    try:
        embedding_vector = w2vmodel[word]
    except:
        pass
    try:
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector
    except:
        pass

  after removing the cwd from sys.path.


In [0]:
print(embedding_matrix)

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 9.87375970e-04 -9.85599589e-04  7.90039427e-04 ... -6.06902060e-04
   1.53743866e-04 -4.28821921e-04]
 [-1.58300652e-04 -1.46128450e-05  4.51093976e-04 ...  1.08289336e-04
  -4.40097443e-04 -6.23544503e-04]
 ...
 [-6.16504520e-04 -4.46031481e-04 -3.23501736e-05 ...  6.12647971e-04
  -2.90823606e-04  7.96247507e-04]
 [-6.16504520e-04 -4.46031481e-04 -3.23501736e-05 ...  6.12647971e-04
  -2.90823606e-04  7.96247507e-04]
 [-6.16504520e-04 -4.46031481e-04 -3.23501736e-05 ...  6.12647971e-04
  -2.90823606e-04  7.96247507e-04]]


In [0]:
np.random.seed(1)
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = LSTM_MC(vocab_size=MAX_SEQUENCE_LENGTH, 
                    embedding_matrix=embedding_matrix)
#     print(model.model.summary())
    model.fit(x_train, y_train)
    
    print('*'*10)
    print('Evaluation model ', count + 1, '/5')
    count += 1

    acc, f1, recall, precision = model.evaluate(x_test, y_test)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)

print('LSTM MC Dropout Overall with Word2Vec')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

**********
Evaluation model  1 /5
Confusion Matrix : [[8 3]
 [3 6]]
                 
Accuracy: 0.7
Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1: 0.6666666666666666
**********
Evaluation model  2 /5
Confusion Matrix : [[7 6]
 [2 5]]
                 
Accuracy: 0.6
Precision: 0.45454545454545453
Recall: 0.7142857142857143
F1: 0.5555555555555556
**********
Evaluation model  3 /5
Confusion Matrix : [[11  3]
 [ 1  5]]
                 
Accuracy: 0.8
Precision: 0.625
Recall: 0.8333333333333334
F1: 0.7142857142857143
**********
Evaluation model  4 /5
Confusion Matrix : [[ 4  2]
 [ 4 10]]
                 
Accuracy: 0.7
Precision: 0.8333333333333334
Recall: 0.7142857142857143
F1: 0.7692307692307692
**********
Evaluation model  5 /5
Confusion Matrix : [[4 2]
 [5 9]]
                 
Accuracy: 0.65
Precision: 0.8181818181818182
Recall: 0.6428571428571429
F1: 0.7200000000000001
LSTM MC Dropout Overall with Word2Vec
Acc= 0.69 , std= 0.06633249580710801
f1 score= 0.6851477411477412

### TFIDF

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM
from keras.regularizers import l2
import keras.backend as K
np.random.seed(1)
LSTM_DIM = 512 # total LSTM units

callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='Lstm_MC_w2vmodel_best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_acc', save_best_only=True),
#     keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)
]
class LSTM_MC():
    def __init__(self, vocab_size=None, embedding_matrix=None):
        self._build_LSTM_model(vocab_size, embedding_matrix)

    def _build_LSTM_model(self, vocab_size=None, embedding_matrix=None):
        self.model = Sequential()
        if vocab_size is not None:
            self.model.add(Embedding(vocab_size,
                                    LSTM_DIM, weights=[embedding_matrix],
                                    trainable=True))
        self.model.add(LSTM(LSTM_DIM,
                            recurrent_dropout=0.2,
                            kernel_regularizer=l2(1e-4),
                            bias_regularizer=l2(1e-4),
                            dropout=0.5,
                           activation='tanh'))
        self.model.add(Dense(LSTM_DIM, activation='tanh',
                            kernel_regularizer=l2(1e-4),
                            bias_regularizer=l2(1e-4)
                            ))
        self.model.add(Dropout(rate=0.5))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', 
                            optimizer='adam', 
                            metrics=['accuracy'])
        return self
    
    def fit(self, train_X, train_y):
        self.model.fit(train_X, train_y, 
                       epochs=20, 
                       batch_size=10, 
                       shuffle=True, 
                       verbose=0,
#                        callbacks=callbacks_list
                      )

    def predict(self, x_test):
        predictions = self.model.predict(x_test)
        return predictions
    
    def evaluate(self, test_X, y_test):
        T = 100
        predict_stochastic = K.function([self.model.layers[0].input,
                                        K.learning_phase()],
                                        [self.model.layers[-1].output])
#         Yt_hat = np.array([predict_stochastic([test_X, 1]) for _ in range(T)])
        Yt_hat = np.array([self.model.predict(test_X) for _ in range(T)])
        MC_pred = np.mean(Yt_hat, axis=0)
        MC_pred = MC_pred.reshape(-1, 1)
        MC_pred[MC_pred >= 0.5] = 1
        MC_pred[MC_pred < 0.5] = 0
    
        acc, f1, recall, precision = accuracy_report(y_test, 
                                                     MC_pred)
        return acc, f1, recall, precision

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
np.random.seed(1)

corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels
kf = KFold(n_splits=5, random_state=42, shuffle=True)


tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, 
                                   max_features=300)

X = tfidf_vectorizer.fit_transform(X)
X = X.todense()
X = np.asarray(X)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    x_train = np.reshape(x_train,
                        (x_train.shape[0], 1, 
                         x_train.shape[1]))

    x_test = np.reshape(x_test, (x_test.shape[0], 1, 
                                 x_test.shape[1]))

    model = LSTM_MC()
    model.fit(x_train, y_train)

    # Prediction on the test set.
    print('*'*10)
    print('Evaluation model ', count + 1, '/5')
    count += 1

    acc, f1, recall, precision = model.evaluate(x_test, y_test)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)

print('LSTM MC Dropout Overall with TF-IDF')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

**********
Evaluation model  1 /5
Confusion Matrix : [[5 6]
 [2 7]]
                 
Accuracy: 0.6
Precision: 0.5384615384615384
Recall: 0.7777777777777778
F1: 0.6363636363636364
**********
Evaluation model  2 /5
Confusion Matrix : [[7 3]
 [1 9]]
                 
Accuracy: 0.8
Precision: 0.75
Recall: 0.9
F1: 0.8181818181818182
**********
Evaluation model  3 /5
Confusion Matrix : [[5 5]
 [1 9]]
                 
Accuracy: 0.7
Precision: 0.6428571428571429
Recall: 0.9
F1: 0.75
**********
Evaluation model  4 /5
Confusion Matrix : [[10  1]
 [ 4  5]]
                 
Accuracy: 0.75
Precision: 0.8333333333333334
Recall: 0.5555555555555556
F1: 0.6666666666666667
**********
Evaluation model  5 /5
Confusion Matrix : [[2 6]
 [4 8]]
                 
Accuracy: 0.5
Precision: 0.5714285714285714
Recall: 0.6666666666666666
F1: 0.6153846153846153
LSTM MC Dropout Overall with TF-IDF
Acc= 0.6699999999999999 , std= 0.1077032961426901
f1 score= 0.6973193473193473 , std= 0.07583069807982645
Precision= 

# SVM and Logistic with ELMo

## Define ELMo

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import tensorflow_hub as hub
import tensorflow as tf
np.random.seed(1)

elmo = hub.Module("https://tfhub.dev/google/elmo/2")

In [0]:
def get_embedding_from_elmo(words_to_embed, elmo=elmo):
#     embedding_tensor = elmo(words_to_embed) # <-- removed other params
    embeddings = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for w in words_to_embed:
            try:
                embedding = sess.run(elmo(w))
                embeddings.append(embedding)
            except:
                print(w)
    sess.close() 
    return np.array(embeddings)

In [0]:
words_to_embed = [["dog", "cat"], ["sloth"]] 
embeddings = get_embedding_from_elmo(words_to_embed)
print(embeddings)
print(embeddings.shape)

[array([[ 0.62332994, -0.69219965,  0.46805114, ...,  0.06314384,
        -0.18220183,  0.40788883],
       [ 0.92854553, -0.10769853, -0.08331862, ...,  0.17715633,
        -0.26392198,  0.6423401 ]], dtype=float32)
 array([[0.380059  , 0.16009209, 0.5363406 , ..., 0.10853949, 0.37611425,
        0.23169182]], dtype=float32)]
(2,)


## Cross-Validation for Logistic Regression

In [0]:
corpus = df['normalized_text'].values
labels = df['HS'].values

dataset = corpus.copy()
y = labels

embeddings = elmo(
    corpus.copy(),
    signature="default",
    as_dict=True)["default"]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    X = sess.run(embeddings)
sess.close()
print(X)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0608 11:38:56.306629 140007620867968 saver.py:1483] Saver not created because there are no variables in the graph to restore


[[ 0.14557736 -0.6152045   0.7554845  ... -0.11874242  0.35724452
   0.01485758]
 [ 0.08743453 -0.43605867  0.23102832 ... -0.07607505  0.06322032
   0.04236133]
 [ 0.29741332 -0.3100091   0.10979225 ...  0.1368657   0.22820032
   0.3504001 ]
 ...
 [-0.25952473 -0.07544519  0.09260989 ...  0.23549713  0.4060844
  -0.0263662 ]
 [ 0.32088628  0.11954337  0.16007017 ...  0.0176918  -0.23178959
   0.1458702 ]
 [-0.4608928  -0.7838666   0.7523269  ...  0.19918656  0.07722344
  -0.14978744]]


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
np.random.seed(1)
params = {
        'C': [0.05, 0.1, 1., 10, 50, 100],
        'solver': ['lbfgs', 'liblinear'],
#         'penalty': ['l2']
}
y = labels

#### New results

In [0]:
np.random.seed(1)
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
X_data = X.copy()

kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)

best_params = None
for train_index, test_index in kf.split(X_data, y):
    x_train, x_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    if best_params is None:
        logistic = LogisticRegression(random_state=42,
                                      max_iter=200)
        random_search = RandomizedSearchCV(logistic, 
                                           param_distributions=params, 
                                           cv=4, 
                                           random_state=42)
        random_search.fit(x_train, 
                          y_train)
        best_params = random_search.best_params_
        print(best_params)
        
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], 
#                                  penalty=params['penalty'],
                                 random_state=42, 
                                 max_iter=200)
    predictions = train_predict_evaluate_model(classifier=logistic, 
                                               train_features=x_train,
                                               train_labels=y_train,
                                               test_features=x_test,
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
print('Ligistic Regression Overall with ELMo')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))



{'solver': 'lbfgs', 'C': 50}
Confusion Matrix : [[8 3]
 [2 7]]
                 
Accuracy: 0.75
Precision: 0.7
Recall: 0.7777777777777778
F1: 0.7368421052631577
Confusion Matrix : [[7 3]
 [4 6]]
                 
Accuracy: 0.65
Precision: 0.6666666666666666
Recall: 0.6
F1: 0.631578947368421
Confusion Matrix : [[5 5]
 [2 8]]
                 
Accuracy: 0.65
Precision: 0.6153846153846154
Recall: 0.8
F1: 0.6956521739130435
Confusion Matrix : [[4 7]
 [3 6]]
                 
Accuracy: 0.5
Precision: 0.46153846153846156
Recall: 0.6666666666666666
F1: 0.5454545454545455
Confusion Matrix : [[5 3]
 [3 9]]
                 
Accuracy: 0.7
Precision: 0.75
Recall: 0.75
F1: 0.75
Ligistic Regression Overall with ELMo
Acc= 0.65 , std= 0.08366600265340755
f1 score= 0.6719055543998336 , std= 0.07549237432731189
Precision= 0.6387179487179487 , std= 0.0988507265562419
Recalls= 0.7188888888888889 , std= 0.0746679894062731


#### Old results

In [0]:
np.random.seed(1)
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
X_data = X.copy()

kf = KFold(n_splits=5, random_state=42, shuffle=True)

best_results = None
for train_index, test_index in kf.split(X_data, y):
    x_train, x_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    logistic = LogisticRegression(random_state=42, max_iter=200)
    random_search = RandomizedSearchCV(logistic, param_distributions=params, 
                                       cv=4, random_state=42)
    random_search.fit(x_train, y_train)
    best_params = random_search.best_params_
        
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42, max_iter=200)
    predictions = train_predict_evaluate_model(classifier=logistic, 
                                               train_features=x_train,
                                               train_labels=y_train,
                                               test_features=x_test,
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
print('Ligistic Regression Overall with ELMo')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))



Confusion Matrix : [[8 3]
 [4 5]]
                 
Accuracy: 0.65
Precision: 0.625
Recall: 0.5555555555555556
F1: 0.5882352941176471




Confusion Matrix : [[7 3]
 [3 7]]
                 
Accuracy: 0.7
Precision: 0.7
Recall: 0.7
F1: 0.7




Confusion Matrix : [[5 5]
 [2 8]]
                 
Accuracy: 0.65
Precision: 0.6153846153846154
Recall: 0.8
F1: 0.6956521739130435




Confusion Matrix : [[4 7]
 [3 6]]
                 
Accuracy: 0.5
Precision: 0.46153846153846156
Recall: 0.6666666666666666
F1: 0.5454545454545455




Confusion Matrix : [[5 3]
 [5 7]]
                 
Accuracy: 0.6
Precision: 0.7
Recall: 0.5833333333333334
F1: 0.6363636363636365
Ligistic Regression Overall with ELMo
Acc= 0.62 , std= 0.06782329983125268
f1 score= 0.6331411299697745 , std= 0.060155789183716925
Precision= 0.6203846153846154 , std= 0.08712705814128596
Recalls= 0.6611111111111111 , std= 0.08720629720154925




## Cross-Validation for SVM

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
np.random.seed(1)
params = {
        'C': [0.01, 0.1, 1., 10, 50, 100],
        'kernel': ['rbf'],
        'gamma': [0.01, 0.1, 1., 10., 100.]
}

#### New results

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
X_data = X.copy()

best_params = None

for train_index, test_index in kf.split(X_data, y):
    x_train, x_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y[train_index], y[test_index]

    if best_params is None:
        svm = SVC(random_state=42)
        random_search = RandomizedSearchCV(svm, param_distributions=params, 
                                           cv=4, random_state=42)
        random_search.fit(x_train, y_train)
        best_params = random_search.best_params_
        
    svm = SVC(kernel=best_params['kernel'], 
              C=best_params['C'],
              gamma=best_params['gamma'],
              random_state=42)
    predictions = train_predict_evaluate_model(classifier=svm, 
                                               train_features=x_train,
                                               train_labels=y_train,
                                               test_features=x_test,
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('SVM Overall with ELMo')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

Confusion Matrix : [[8 3]
 [1 8]]
                 
Accuracy: 0.8
Precision: 0.7272727272727273
Recall: 0.8888888888888888
F1: 0.7999999999999999
Confusion Matrix : [[7 3]
 [4 6]]
                 
Accuracy: 0.65
Precision: 0.6666666666666666
Recall: 0.6
F1: 0.631578947368421
Confusion Matrix : [[4 6]
 [2 8]]
                 
Accuracy: 0.6
Precision: 0.5714285714285714
Recall: 0.8
F1: 0.6666666666666666
Confusion Matrix : [[4 7]
 [4 5]]
                 
Accuracy: 0.45
Precision: 0.4166666666666667
Recall: 0.5555555555555556
F1: 0.4761904761904762
Confusion Matrix : [[4 4]
 [4 8]]
                 
Accuracy: 0.6
Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1: 0.6666666666666666
SVM Overall with ELMo
Acc= 0.6200000000000001 , std= 0.11224972160321826
f1 score= 0.648220551378446 , std= 0.10352662374048581
Precision= 0.6097402597402597 , std= 0.10866490725069582
Recalls= 0.7022222222222221 , std= 0.12460307350112165




#### Old results

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)

count = 0
mean_acc = []
f1s = []
precis = []
recalls = []
X_data = X.copy()
for train_index, test_index in kf.split(X_data, y):
    x_train, x_test = X_data[train_index], X_data[test_index]
    y_train, y_test = y[train_index], y[test_index]

    svm = SVC(random_state=42)
    random_search = RandomizedSearchCV(svm, param_distributions=params, 
                                       cv=4, random_state=42)
    random_search.fit(x_train, y_train)
    best_params = random_search.best_params_
    print(best_params)
    svm = SVC(kernel=best_params['kernel'], 
              C=best_params['C'],
              gamma=best_params['gamma'],
              random_state=42)
    predictions = train_predict_evaluate_model(classifier=svm, 
                                               train_features=x_train,
                                               train_labels=y_train,
                                               test_features=x_test,
                                               test_labels=y_test)
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    
    
print('SVM Overall with ELMo')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))



{'kernel': 'rbf', 'gamma': 1.0, 'C': 10}
Confusion Matrix : [[ 0 11]
 [ 0  9]]
                 
Accuracy: 0.45
Precision: 0.45
Recall: 1.0
F1: 0.6206896551724138
{'kernel': 'rbf', 'gamma': 0.01, 'C': 5}
Confusion Matrix : [[7 3]
 [3 7]]
                 
Accuracy: 0.7
Precision: 0.7
Recall: 0.7
F1: 0.7
{'kernel': 'rbf', 'gamma': 0.01, 'C': 200}
Confusion Matrix : [[5 5]
 [2 8]]
                 
Accuracy: 0.65
Precision: 0.6153846153846154
Recall: 0.8
F1: 0.6956521739130435




{'kernel': 'rbf', 'gamma': 0.01, 'C': 200}
Confusion Matrix : [[5 6]
 [4 5]]
                 
Accuracy: 0.5
Precision: 0.45454545454545453
Recall: 0.5555555555555556
F1: 0.5
{'kernel': 'rbf', 'gamma': 'auto', 'C': 10}
Confusion Matrix : [[ 5  3]
 [ 1 11]]
                 
Accuracy: 0.8
Precision: 0.7857142857142857
Recall: 0.9166666666666666
F1: 0.8461538461538461
SVM Overall with ELMo
Acc= 0.6199999999999999 , std= 0.12884098726725127
f1 score= 0.6724991350478607 , std= 0.11303848306513685
Precision= 0.6011288711288711 , std= 0.1329489641570481
Recalls= 0.7944444444444444 , std= 0.15697762677732766




# LSTM MC Dropout with ELMo

## Define ELMo Embedding model

In [0]:
import keras
import logging
# Import our dependencies
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np
from keras.layers import LSTM, Dropout
from keras.layers import Embedding, Dense, TimeDistributed
from keras.layers import Lambda
from keras.layers.merge import add
from keras.regularizers import l2
np.random.seed(1)
tf.get_logger().setLevel(logging.ERROR)
# Initialize session
sess = tf.Session()
K.set_session(sess)

Using TensorFlow backend.
W0329 10:56:16.215198 140222058874752 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [0]:
def ELMoEmbedding(x):
    elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", 
                            trainable=True)
    return elmo_model(tf.squeeze(tf.cast(x, tf.string)), 
                      signature='default',
                      as_dict=True)["elmo"]

In [0]:
class ElmoEmbeddingLayer(keras.engine.Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable = True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', 
                               trainable=self.trainable, 
                               name="{}_module".format(self.name))
        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)
    
    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['elmo']
        return result
    
    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

## Define LSTM with MC Dropout 

In [0]:
def build_LSTM_MC():
    input_text = layers.Input(shape=(1,), dtype='string')
    embedding = layers.Lambda(ELMoEmbedding, output_shape=(None, 1024))(input_text) 
    x = LSTM(units=1024, 
             kernel_regularizer=l2(1e-4),
             bias_regularizer=l2(1e-4),
             return_sequences=False,
             recurrent_dropout=0.5, 
             dropout=0.2,
             activation='tanh')(embedding)
    dense = Dense(1024, 
                  kernel_regularizer=l2(1e-4),
                  bias_regularizer=l2(1e-4),
                  activation='tanh')(x)
    dropout = Dropout(0.5)(dense)
    out = Dense(1, activation="sigmoid")(dropout)
    model = Model(input_text, out)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics= ['acc'])
    return model

    
def lstm_fit(model, X_train_pad,y_train):
    model.fit(X_train_pad,
              y_train,
              shuffle=True,
              epochs=25,
              batch_size=4,
              verbose=0
             )
    return model

def lstm_predict(model, X_test_pad, y_test):
    T = 500
    predict_stochastic = K.function([model.layers[0].input,
                                    K.learning_phase()],
                                    [model.layers[-1].output])
    Yt_hat = np.array([predict_stochastic([X_test_pad, 1]) for _ in range(T)])
    MC_pred = np.mean(Yt_hat, axis=0)

    MC_pred = MC_pred.reshape(-1, 1)

    MC_pred[MC_pred >= 0.5] = 1
    MC_pred[MC_pred < 0.5] = 0
    return accuracy_report(y_test, MC_pred)

## Cross validation for LSTM with MC Dropout

In [0]:
corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels

In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    train_text = [' '.join(t.split()[0:]) for t in x_train.tolist()]
    train_text = np.array(train_text, dtype=object)[:, np.newaxis]
    test_text = [' '.join(t.split()[0:]) for t in x_test.tolist()]
    test_text = np.array(test_text, dtype=object)[:, np.newaxis]

    model = build_LSTM_MC()
    print(model.summary())
    lstm_fit(model, train_text, y_train)
    print('eps ', count + 1, ' fitting') 
    acc, f1, recall, precision = lstm_predict(model, test_text, y_test)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    count += 1
    
    
print('FFNN Overall with ELMo')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

# LSTM

## Define LSTM model with ELMo Embedding

In [0]:
import keras
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np
# from keras.layers import LSTM, Dropout
from keras.layers import LSTM, Embedding, Dense, Dropout, Lambda
from keras.layers.merge import add
from keras.regularizers import l2

np.random.seed(1)


class LSTM_Model:
    def __init__(self):
        self.model = self._build()
        
    def _build(self):
        input_text = layers.Input(shape=(1,), dtype='string')
        embedding = layers.Lambda(ELMoEmbedding, 
                                  output_shape=(None, 1024))(input_text) 
        x = LSTM(units=256, 
                 activation='tanh',
                 recurrent_dropout=0.1)(embedding)
        dense = Dense(256, activation='tanh')(x)
        dense = Dropout(0.1)(dense)
        out = Dense(1, activation="sigmoid")(dense)
        model = Model(input_text, out)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics= ['acc'])
        return model

    def fit(self, X_train_pad,y_train):
        self.model.fit(X_train_pad,
                       y_train,
                       shuffle=True,
                       epochs=40,
                       batch_size=10,
                       verbose=0
                     )

    def predict(self, X_test_pad, y_test):
        predictions = self.model.predict(X_test_pad)
        predictions = predictions.reshape(-1, 1)
        predictions[predictions >= 0.5] = 1
        predictions[predictions < 0.5] = 0
        return accuracy_report(y_test, predictions)

## Cross validation for LSTM

In [0]:
corpus = df['normalized_text'].values
labels = df['HS'].values

X = corpus.copy()
y = labels

In [0]:
from sklearn.model_selection import KFold
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, shuffle=True)
count = 0
mean_acc = []
f1s = []
precis = []
recalls = []

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    train_text = [' '.join(t.split()[0:]) for t in x_train.tolist()]
    train_text = np.array(train_text, dtype=object)[:, np.newaxis]
    test_text = [' '.join(t.split()[0:]) for t in x_test.tolist()]
    test_text = np.array(test_text, dtype=object)[:, np.newaxis]

    model = LSTM_Model()
    model.fit(train_text, y_train)
    print('eps ', count + 1, ' fitting') 
    acc, f1, recall, precision = model.predict(test_text, y_test)
    mean_acc.append(acc)
    f1s.append(f1)
    precis.append(precision)
    recalls.append(recall)
    count += 1
    
    
print('LSTM Overall with ELMo')
print('Acc=', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('f1 score=', np.mean(f1s), ', std=', np.std(f1s))
print('Precision=', np.mean(precis), ', std=', np.std(precis))
print('Recalls=', np.mean(recalls), ', std=', np.std(recalls))

eps  1  fitting
Confusion Matrix : [[11  0]
 [ 2  7]]
                 
Accuracy: 0.9
Precision: 1.0
Recall: 0.7777777777777778
F1: 0.8750000000000001
eps  2  fitting
Confusion Matrix : [[6 4]
 [3 7]]
                 
Accuracy: 0.65
Precision: 0.6363636363636364
Recall: 0.7
F1: 0.6666666666666666
eps  3  fitting
Confusion Matrix : [[4 6]
 [2 8]]
                 
Accuracy: 0.6
Precision: 0.5714285714285714
Recall: 0.8
F1: 0.6666666666666666
eps  4  fitting
Confusion Matrix : [[5 6]
 [3 6]]
                 
Accuracy: 0.55
Precision: 0.5
Recall: 0.6666666666666666
F1: 0.5714285714285715
eps  5  fitting
Confusion Matrix : [[6 2]
 [6 6]]
                 
Accuracy: 0.6
Precision: 0.75
Recall: 0.5
F1: 0.6
LSTM Overall with ELMo
Acc= 0.66 , std= 0.12409673645990857
f1 score= 0.6759523809523811 , std= 0.10628865843336548
Precision= 0.6915584415584416 , std= 0.1747706494106755
Recalls= 0.6888888888888889 , std= 0.1063420987911591


# Universal Sentences Encoder

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

np.random.seed(1)
tf.random.set_random_seed(1)
tf.reset_default_graph()

In [0]:
dataset = pd.read_csv('/content/gdrive/My Drive/cleaned_text_v3.csv')
del dataset['Unnamed: 0']
dataset.head()

Unnamed: 0,id,text,HS,TR,AG,normalized_text
0,1,RT @vaintshit: shut the fuck up and come suck...,1,1,1,rt shut the fuck up and come suck my dick
1,2,@ArianasBotch Ok if you fucking said leave blo...,1,1,1,ok if you fucking said leave block me. but dm ...
2,3,@CyV_SW Wow mo cock got hard. Want to pull you...,1,1,1,wow mo cock got hard. want to pull your pantie...
3,4,Ill kill the bitch (chloe) when your not home ...,1,1,1,ill kill the bitch (chloe) when your not home ...
4,5,...............................'I get to rape ...,1,0,1,.'i get to rape beautiful women and that is wh...


## Define Universal encoder Module

In [0]:
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" 
# module_url = "https://tfhub.dev/google/nnlm-en-dim128/1"
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 

In [0]:
def get_embedding(module_url, texts):
    embed = hub.Module(module_url)
    X = texts

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), 
                    tf.tables_initializer()])
        message_embeddings = session.run(embed(X))
    session.close()
    text_output = np.array(message_embeddings)
    
    return text_output

## LSTM

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM

LSTM_DIM = 1024 # total LSTM units

class LSTM_No_MC():
    def __init__(self):
        self._build_LSTM_model()

    def _build_LSTM_model(self):
        self.model = Sequential() 
        self.model.add(LSTM(LSTM_DIM,
                       recurrent_dropout=0.1)) 
        self.model.add(Dense(1024, activation='relu')) 
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', 
                            optimizer='adam', 
                            metrics=['accuracy'])
        return self
    
    def fit(self, train_X, train_y):
        self.model.fit(train_X, train_y, 
              epochs=20, 
              batch_size=4, 
              shuffle=True, 
              verbose=0)

    def predict(self, x_test):
        predictions = self.model.predict(x_test)
        return predictions
    
    def evaluate(self, x_test, y_test):
        predictions = self.predict(x_test)
        predictions[predictions >= 0.5] = 1
        predictions[predictions < 0.5] = 0
    
        acc, f1, recall, precision = accuracy_report(y_test, predictions)
        return acc, f1, recall, precision

### LSTM cross validation

In [0]:
def lstm_cv(X, y):
    kf = KFold(n_splits=5, random_state=42, 
               shuffle=True)
    count = 0
    mean_acc = []
    f1s = []
    recalls = []
    precis = []
    for train_index, test_index in kf.split(X, y):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        input_length = x_train.shape[1]
        x_train = np.reshape(x_train,
                                (x_train.shape[0], 1, 
                                 x_train.shape[1]))

        x_test= np.reshape(x_test, (x_test.shape[0], 1, 
                                    x_test.shape[1]))
        print('*'*50)
        model = LSTM_No_MC()
        model.fit(x_train, y_train)

        # Prediction on the test set.
        print('*'*10)
        print('Evaluation model ', count + 1, '/5')
        count += 1
        # predictions = model.predict(x_test)

        acc, f1, recal, precision = model.evaluate(x_test, y_test)

        mean_acc.append(acc)
        f1s.append(f1)
        recalls.append(recal)
        precis.append(precision)

    print('Acc = ', np.mean(mean_acc), ', std = ', np.std(mean_acc))
    print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
    print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
    print('Precisions=', np.mean(precis), ', std=', np.std(precis))

In [0]:
texts = dataset['normalized_text'].values
X = get_embedding(module_url, texts)
y = dataset['HS'].values

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0608 09:19:53.800340 140007620867968 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [0]:
lstm_cv(X, y)

**************************************************
**********
Evaluation model  1 /5
Confusion Matrix : [[5 6]
 [4 5]]
Accuracy :  0.5
Sensitivity :  0.45454545454545453
Specificity :  0.5555555555555556
                 
Accuracy: 0.5
Precision: 0.45454545454545453
Recall: 0.5555555555555556
F1: 0.5
**************************************************
**********
Evaluation model  2 /5
Confusion Matrix : [[5 5]
 [1 9]]
Accuracy :  0.7
Sensitivity :  0.5
Specificity :  0.9
                 
Accuracy: 0.7
Precision: 0.6428571428571429
Recall: 0.9
F1: 0.75
**************************************************
**********
Evaluation model  3 /5
Confusion Matrix : [[6 4]
 [1 9]]
Accuracy :  0.75
Sensitivity :  0.6
Specificity :  0.9
                 
Accuracy: 0.75
Precision: 0.6923076923076923
Recall: 0.9
F1: 0.7826086956521738
**************************************************
**********
Evaluation model  4 /5
Confusion Matrix : [[9 2]
 [4 5]]
Accuracy :  0.7
Sensitivity :  0.8181818181818182
S

## Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
np.random.seed(1)

params = {
        'C': [0.01, 0.1, 1., 10, 100, 200],
        'solver': ['liblinear', 'lbfgs'],
        'penalty': ['l2']
}
kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)
texts = dataset['normalized_text'].values
y = dataset['HS'].values

#### New results

In [0]:
count = 0
mean_acc = []
f1s = []
recalls = []
precis = []

best_params = None
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    print('*'*50)
    if best_params is None:
        logistic = LogisticRegression(random_state=42, max_iter=500)
        random_search = RandomizedSearchCV(logistic, random_state=42,
                                           param_distributions=params,
                                           cv=4)
        random_search.fit(x_train, y_train)
        best_params = random_search.best_params_
        print(best_params)
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42)
    logistic.fit(x_train, y_train)
    
    print('Evaluation model ', count + 1, '/5')
    count += 1
    
    print('Evaluation results:')
    
    prediction = logistic.predict(x_test) # predicting on the validation set
    prediction_int = prediction.astype(np.int)
     
    acc, f1, recall, precision = accuracy_report(y_test, prediction_int)
    mean_acc.append(acc)
    f1s.append(f1)
    recalls.append(recall)
    precis.append(precision)

print('Acc = ', np.mean(mean_acc), ', std = ', np.std(mean_acc))
print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
print('Precisions=', np.mean(precis), ', std=', np.std(precis))

**************************************************
{'solver': 'liblinear', 'penalty': 'l2', 'C': 200}
Evaluation model  1 /5
Evaluation results:
Confusion Matrix : [[5 6]
 [5 4]]
                 
Accuracy: 0.45
Precision: 0.4
Recall: 0.4444444444444444
F1: 0.4210526315789474
**************************************************
Evaluation model  2 /5
Evaluation results:
Confusion Matrix : [[9 1]
 [4 6]]
                 
Accuracy: 0.75
Precision: 0.8571428571428571
Recall: 0.6
F1: 0.7058823529411764
**************************************************
Evaluation model  3 /5
Evaluation results:
Confusion Matrix : [[7 3]
 [1 9]]
                 
Accuracy: 0.8
Precision: 0.75
Recall: 0.9
F1: 0.8181818181818182
**************************************************
Evaluation model  4 /5
Evaluation results:
Confusion Matrix : [[9 2]
 [4 5]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.5555555555555556
F1: 0.6250000000000001
**********************************************



#### Old results

In [0]:
count = 0
mean_acc = []
f1s = []
recalls = []
precis = []

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    print('*'*50)
    logistic = LogisticRegression(random_state=42, max_iter=500)
    random_search = RandomizedSearchCV(logistic, random_state=42,
                                       param_distributions=params,
                                       cv=4)
    random_search.fit(x_train, y_train)
    best_params = random_search.best_params_
    print(best_params)
    logistic = LogisticRegression(solver=best_params['solver'],
                                 C=best_params['C'], penalty='l2',
                                 random_state=42)
    logistic.fit(x_train, y_train)
    
    print('Evaluation model ', count + 1, '/5')
    count += 1
    
    print('Evaluation results:')
    
    prediction = logistic.predict(x_test) # predicting on the validation set
    prediction_int = prediction.astype(np.int)
     
    acc, f1, recall, precision = accuracy_report(y_test, prediction_int)
    mean_acc.append(acc)
    f1s.append(f1)
    recalls.append(recall)
    precis.append(precision)

print('Acc = ', np.mean(mean_acc), ', std = ', np.std(mean_acc))
print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
print('Precisions=', np.mean(precis), ', std=', np.std(precis))

**************************************************




{'solver': 'liblinear', 'penalty': 'l2', 'C': 200}
Evaluation model  1 /5
Evaluation results:
Confusion Matrix : [[5 6]
 [5 4]]
                 
Accuracy: 0.45
Precision: 0.4
Recall: 0.4444444444444444
F1: 0.4210526315789474
**************************************************
{'solver': 'lbfgs', 'penalty': 'l2', 'C': 10}
Evaluation model  2 /5
Evaluation results:
Confusion Matrix : [[9 1]
 [4 6]]
                 
Accuracy: 0.75
Precision: 0.8571428571428571
Recall: 0.6
F1: 0.7058823529411764
**************************************************
{'solver': 'liblinear', 'penalty': 'l2', 'C': 100}
Evaluation model  3 /5
Evaluation results:
Confusion Matrix : [[7 3]
 [1 9]]
                 
Accuracy: 0.8
Precision: 0.75
Recall: 0.9
F1: 0.8181818181818182
**************************************************




{'solver': 'lbfgs', 'penalty': 'l2', 'C': 10}
Evaluation model  4 /5
Evaluation results:
Confusion Matrix : [[9 2]
 [4 5]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.5555555555555556
F1: 0.6250000000000001
**************************************************
{'solver': 'liblinear', 'penalty': 'l2', 'C': 200}
Evaluation model  5 /5
Evaluation results:
Confusion Matrix : [[3 5]
 [3 9]]
                 
Accuracy: 0.6
Precision: 0.6428571428571429
Recall: 0.75
F1: 0.6923076923076924
Acc =  0.66 , std =  0.12409673645990857
F1 =  0.6524848990019269 , std= 0.13130797847599593
Recalls =  0.65 , std= 0.15885392000588017
Precisions= 0.6728571428571428 , std= 0.15295724359227625




## SVM

In [0]:
from sklearn.svm import SVC
np.random.seed(1)

kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)
texts = dataset['normalized_text'].values
X = get_embedding(module_url, texts)
y = dataset['HS'].values

params = {
        'C': [0.01, 0.1, 1., 10, 100, 200],
        'kernel': ['rbf'],
        'gamma': ['auto', 0.01, 0.1, 1., 10., 100.]
}

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0608 09:20:13.010547 140007620867968 saver.py:1483] Saver not created because there are no variables in the graph to restore


#### New results

In [0]:
count = 0
mean_acc = []
f1s = []
recalls = []
precis = []

best_params = None

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    if best_params is None:
        print('*'*50)
        svm = SVC()

        random_search = RandomizedSearchCV(svm, param_distributions=params, 
                                           cv=3, random_state=42)
        random_search.fit(x_train, y_train)
        best_params = random_search.best_params_
    
    svm = SVC(random_state=42, C=best_params['C'], 
             kernel=best_params['kernel'],
             gamma=best_params['gamma'])
    svm.fit(x_train, y_train)
    
    print('Evaluation model ', count + 1, '/5')
    count += 1
    
    print('Evaluation results:')
    
    prediction = svm.predict(x_test) # predicting on the validation set
    acc, f1, recall, precision = accuracy_report(y_test, prediction)
    
    mean_acc.append(acc)
    f1s.append(f1)
    recalls.append(recall)
    precis.append(precision)

print('Acc = ', np.mean(mean_acc), ', std = ', np.std(mean_acc))
print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
print('Precisions=', np.mean(precis), ', std=', np.std(precis))

**************************************************
Evaluation model  1 /5
Evaluation results:
Confusion Matrix : [[5 6]
 [5 4]]
                 
Accuracy: 0.45
Precision: 0.4
Recall: 0.4444444444444444
F1: 0.4210526315789474
Evaluation model  2 /5
Evaluation results:
Confusion Matrix : [[9 1]
 [4 6]]
                 
Accuracy: 0.75
Precision: 0.8571428571428571
Recall: 0.6
F1: 0.7058823529411764
Evaluation model  3 /5
Evaluation results:
Confusion Matrix : [[7 3]
 [1 9]]
                 
Accuracy: 0.8
Precision: 0.75
Recall: 0.9
F1: 0.8181818181818182
Evaluation model  4 /5
Evaluation results:
Confusion Matrix : [[9 2]
 [4 5]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.5555555555555556
F1: 0.6250000000000001
Evaluation model  5 /5
Evaluation results:
Confusion Matrix : [[4 4]
 [3 9]]
                 
Accuracy: 0.65
Precision: 0.6923076923076923
Recall: 0.75
F1: 0.7199999999999999
Acc =  0.67 , std =  0.12083045973594572
F1 =  0.6580233605403883 , std= 0



#### Old results

In [0]:
count = 0
mean_acc = []
f1s = []
recalls = []
precis = []

for train_index, test_index in kf.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    print('*'*50)
    svm = SVC()
    
    random_search = RandomizedSearchCV(svm, param_distributions=params, 
                                       cv=3, random_state=42)
    random_search.fit(x_train, y_train)
    best_params = random_search.best_params_
    
    svm = SVC(random_state=42, C=best_params['C'], 
             kernel=best_params['kernel'],
             gamma=best_params['gamma'])
    svm.fit(x_train, y_train)
    
    print('Evaluation model ', count + 1, '/5')
    count += 1
    
    print('Evaluation results:')
    
    prediction = svm.predict(x_test) # predicting on the validation set
    acc, f1, recall, precision = accuracy_report(y_test, prediction)
    
    mean_acc.append(acc)
    f1s.append(f1)
    recalls.append(recall)
    precis.append(precision)

print('Acc = ', np.mean(mean_acc), ', std = ', np.std(mean_acc))
print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
print('Precisions=', np.mean(precis), ', std=', np.std(precis))

**************************************************
Evaluation model  1 /5
Evaluation results:
Confusion Matrix : [[5 6]
 [5 4]]
                 
Accuracy: 0.45
Precision: 0.4
Recall: 0.4444444444444444
F1: 0.4210526315789474




**************************************************
Evaluation model  2 /5
Evaluation results:
Confusion Matrix : [[9 1]
 [4 6]]
                 
Accuracy: 0.75
Precision: 0.8571428571428571
Recall: 0.6
F1: 0.7058823529411764
**************************************************




Evaluation model  3 /5
Evaluation results:
Confusion Matrix : [[7 3]
 [1 9]]
                 
Accuracy: 0.8
Precision: 0.75
Recall: 0.9
F1: 0.8181818181818182
**************************************************
Evaluation model  4 /5
Evaluation results:




Confusion Matrix : [[9 2]
 [4 5]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.5555555555555556
F1: 0.6250000000000001
**************************************************
Evaluation model  5 /5
Evaluation results:
Confusion Matrix : [[4 4]
 [3 9]]
                 
Accuracy: 0.65
Precision: 0.6923076923076923
Recall: 0.75
F1: 0.7199999999999999
Acc =  0.67 , std =  0.12083045973594572
F1 =  0.6580233605403883 , std= 0.1334376163398494
Recalls =  0.65 , std= 0.15885392000588017
Precisions= 0.6827472527472528 , std= 0.15229500605237895


## LSTM MC Dropout

In [0]:
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.regularizers import l2
from keras import backend as K
np.random.seed(1)

class LSTM_MC:
    def __init__(self, input_length):
        self.model = self.build()
        
    def build(self):
        print('Build model...')
        p_W, p_U, p_dense, p_emb = 0.75, 0.75, 0.5, 0.5
        weight_decay, batch_size, maxlen = 1e-4, 10, 500
        model = Sequential()
        model.add(LSTM(1024, input_shape=(1, input_length),
                       kernel_regularizer=l2(weight_decay), 
                       recurrent_regularizer=l2(weight_decay),
                       dropout=p_W, 
                       recurrent_dropout=p_U))
        model.add(Dropout(p_dense))
        model.add(Dense(1024, 
                        kernel_regularizer=l2(weight_decay), 
                        activation='relu'
                       ))
        model.add(Dropout(p_dense))
        model.add(Dense(1, kernel_regularizer=l2(weight_decay), 
                        activation='sigmoid'
                       ))
        model.compile(loss='binary_crossentropy', 
                      optimizer='adam',
                      metrics=['acc'])
        return model
    
    def fit(self, train_X, train_y):
        input_length = train_X.shape[1]
        train_X = np.reshape(train_X,
                                (train_X.shape[0], 1, train_X.shape[1]))
        self.model.fit(train_X, train_y, 
              batch_size=4, 
              shuffle=True,
              epochs=50,
              verbose=0)

    def predict(self, test_X):
        input_length = test_X.shape[1]
        test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))
        T = 1000
        predict_stochastic = K.function([self.model.layers[0].input,
                                        K.learning_phase()],
                                        [self.model.layers[-1].output])
        Yt_hat = np.array([predict_stochastic([test_X, 1]) for _ in range(T)])
        MC_pred = np.mean(Yt_hat, axis=0)
        MC_pred = MC_pred.reshape(-1, 1)
        MC_pred[MC_pred >= 0.5] = 1
        MC_pred[MC_pred < 0.5] = 0
        return MC_pred

In [0]:
texts = dataset['normalized_text'].values
X = get_embedding(module_url, texts)
y = dataset['HS'].values

In [0]:
kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)

count = 0
mean_acc = []
f1s = []
recalls = []
precis = []

for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('*'*50)
    input_length = x_train.shape[1]
    model = LSTM_MC(input_length)
    model.fit(x_train, y_train)
    print('*'*10)
    print('Evaluation model ', count + 1, '/5')
    count += 1
    predictions = model.predict(x_test)
    print('Evaluation results:')
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    
    mean_acc.append(acc)
    f1s.append(f1)
    recalls.append(recall)
    precis.append(precision)
    
print('Acc = ', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
print('Precision=', np.mean(precis),', std=', np.std(precis))

**************************************************
Build model...
**********
Evaluation model  1 /5
Evaluation results:
Confusion Matrix : [[7 4]
 [5 4]]
Accuracy :  0.55
Sensitivity :  0.6363636363636364
Specificity :  0.4444444444444444
                 
Accuracy: 0.55
Precision: 0.5
Recall: 0.4444444444444444
F1: 0.47058823529411764
**************************************************
Build model...
**********
Evaluation model  2 /5
Evaluation results:
Confusion Matrix : [[8 2]
 [2 8]]
Accuracy :  0.8
Sensitivity :  0.8
Specificity :  0.8
                 
Accuracy: 0.8
Precision: 0.8
Recall: 0.8
F1: 0.8000000000000002
**************************************************
Build model...
**********
Evaluation model  3 /5
Evaluation results:
Confusion Matrix : [[6 4]
 [1 9]]
Accuracy :  0.75
Sensitivity :  0.6
Specificity :  0.9
                 
Accuracy: 0.75
Precision: 0.6923076923076923
Recall: 0.9
F1: 0.7826086956521738
**************************************************
Build model...

## LSTM MC Dropout TEST

In [0]:
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.regularizers import l2
from keras import backend as K
np.random.seed(1)

class LSTM_MC:
    def __init__(self, input_length):
        self.model = self.build()
        
    def build(self):
        print('Build model...')
        p_W, p_U, p_dense = 0.85, 0.5, 0.5
        weight_decay, batch_size, maxlen = 1e-4, 10, 500
        model = Sequential()
        model.add(LSTM(1024, input_shape=(1, input_length),
                       kernel_regularizer=l2(weight_decay), 
                       bias_regularizer=l2(weight_decay),
                       recurrent_regularizer=l2(weight_decay),
                       dropout=p_W, 
                       recurrent_dropout=p_U,
                       activation='tanh'))
        model.add(Dense(1024, 
                        kernel_regularizer=l2(weight_decay), 
                       bias_regularizer=l2(weight_decay),
                        activation='relu'
                       ))
        model.add(Dropout(p_dense))
        model.add(Dense(1, 
                        kernel_regularizer=l2(weight_decay), 
                        bias_regularizer=l2(weight_decay),
                        activation='sigmoid'
                       ))
        model.compile(loss='binary_crossentropy', 
                      optimizer='rmsprop',
                      metrics=['acc'])
        return model
    
    def fit(self, train_X, train_y):
        input_length = train_X.shape[1]
        train_X = np.reshape(train_X,
                                (train_X.shape[0], 1, train_X.shape[1]))
        self.model.fit(train_X, train_y, 
              batch_size=2, 
              shuffle=True,
              epochs=35,
              verbose=0)

    def predict(self, test_X):
        input_length = test_X.shape[1]
        test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))
        T = 1000
        predict_stochastic = K.function([self.model.layers[0].input,
                                        K.learning_phase()],
                                        [self.model.layers[-1].output])
        Yt_hat = np.array([predict_stochastic([test_X, 1]) for _ in range(T)])
        MC_pred = np.mean(Yt_hat, axis=0)
        MC_pred = MC_pred.reshape(-1, 1)
        MC_pred[MC_pred >= 0.5] = 1
        MC_pred[MC_pred < 0.5] = 0
        return MC_pred

In [0]:
texts = dataset['normalized_text'].values
X = get_embedding(module_url, texts)
y = dataset['HS'].values

Instructions for updating:
Colocations handled automatically by placer.


W0326 21:48:09.646178 140181877933952 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0326 21:48:13.652585 140181877933952 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [0]:
np.random.seed(1)
kf = KFold(n_splits=5, random_state=42, 
           shuffle=True)

count = 0
mean_acc = []
f1s = []
recalls = []
precis = []

for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('*'*50)
    input_length = x_train.shape[1]
    model = LSTM_MC(input_length)
    model.fit(x_train, y_train)
    print('Evaluation model ', count + 1, '/5')
    count += 1
    predictions = model.predict(x_test)
    print('Evaluation results:')
    acc, f1, recall, precision = accuracy_report(y_test, predictions)
    
    mean_acc.append(acc)
    f1s.append(f1)
    recalls.append(recall)
    precis.append(precision)
    
print('Acc = ', np.mean(mean_acc), ', std=', np.std(mean_acc))
print('F1 = ', np.mean(f1s), ', std=', np.std(f1s))
print('Recalls = ', np.mean(recalls), ', std=', np.std(recalls))
print('Precision=', np.mean(precis),', std=', np.std(precis))

**************************************************
Build model...
Evaluation model  1 /5
Evaluation results:
Confusion Matrix : [[6 5]
 [5 4]]
                 
Accuracy: 0.5
Precision: 0.4444444444444444
Recall: 0.4444444444444444
F1: 0.4444444444444444
**************************************************
Build model...
Evaluation model  2 /5
Evaluation results:
Confusion Matrix : [[6 4]
 [1 9]]
                 
Accuracy: 0.75
Precision: 0.6923076923076923
Recall: 0.9
F1: 0.7826086956521738
**************************************************
Build model...
Evaluation model  3 /5
Evaluation results:
Confusion Matrix : [[ 5  5]
 [ 0 10]]
                 
Accuracy: 0.75
Precision: 0.6666666666666666
Recall: 1.0
F1: 0.8
**************************************************
Build model...
Evaluation model  4 /5
Evaluation results:
Confusion Matrix : [[9 2]
 [4 5]]
                 
Accuracy: 0.7
Precision: 0.7142857142857143
Recall: 0.5555555555555556
F1: 0.6250000000000001
*******************

ResourceExhaustedError: ignored