In [10]:
import os
import re
from collections import Counter

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, recall_score, precision_score, classification_report

from lib.processor import *

In [4]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Read the comma-separated dataset into a DataFrame.
df = pd.read_csv("dataset.csv", sep=",")

In [6]:
# Base directory used to build the pretrained-model path further down.
# The original passed the *string* '__file__' (a nonexistent filename, not the
# module dunder) to realpath and took its dirname, which is simply the resolved
# working directory; spell that out explicitly.
BASE_DIR = os.path.realpath(os.getcwd())

## Wei Jie Word2Vec Feature Extraction

In [7]:
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, dropping tokens shorter than two characters.

    The text is first split into sentences with ``nltk.sent_tokenize`` and each
    sentence is then split with ``nltk.word_tokenize`` (both called with
    ``language='english'``).

    Args:
        text: The string to tokenize.

    Returns:
        list: The kept tokens, in order of appearance.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [8]:
def word_averaging(wv, words):
    """Average the vectors of ``words`` and scale the result to unit length.

    Args:
        wv: Word-vector model. Must expose ``vocab`` (mapping each known word to
            an entry with an ``index`` attribute), ``vectors_norm`` (array
            indexed by that ``index``) and ``vector_size``.
        words: Iterable of tokens. A token that is already a ``numpy.ndarray``
            is used as a vector directly; a token found in ``wv.vocab`` is
            looked up; any other token is ignored.

    Returns:
        numpy.ndarray: float32 vector. When no token contributes a vector, a
        zero vector of length ``wv.vector_size`` is returned.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.vectors_norm[wv.vocab[word].index])

    if not vectors:
        # Use float32 like the normal path, so stacking results from several
        # texts does not silently upcast the whole matrix to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)

    averaged = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(averaged).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged word vector of every tokenized text into one matrix.

    Args:
        wv: Word-vector model, passed through to ``word_averaging``; its
            ``vector_size`` sets the width of an empty result.
        text_list: Iterable of tokenized texts (each an iterable of tokens).

    Returns:
        numpy.ndarray: One row per text, in input order. An empty ``text_list``
        yields an array of shape ``(0, wv.vector_size)``.
    """
    # Materialise first: ``text_list`` may be a NumPy array, whose truth value
    # is ambiguous, so emptiness is checked on the resulting list instead.
    rows = [word_averaging(wv, text) for text in text_list]
    if not rows:
        # np.vstack raises ValueError when given an empty sequence.
        return np.empty((0, wv.vector_size), dtype=np.float32)
    return np.vstack(rows)

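## Download the pretrained model from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing and save it as `models/GoogleNews-vectors-negative300.bin.gz` under the notebook's base directory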

In [11]:
# Load the pretrained word2vec vectors (binary format) from BASE_DIR/models.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(BASE_DIR, "models", "GoogleNews-vectors-negative300.bin.gz"),
    binary=True,
)
# replace=True is passed so init_sims works in place on the loaded vectors.
wv.init_sims(replace=True)

## Preprocess Text: Multiple rows
### To preprocess a single row of text, call:
`preprocess_text(text, True, True, False, True, True)`


In [None]:
# Run preprocess_text over every entry of the `text` column, with the same
# flags for each row, and store the result back in that column.
df["text"] = df["text"].map(
    lambda raw_text: preprocess_text(raw_text, True, True, False, True, True)
)

## Multiple Rows of tokenization
### To tokenize a single row of text, call:
`w2v_tokenize_text(text)`

In [None]:
# Tokenize each row's text. Mapping over the single column that is needed
# avoids building a row object per record (as df.apply(axis=1) does) and always
# yields a 1-D array with one token list per row.
data_tokenized = df["text"].map(w2v_tokenize_text).values

## Obtain average word vectors
### To average the word vectors of a single tokenized text, call:
`word_averaging(wv, tokenized_text)`

In [None]:
# Build the feature matrix: one averaged word vector per tokenized text.
data_word_average = word_averaging_list(wv=wv, text_list=data_tokenized)