In [None]:
import logging
import multiprocessing
import os

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Send log records of level INFO and above (gensim reports its progress through
# the logging module) to the notebook output, prefixed with level and time.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(levelname)s - %(asctime)s: %(message)s",
)


class W2VLossLogger(CallbackAny2Vec):
    """Print the training loss at the end of every epoch.

    Use by passing model.train(..., callbacks=[W2VLossLogger()]).

    The value read from the model is treated as a running total: the first
    epoch prints it as-is, every later epoch prints the increase since the
    previous epoch.
    NOTE(review): gensim presumably only tracks the loss when training runs
    with compute_loss=True - confirm before relying on the printed numbers.
    """

    def __init__(self):
        # Number of epochs that have finished so far.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Report the loss of the epoch that just finished."""
        loss = model.get_latest_training_loss()

        # `loss_previous_step` only exists once the first epoch has been
        # reported, so it is read exclusively on later epochs.
        reported = loss if self.epoch == 0 else loss - self.loss_previous_step
        print("Loss after epoch {}: {}".format(self.epoch, reported))

        self.epoch += 1
        self.loss_previous_step = loss


def train_w2v_model(
    sentences,
    output_file,
    window,
    embedding_dim,
    epochs,
    min_word_count,
):
    """Train a word2vec model on the given sentences and save it to disk.

    Args:
        sentences list[list[str]]: List of sentences. Each element contains a list with the words
            in the current sentence. It is iterated more than once (vocabulary
            scan, then training), so it must be re-iterable.
        output_file (str): Path to save the trained w2v model
        window (int): w2v context size
        embedding_dim (int): w2v vector dimension
        epochs (int): How many epochs should the training run
        min_word_count (int): Ignore words that appear less than min_word_count times

    Returns:
        gensim.models.Word2Vec: The trained model (the same object that was
        saved to output_file).
    """
    workers = multiprocessing.cpu_count()

    # Create an *untrained* model. The corpus is deliberately not handed to the
    # constructor: when it is, gensim builds the vocabulary and runs a training
    # pass immediately, and the explicit build_vocab()/train() calls below
    # would then redo all of that work.
    model = Word2Vec(
        vector_size=embedding_dim,
        window=window,
        min_count=min_word_count,
        workers=workers,
    )

    # Scan the corpus once to collect the vocabulary.
    model.build_vocab(sentences, progress_per=10000)

    # Train for exactly the requested number of epochs.
    model.train(sentences, total_examples=model.corpus_count, epochs=epochs)

    # Persist the trained model so later cells can reload it without retraining.
    model.save(output_file)

    return model
    




In [None]:
   # read data/gutenberg.txt in the expected format (tokenized)
    import ast

    # The file is parsed as a Python literal. ast.literal_eval only accepts
    # literals, so unlike eval() it cannot execute code embedded in the file.
    # NOTE(review): assumes the file holds a list of token lists - confirm.
    with open("../data/tokenized.txt", "r") as f:
        sentences = ast.literal_eval(f.read())

    # Training configuration for the Gutenberg embeddings.
    output_file = "gutenberg_w2v.hundd.model"
    window = 5
    embedding_dim = 100
    epochs = 1000
    min_word_count = 1

    # Train the Word2Vec model and keep a handle to it for the query cells.
    gutenberg_w2v = train_w2v_model(
        sentences,
        output_file,
        window,
        embedding_dim,
        epochs,
        min_word_count,
    )


In [None]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Reload the model from the file name the training cell saves to, so the query
# cells below can run without retraining.
gutenberg_w2v = Word2Vec.load("gutenberg_w2v.hundd.model")

In [None]:
# Closest words to "bible" according to the Gutenberg model.
gutenberg_w2v.wv.most_similar(positive=["bible"])


In [None]:
# Closest words to "book" according to the Gutenberg model.
gutenberg_w2v.wv.most_similar(positive=["book"])


In [None]:
# Closest words to "bank" according to the Gutenberg model.
gutenberg_w2v.wv.most_similar(positive=["bank"])


In [None]:
# Closest words to "water" according to the Gutenberg model.
gutenberg_w2v.wv.most_similar(positive=["water"])

In [None]:
# Analogy by vector arithmetic: "good" - "taller" + "tall", then list the
# words whose vectors lie closest to the result.
v = (
    gutenberg_w2v.wv["good"]
    - gutenberg_w2v.wv["taller"]
    + gutenberg_w2v.wv["tall"]
)
gutenberg_w2v.wv.most_similar(positive=v)

In [None]:
# Analogy by vector arithmetic: "girls" - "queen" + "kings", then list the
# words whose vectors lie closest to the result.
v = (
    gutenberg_w2v.wv["girls"]
    - gutenberg_w2v.wv["queen"]
    + gutenberg_w2v.wv["kings"]
)
gutenberg_w2v.wv.most_similar(positive=v)

In [None]:
# Analogy by vector arithmetic: "france" - "paris" + "london", then list the
# words whose vectors lie closest to the result.
v = (
    gutenberg_w2v.wv["france"]
    - gutenberg_w2v.wv["paris"]
    + gutenberg_w2v.wv["london"]
)
gutenberg_w2v.wv.most_similar(positive=v)

In [None]:
from gensim.models import KeyedVectors

# Load the pretrained GoogleNews vectors from the binary word2vec file.
# `limit` caps how many vectors are read (the first 1,000,000 here).
google_model = KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True,
limit=1000000)

In [None]:
# Closest words to "bible" according to the GoogleNews vectors.
google_model.most_similar(positive=["bible"])

In [None]:
# Closest words to "book" according to the GoogleNews vectors.
google_model.most_similar(positive=["book"])

In [None]:
# Closest words to "bank" according to the GoogleNews vectors.
google_model.most_similar(positive=["bank"])

In [None]:
# Closest words to "water" according to the GoogleNews vectors.
google_model.most_similar(positive=["water"])

In [None]:
# Analogy by vector arithmetic: "girls" - "queen" + "kings", then list the
# words whose vectors lie closest to the result.
v = (
    google_model["girls"]
    - google_model["queen"]
    + google_model["kings"]
)
google_model.most_similar(positive=v)

In [None]:
# Analogy by vector arithmetic: "good" - "taller" + "tall", then list the
# words whose vectors lie closest to the result.
v = (
    google_model["good"]
    - google_model["taller"]
    + google_model["tall"]
)
google_model.most_similar(positive=v)

In [None]:
# Analogy by vector arithmetic: "france" - "paris" + "london", then list the
# words whose vectors lie closest to the result.
v = (
    google_model["france"]
    - google_model["paris"]
    + google_model["london"]
)
google_model.most_similar(positive=v)

In [None]:
import numpy as np

# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):
    """Stack every word vector of `model` into one 2-D array.

    Args:
        model: Object exposing `vector_size` and a `wv` attribute that supports
            `wv.index_to_key` (ordered vocabulary) and `wv[word]` lookup, as
            the trained Word2Vec models in this notebook do.

    Returns:
        numpy.ndarray: Array of shape (n_vocab, model.vector_size); row i holds
        the vector of `model.wv.index_to_key[i]`.
    """
    # Take the vocabulary from the model itself rather than from a notebook
    # global, so the function also works on a fresh kernel and the row order is
    # guaranteed to match index_to_key.
    vocabulary = model.wv.index_to_key
    embedding_matrix = np.zeros((len(vocabulary), model.vector_size))
    for row, word in enumerate(vocabulary):
        embedding_matrix[row] = model.wv[word]
    return embedding_matrix


# Materialise the Gutenberg word vectors as one 2-D array and show its shape.
embeddings = to_embeddings_Matrix(gutenberg_w2v)
print(embeddings.shape)


In [None]:
# Write the embeddings and their word labels into the data folder so they can
# be loaded into the embedding visualization tool.

import csv

# One row of tab-separated vector components per word.
with open('../data/embeddings.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for embedding in embeddings:
        tsv_output.writerow(embedding)

# One word per row, taken from the model's own vocabulary order so that line i
# of the metadata labels line i of the embeddings file.
metadata_words = gutenberg_w2v.wv.index_to_key[:len(embeddings)]
with open('../data/metadata.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for word in metadata_words:
        tsv_output.writerow([word])
  

In [None]:
import glob
import os
import re

import numpy as np
import sklearn

# All dataset locations are resolved against the current working directory.
SCRIPT_DIRECTORY = os.path.realpath(os.getcwd())

data_dir = os.path.join(SCRIPT_DIRECTORY, "../data/aclImdb")
train_dir, test_dir = (
    os.path.join(data_dir, split) for split in ("train", "test")
)
pos_train_dir, neg_train_dir = (
    os.path.join(train_dir, label) for label in ("pos", "neg")
)
pos_test_dir, neg_test_dir = (
    os.path.join(test_dir, label) for label in ("pos", "neg")
)

# Cap on the number of files read per folder, chosen for memory limitations:
# these parameters fit in 8GB of RAM. With 16G of RAM you can experiment with
# the full dataset / W2V.
MAX_NUM_SAMPLES = 5000
# Number of pretrained embeddings to load (the first 1M). This works because
# GoogleNews are roughly sorted from most frequent to least frequent; it may
# yield much worse results for other embeddings corpora.
NUM_W2V_TO_LOAD = 1000000


SEED = 42

# Seed numpy's global random generator for reproducibility.
np.random.seed(SEED)


def strip_punctuation(s):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Args:
        s (str): Input text.

    Returns:
        str: Text of the same length in which digits, punctuation and any other
        non-letter, non-whitespace characters have become single spaces.
    """
    non_letter = re.compile(r"[^a-zA-Z\s]")
    return non_letter.sub(" ", s)


def preprocess(s):
    """Normalise raw text for tokenization.

    Non-letter characters are replaced with spaces (via strip_punctuation), the
    text is lowercased, and every run of whitespace is collapsed into a single
    space. Leading/trailing whitespace therefore survives as one space.

    Args:
        s (str): Raw text.

    Returns:
        str: The normalised text.
    """
    # Raw string: "\s" inside a plain string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return re.sub(r"\s+", " ", strip_punctuation(s).lower())


def tokenize(s):
    """Split text into whitespace-separated tokens.

    Args:
        s (str): Text to split.

    Returns:
        list[str]: The non-empty tokens; an empty or all-whitespace string
        yields an empty list.
    """
    # split() without an argument drops the empty strings that split(" ")
    # produces for leading, trailing or repeated spaces; those would otherwise
    # be treated as a word of their own downstream.
    return s.split()


def preproc_tok(s):
    """Normalise `s` with preprocess() and split the result with tokenize().

    Args:
        s (str): Raw text.

    Returns:
        list[str]: Tokens of the normalised text.
    """
    cleaned = preprocess(s)
    return tokenize(cleaned)


# Preprocess and tokenize the reviews, it will come out as list of lists 
def token_proc(t_corpus):
    """Preprocess and tokenize every review in a corpus.

    Args:
        t_corpus: Iterable of raw review strings.

    Returns:
        list[list[str]]: One token list per review, in input order.
    """
    # Tokenize the corpus that was passed in, not a notebook-level global.
    return [preproc_tok(review) for review in t_corpus]


def read_samples(folder, preprocess=lambda x: x):
    """Read the `*.txt` files of a folder, one sample per file.

    Args:
        folder (str): Directory containing the text files.
        preprocess (callable): Applied to the full text of each file; defaults
            to the identity function.

    Returns:
        list: The preprocessed contents of at most MAX_NUM_SAMPLES files (all
        files when MAX_NUM_SAMPLES is not positive), in sorted path order.
    """
    # glob yields paths in a filesystem-dependent order; sorting makes the
    # selected subset the same on every run and machine.
    paths = sorted(glob.iglob(os.path.join(folder, "*.txt")))
    if MAX_NUM_SAMPLES > 0:
        paths = paths[:MAX_NUM_SAMPLES]

    data = []
    for path in paths:
        # NOTE(review): assumes the sample files are UTF-8 encoded - confirm.
        with open(path, "r", encoding="utf-8") as fd:
            # Read the whole file: taking only the first line silently dropped
            # any further lines and raised IndexError on an empty file.
            data.append(preprocess(fd.read()))

    return data


def create_corpus(pos, neg):
    """Merge positive and negative samples into one shuffled, labelled corpus.

    Args:
        pos (list): Positive samples (labelled 1).
        neg (list): Negative samples (labelled 0).

    Returns:
        tuple[list, list[int]]: The samples and their labels, shuffled with the
        same permutation. The permutation is drawn from numpy's global random
        state.
    """
    samples = list(pos) + list(neg)
    labels = [1] * len(pos) + [0] * len(neg)

    indices = np.arange(len(samples))
    np.random.shuffle(indices)

    # Reorder plain Python lists instead of converting the samples to a numpy
    # array: an array of strings pads every entry to the longest one, and an
    # array of token lists of different lengths cannot be built at all.
    return [samples[i] for i in indices], [labels[i] for i in indices]


def _mean_review_embeddings(keyed_vectors, reviews):
    """Average the word vectors of each review into one feature row."""
    features = np.zeros((len(reviews), keyed_vectors.vector_size))
    for row, review in enumerate(reviews):
        words_included = 0
        for tok in preproc_tok(review):
            # Only tokens that have an embedding contribute to the average.
            if tok in keyed_vectors:
                features[row] += keyed_vectors[tok]
                words_included += 1
        # A review without any known word keeps its all-zero row; dividing by
        # zero here would fill the row with NaN and break the classifier.
        if words_included:
            features[row] /= words_included
    return features


def extract_nbow(model, train_data, test_data):
    """Extract neural bag of words representations.

    Every review is tokenized with preproc_tok() and represented by the mean of
    the embeddings of its tokens that are present in the model's vocabulary.

    Args:
        model: Either an object with a `wv` attribute holding the word vectors
            (as the trained Word2Vec models in this notebook) or the word
            vectors themselves (as returned by load_word2vec_format).
        train_data: Sequence of raw training reviews (strings).
        test_data: Sequence of raw test reviews (strings).

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: (X_train, X_test), each of shape
        (number of reviews, vector_size).
    """
    # Accept both a full model and bare keyed vectors.
    keyed_vectors = getattr(model, "wv", model)

    X_train = _mean_review_embeddings(keyed_vectors, train_data)
    X_test = _mean_review_embeddings(keyed_vectors, test_data)

    return X_train, X_test


# def train_sentiment_analysis(train_corpus, train_labels):
#     """Train a sentiment analysis classifier using NBOW + Logistic regression"""
#     raise NotImplementedError("Implement sentiment analysis training")


# def evaluate_sentiment_analysis(classifier, test_corpus, test_labels):
#     """Evaluate classifier in the test corpus and report accuracy"""
#     raise NotImplementedError("Implement sentiment analysis evaluation")




In [None]:
# # Positive and negative reviews train dataset
# pos_train=read_samples(pos_train_dir)
# neg_train=read_samples(neg_train_dir)

# # Positive and negative reviews test dataset
# pos_test=read_samples(pos_test_dir)
# neg_test=read_samples(neg_test_dir)
    
    


In [None]:
# `corpus` is not defined by any earlier cell, so build it here.
# NOTE(review): assumes the embeddings are meant to be trained on the training
# reviews (positive followed by negative) - confirm this is the intended corpus.
corpus = read_samples(pos_train_dir) + read_samples(neg_train_dir)

# One token list per review: the sentence format train_w2v_model expects.
proc_tok_corpus = [preproc_tok(rev) for rev in corpus]


In [None]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Train embeddings on the tokenized review corpus and save them under the file
# name that the following cell loads.
train_w2v_model(
    sentences=proc_tok_corpus,
    output_file="my_sentiment_w2v.model",
    window=5,
    embedding_dim=100,
    epochs=1000,
    min_word_count=1,
)

In [None]:
import numpy as np


# Reload the review-trained model from the file name the training cell saves to.
my_sentiment_w2v = Word2Vec.load("my_sentiment_w2v.model")


# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):
    """Stack every word vector of `model` into one 2-D array.

    Args:
        model: Object exposing `vector_size` and a `wv` attribute that supports
            `wv.index_to_key` (ordered vocabulary) and `wv[word]` lookup, as
            the trained Word2Vec models in this notebook do.

    Returns:
        numpy.ndarray: Array of shape (n_vocab, model.vector_size); row i holds
        the vector of `model.wv.index_to_key[i]`.
    """
    # Take the vocabulary from the model itself rather than from a notebook
    # global, so the function also works on a fresh kernel and the row order is
    # guaranteed to match index_to_key.
    vocabulary = model.wv.index_to_key
    embedding_matrix = np.zeros((len(vocabulary), model.vector_size))
    for row, word in enumerate(vocabulary):
        embedding_matrix[row] = model.wv[word]
    return embedding_matrix


# Materialise the review-trained word vectors as one 2-D array and show its shape.
embeddings_my_sentiment = to_embeddings_Matrix(my_sentiment_w2v)
print(embeddings_my_sentiment.shape)

In [None]:
# Build shuffled (reviews, labels) pairs for the train and test splits.
# NOTE: create_corpus shuffles with numpy's global random state, so the order
# of these two calls determines the resulting permutations.
train_data,train_labels = create_corpus(read_samples(pos_train_dir), read_samples(neg_train_dir))
test_data,test_labels = create_corpus(read_samples(pos_test_dir), read_samples(neg_test_dir))


# Turn every review into one averaged-embedding feature row.
X_train,X_test=extract_nbow(my_sentiment_w2v,train_data,test_data)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (constructed with its default settings)
# on the averaged-embedding features of the training reviews.
clf = LogisticRegression()
clf = clf.fit(X_train, train_labels)


In [None]:
from sklearn import metrics
y_pred = clf.predict(X_test)
# Model accuracy: the fraction of test reviews classified correctly.
# accuracy_score takes the ground-truth labels first and the predictions second.
print("Accuracy:", metrics.accuracy_score(test_labels, y_pred))



In [None]:
from gensim.models import KeyedVectors
# Load the pretrained vectors from the same relative location the earlier
# GoogleNews cell uses, instead of an absolute path that only exists on one
# machine. NUM_W2V_TO_LOAD caps how many vectors are read.
google_sentiment_w2v = KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    limit=NUM_W2V_TO_LOAD,
)

# The loaded vectors already form a (n_words x vector_size) matrix whose row i
# belongs to index_to_key[i]; reference it directly rather than copying it
# row by row into a second array.
embeddings_google_sentiment = google_sentiment_w2v.vectors

print(np.shape(embeddings_google_sentiment))

In [None]:
# Rebuild shuffled (reviews, labels) pairs. create_corpus draws from numpy's
# global random state, so the order of these two calls matters.
train_data, train_labels = create_corpus(read_samples(pos_train_dir), read_samples(neg_train_dir))
test_data, test_labels = create_corpus(read_samples(pos_test_dir), read_samples(neg_test_dir))


def google_nbow(reviews):
    """Represent each review by the mean GoogleNews vector of its known tokens.

    Args:
        reviews: Sequence of raw review strings.

    Returns:
        numpy.ndarray: Array of shape (len(reviews), vector_size). A review
        without any token in the vocabulary keeps an all-zero row.
    """
    features = np.zeros((len(reviews), google_sentiment_w2v.vector_size))
    for row, rev in enumerate(reviews):
        words_included = 0
        # Tokenize the current review.
        for tok in preproc_tok(rev):
            # For each token check if it has a w2v representation
            # and if yes add it.
            if tok in google_sentiment_w2v:
                features[row] += google_sentiment_w2v[tok]
                words_included += 1
        # Get the mean value; skip the division when no token was known, which
        # would otherwise turn the row into NaN.
        if words_included:
            features[row] /= words_included
    return features


X_google_train = google_nbow(train_data)
X_google_test = google_nbow(test_data)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (constructed with its default settings)
# on the GoogleNews-based features of the training reviews.
clf = LogisticRegression()
clf = clf.fit(X_google_train, train_labels)


In [None]:
from sklearn import metrics
y_pred = clf.predict(X_google_test)

# Model accuracy: the fraction of test reviews classified correctly.
# accuracy_score takes the ground-truth labels first and the predictions second.
print("Accuracy:", metrics.accuracy_score(test_labels, y_pred))
