### Imports

In [None]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_embedding import BertEmbedding
from allennlp.commands.elmo import ElmoEmbedder
from sklearn.model_selection import train_test_split

from transformers import *
import torch
import keras

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils

### Definitions

In [None]:
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Every line is evaluated as a Python expression and the resulting object
    is yielded, so the file is consumed lazily, one line at a time.

    Args:
        path: Location of the gzip file to read.

    Yields:
        The object obtained by evaluating each line of the file.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed; the handle used to stay open indefinitely.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() runs whatever code the line contains. Only feed
            # this function files from a trusted source.
            yield eval(l)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are numbered from 0 in the order they are read; that number
    becomes the row label of the resulting frame. The iteration is wrapped
    in ``tqdm``.

    Args:
        path: Location of the file handed to ``parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """
    numbered_records = dict(enumerate(tqdm(parse(path))))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [None]:
def process_df(df):
    """Replace the ``'text'`` column of *df* with its preprocessed version.

    The column is passed to ``my_utils.preprocess`` and the result is written
    back into the same frame, which is then returned.

    Args:
        df: Frame containing a ``'text'`` column.

    Returns:
        The same frame object, with ``'text'`` overwritten.
    """
    cleaned_text = my_utils.preprocess(df['text'])
    df['text'] = cleaned_text
    return df

### Start

In [None]:
# Read the reviews_Movies_and_TV_5 dump into a DataFrame (one row per record).
dataset_movies = getDF('datasets_raw/reviews_Movies_and_TV_5.json.gz')
# Last expression of the cell: evaluates to the frame's (rows, columns) shape.
dataset_movies.shape

In [None]:
# Read the reviews_Home_and_Kitchen_5 dump into a DataFrame (one row per record).
dataset_home = getDF('datasets_raw/reviews_Home_and_Kitchen_5.json.gz')
# Last expression of the cell: evaluates to the frame's (rows, columns) shape.
dataset_home.shape

In [None]:
# Read the reviews_Kindle_Store_5 dump into a DataFrame (one row per record).
dataset_kindle = getDF('datasets_raw/reviews_Kindle_Store_5.json.gz')
# Last expression of the cell: evaluates to the frame's (rows, columns) shape.
dataset_kindle.shape

In [None]:
# Stack the three review frames into a single frame. Every frame built by
# getDF is labelled 0..len-1, so ignore_index=True renumbers the rows;
# otherwise the combined frame carries duplicate index labels, which makes
# label-based lookups ambiguous and can break index-aligned assignment.
dataset = pd.concat([dataset_movies, dataset_home, dataset_kindle], ignore_index=True)

In [None]:
# Discard every column other than the review body and its rating.
dataset = dataset.drop(
    [
        'reviewerID',
        'asin',
        'reviewerName',
        'helpful',
        'summary',
        'unixReviewTime',
        'reviewTime',
    ],
    axis='columns',
)

In [None]:
# Give the two remaining columns the names used by the rest of the notebook.
dataset = dataset.rename(
    {'reviewText': 'text', 'overall': 'sentiment'},
    axis='columns',
)

In [None]:
# Sanity check: (rows, columns) of the combined frame after dropping/renaming.
dataset.shape

In [None]:
# Number of worker processes used for the parallel preprocessing step.
# Capped at the machine's CPU count so hosts with fewer than 30 cores are not
# oversubscribed; hosts with 30 or more cores keep the original value of 30.
n_cores = min(30, multiprocessing.cpu_count())

In [None]:
# Cut the frame into contiguous row chunks of n rows each so the chunks can be
# preprocessed in parallel. max(1, ...) keeps the chunk size positive when the
# frame has fewer rows than n_cores; a size of 0 would make range() raise.
n = max(1, dataset.shape[0] // n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0], n)]

# The context manager shuts the pool down even when a worker raises, so no
# worker processes are left behind after a failed run.
with multiprocessing.Pool(n_cores) as pool:
    processed_list_df = pool.map(process_df, list_df)

# pool.map returns the chunks in submission order, so concatenating them
# restores the original row order. pd.concat rejects an empty list, hence an
# empty dataset is left untouched.
if processed_list_df:
    dataset = pd.concat(processed_list_df)
dataset.shape

In [None]:
# vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
#                              stop_words="english", max_features=max_features,
#                              max_df=max_df, min_df=min_df)

In [None]:
# wordOccurenceMatrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()

In [None]:
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

In [None]:
# Create the (still untrained) FastText model, requesting size=100.
# NOTE(review): `size` is presumably the vector dimensionality and looks like
# the gensim 3.x spelling of that keyword (later releases are believed to use
# `vector_size`) - confirm against the installed gensim version.
model_gensim = FT_gensim(size=100)

In [None]:
# Build the model vocabulary from the preprocessed review texts.
# NOTE(review): assumes every entry of dataset.text is already a list of
# tokens (i.e. my_utils.preprocess tokenizes) - TODO confirm.
model_gensim.build_vocab(sentences=dataset.text)

In [None]:
# Display the three values that the training call below passes on as
# epochs, total_examples and total_words.
model_gensim.epochs, model_gensim.corpus_count, model_gensim.corpus_total_words

In [None]:
# Train on the same sentences used to build the vocabulary, reusing the
# epoch count and corpus statistics stored on the model.
model_gensim.train(
    sentences=dataset.text,
    total_examples=model_gensim.corpus_count,
    total_words=model_gensim.corpus_total_words,
    epochs=model_gensim.epochs,
)

In [None]:
# Persist the trained model under the name 'saved_model_gensim'
# (a relative path, so it is resolved against the current working directory).
model_gensim.save('saved_model_gensim')

In [None]:
# Reload the model from the file written by the save step, to check that the
# saved artefact can be read back.
loaded_model = FT_gensim.load('saved_model_gensim')

In [None]:
# Look up the embedding of 'hello' through the model's keyed vectors (.wv).
# Indexing the model object directly is the deprecated access path in gensim
# and is not available in newer releases; .wv is the supported one.
loaded_model.wv['hello']

In [None]:
# Look up the embedding of 'hi' through the model's keyed vectors (.wv).
# Indexing the model object directly is the deprecated access path in gensim
# and is not available in newer releases; .wv is the supported one.
loaded_model.wv['hi']