In [1]:
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string

In [5]:
def build_vocab(sentences):
    """Build frequency-ordered vocabulary lookups from tokenized sentences.

    Args:
        sentences: Iterable of token sequences (one sequence per sentence).

    Returns:
        A tuple ``(word_counts, vocabulary, vocabulary_inv)`` where
        ``word_counts`` is a ``Counter`` of token frequencies,
        ``vocabulary`` maps each token to its integer index, and
        ``vocabulary_inv`` lists the tokens from most to least frequent
        (so ``vocabulary_inv[vocabulary[w]] == w``).
    """
    # Count every token across all sentences.
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # Index -> word, ordered by descending frequency.
    vocabulary_inv = [token for token, _count in word_counts.most_common()]
    # Word -> index, the inverse of the list above.
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return word_counts, vocabulary, vocabulary_inv

In [6]:
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5):
    """Train a Word2Vec model and return one embedding row per vocabulary entry.

    Args:
        inp_data: Sentences encoded as lists of integer indices into
            ``vocabulary_inv``.
        vocabulary_inv: List mapping index -> word.
        size_features: Dimensionality of the word vectors.
        mode: ``'skipgram'`` or ``'cbow'``.
        min_word_count: Words seen fewer times than this are ignored by
            Word2Vec (passed as ``min_count``).
        context: Context window size (passed as ``window``).

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocabulary_inv), size_features)``.
        Row ``i`` is the trained vector of ``vocabulary_inv[i]``; words the
        model did not learn get a vector drawn uniformly from
        ``[-0.25, 0.25)``.

    Raises:
        ValueError: If ``mode`` is neither ``'skipgram'`` nor ``'cbow'``.
    """
    # Resolve the training algorithm first so an unsupported mode fails with a
    # clear message instead of an UnboundLocalError on `sg` further down.
    if mode == 'skipgram':
        sg = 1
    elif mode == 'cbow':
        sg = 0
    else:
        raise ValueError(
            "Unknown mode {!r}; expected 'skipgram' or 'cbow'".format(mode))

    model_name = "embedding"
    num_workers = 15  # Number of threads to run in parallel
    downsampling = 1e-3  # Downsample setting for frequent words
    print('Training Word2Vec model...')
    # Word2Vec is trained on word strings, so decode the index sequences.
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    print('Model: skip-gram' if sg == 1 else 'Model: CBOW')
    # NOTE(review): `size=` and `init_sims` are the pre-4.0 gensim spellings
    # used by this notebook; confirm the installed gensim version before
    # upgrading.
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    embedding_model.init_sims(replace=True)
    # NOTE: the message below is historical -- nothing is written to disk here.
    print("Saving Word2Vec model {}".format(model_name))

    # Look words up through the KeyedVectors object rather than the model
    # itself (model-level `in` / `[]` access is deprecated in gensim).
    word_vectors = embedding_model.wv
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i, word in enumerate(vocabulary_inv):
        if word in word_vectors:
            embedding_weights[i] = word_vectors[word]
        else:
            # Word was dropped by Word2Vec (e.g. below min_count): fall back to
            # a small random vector so it still has a representation.
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     size_features)
    return embedding_weights

In [7]:
def preprocess_df(df):
    """Return a copy of ``df`` whose ``"text"`` column has been cleaned.

    For every entry of ``df["text"]`` the function replaces ASCII punctuation
    with spaces, splits on whitespace, drops English stop words (plus
    ``'would'``) and single-character tokens, and joins the remaining tokens
    with single spaces.

    Args:
        df: DataFrame with a ``"text"`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified. Missing values
        (NaN/None) in ``"text"`` become empty strings.
    """
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def _clean(sent):
        # Missing reviews have no tokens; avoid AttributeError on float NaN.
        if pd.isna(sent):
            return ""
        tokens = str(sent).translate(translator).split()
        # Stop-word matching is case-sensitive: text is not lowercased, so
        # capitalized stop words (e.g. "The") are kept.
        return " ".join(tok for tok in tokens
                        if tok not in stop_words and len(tok) != 1)

    # Work on a copy so the caller's frame is not mutated as a side effect, and
    # iterate the column directly instead of materializing rows via iterrows().
    cleaned = df.copy()
    cleaned["text"] = [_clean(sent) for sent in df["text"]]
    return cleaned

In [8]:
data_path = "Data/"

# Load both splits; the raw review text lives in the "review" column and is
# copied to "text" so preprocessing does not overwrite the original.
df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")

df_train["text"] = df_train["review"]
df_test["text"] = df_test["review"]
df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)

# Tokenize each split exactly once; the training tokens are used both to build
# the vocabulary / embeddings and to build the document vectors.
tagged_train_data = [word_tokenize(doc) for doc in df_train["text"]]
tagged_test_data = [word_tokenize(doc) for doc in df_test["text"]]
tagged_data = tagged_train_data

word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
inp_data = [[vocabulary[word] for word in text] for text in tagged_data]
embedding_weights = get_embeddings(inp_data, vocabulary_inv)


def average_embedding(tokens, weights, vocab):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Tokens missing from ``vocab`` are skipped. A document with no known tokens
    (empty, or entirely out-of-vocabulary) yields a zero vector instead of
    raising ZeroDivisionError.
    """
    rows = [vocab[tok] for tok in tokens if tok in vocab]
    if not rows:
        return np.zeros(weights.shape[1])
    return weights[rows].mean(axis=0)


train_vec = [average_embedding(doc, embedding_weights, vocabulary)
             for doc in tagged_train_data]
test_vec = [average_embedding(doc, embedding_weights, vocabulary)
            for doc in tagged_test_data]

clf = LogisticRegression(max_iter=100000000).fit(train_vec, df_train["label"])
preds = clf.predict(test_vec)

# Ids are the positional index of each test row.
dic = {"Id": list(range(len(preds))), "Predicted": list(preds)}

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted.csv", index=False)

Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding




In [14]:
# Inspect the averaged word-embedding vector of the first training document.
train_vec[0]

array([ 0.05187569,  0.07635441,  0.0185177 ,  0.07069988, -0.01644366,
       -0.07307199, -0.14826442, -0.15318319, -0.03870092,  0.01208771,
        0.04295781, -0.05468978,  0.01027161, -0.04434469, -0.10359234,
       -0.02489309, -0.02115308,  0.05459466, -0.0199364 ,  0.02917993,
        0.08272176, -0.04944272,  0.00689092,  0.0334314 , -0.01661962,
       -0.02835405,  0.03777434, -0.03238566, -0.03536997,  0.0881428 ,
        0.05693689, -0.06150929,  0.12330765, -0.0092267 , -0.09279685,
       -0.08054416,  0.115674  ,  0.12240585, -0.03147379, -0.02998215,
       -0.11217732,  0.04644492, -0.0036658 ,  0.0528971 ,  0.1306516 ,
       -0.08707248,  0.01716317,  0.00494881, -0.01466959,  0.08002137,
        0.1077945 , -0.05920809,  0.08259955, -0.01092692, -0.04617465,
        0.00140286,  0.06613625, -0.00230198, -0.08665175,  0.01540626,
        0.04815543, -0.04479445,  0.09404107,  0.08037522,  0.00599573,
        0.05791441, -0.00473196, -0.02989393,  0.14082607, -0.03