In [139]:
import warnings, numpy as np, re, json, pandas as pd, pickle
try:
    import gnumpy as gpu
except ModuleNotFoundError:
    pass
from TurkishStemmer import TurkishStemmer
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim, math
from gensim.models import doc2vec
import  nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
# from KaggleWord2VecUtility import KaggleWord2VecUtility

In [150]:
# Load the raw review table; the preview printed below lists the columns
# Language, Movie_ID, Review and Score.
df = pd.read_csv("datasets/movie_data.csv")
df.head(5)

Unnamed: 0,Language,Movie_ID,Review,Score
0,en,-800777728,i love science fiction and i hate superheroes ...,9
1,en,-800777728,the movie is absolutely incredible all the per...,10
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8
3,en,-1018312192,movie review on rise of the planet of the apes...,4
4,en,-1018312192,during experiments to find a cure for alzheime...,7


In [39]:
# Load the English word vectors from a binary word2vec-format file.
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"GoogleNews-vectors-negative300.bin", binary=True)

In [13]:
# Load the Turkish model from a file previously saved with gensim.
tr_vects = gensim.models.Word2Vec.load("tr_vects.gnsm")

In [57]:
# Per-language caches filled by tokenize(): token -> vector of that token.
# They are two independent dictionaries, one per language.
tr_vocabs_ = {}
en_vocabs_ = {}

In [58]:
# Shared stemmer instance; tokenize() applies it only on the 'tr_vects' path.
stemmer = TurkishStemmer()
def tokenize(text, vects='en_vects'):
    """Turn one review into the list of its words found in a vector model.

    The text is split on single spaces and every piece is lower-cased. When
    ``vects`` is ``'tr_vects'`` each piece is additionally UTF-8 encoded and
    passed through the module-level ``stemmer``. A token is kept only if its
    length is greater than 2 and it is present in the vector model. The vector
    of every kept token is also stored in ``tr_vocabs_`` (for ``'tr_vects'``)
    or ``en_vocabs_`` (for any other model name).

    Args:
        text: The review text. ``None``/NaN (what pandas yields for an empty
            CSV field) is treated as a review without tokens.
        vects: Name of a module-level global that holds the word vectors.

    Returns:
        list: The kept tokens in order of appearance, duplicates included.

    Raises:
        KeyError: If there is no global named ``vects``.
    """
    # A missing review has no .split(); return no tokens instead of crashing.
    if pd.isnull(text):
        return list()

    is_turkish = vects == 'tr_vects'
    # Resolve the model once per review rather than twice per word.
    vectors = globals()[vects]
    vocab_cache = tr_vocabs_ if is_turkish else en_vocabs_

    kept_tokens = list()
    for word in text.split(" "):
        w = word.lower()
        if is_turkish:
            w = stemmer.stem(w.encode("utf-8"))
        # Cheap length test first; the membership test only runs when needed.
        if len(w) > 2 and w in vectors:
            vocab_cache[w] = vectors[w]
            kept_tokens.append(w)
    return kept_tokens

In [151]:
# Tokenize every review with the vector model matching its language code
# (e.g. "en" -> "en_vects"). The row values are read by column label because
# positional access on a label-indexed row (x[0], x[1]) is deprecated in
# recent pandas releases.
df["tokenized_reviews"] = df[["Language", "Review"]].apply(
    lambda row: tokenize(row["Review"], row["Language"] + "_vects"), axis=1
)

In [152]:
# Preview the token lists of the first five reviews.
df[["tokenized_reviews"]].head(5)

Unnamed: 0,tokenized_reviews
0,"[love, science, fiction, hate, superheroes, bu..."
1,"[the, movie, absolutely, incredible, all, the,..."
2,"[cinematic, era, dominated, reboots, mindless,..."
3,"[movie, review, rise, the, planet, the, apes, ..."
4,"[during, experiments, find, cure, for, alzheim..."


In [153]:
# Persist the frame, including the new token column, without the index.
# NOTE(review): the token lists are presumably written as their string form,
# so they would need parsing when read back — confirm before relying on it.
df.to_csv("datasets/tokenized_reviews.csv", index=False)

In [68]:
# Report how many distinct tokens tokenize() recorded for each language.
print("Turkish Vocab: %d words" % len(tr_vocabs_))
print("Enlish Vocab: %d words" % len(en_vocabs_))

Turkish Vocab: 1299 words
Enlish Vocab: 10380 words


# Clustering similar words in each language

## Getting vocabs to map them to their clusters

In [105]:
def get_vocabs_vects_XY(vocabs_dict):
    """Unzip a word -> vector mapping into two position-aligned numpy arrays.

    Returns a pair ``(X, y)``: ``X`` holds the vectors and ``y`` the words, in
    the mapping's iteration order, so ``X[i]`` is the vector of ``y[i]``.
    """
    words = [word for word in vocabs_dict]
    vectors = [vocabs_dict[word] for word in words]
    return np.array(vectors), np.array(words)

In [106]:
# Split each vocabulary dict into an array of vectors (X_*) and the
# position-aligned array of words (y_*).
X_en, y_en = get_vocabs_vects_XY(en_vocabs_)
X_tr, y_tr = get_vocabs_vects_XY(tr_vocabs_)

## K-Means clustering
### 1000 English clusters
### 300 Turkish clusters

In [2]:
from sklearn.cluster import k_means

In [116]:
# Cluster the word vectors of each language separately, with a fixed seed.
# word2cluster() reads element 0 of each result as the centroids and
# element 1 as the per-word labels.
en_clusters=k_means(X_en, n_clusters=1000, random_state=0)
tr_clusters=k_means(X_tr, n_clusters=300, random_state=0)

In [129]:
def word2cluster(vocab, clusters):
    """Map every word to the centroid of the cluster it was assigned to.

    ``clusters[0]`` is read as the centroids and ``clusters[1]`` as the labels,
    where ``labels[i]`` is the cluster index of ``vocab[i]``.
    """
    centroids = clusters[0]
    labels = clusters[1]
    return {
        word: centroids[labels[position]]
        for position, word in enumerate(vocab)
    }

In [130]:
# Build, per language, the dictionary from each vocabulary word to the
# centroid of the cluster it was assigned to.
en_word2cluster = word2cluster(y_en, en_clusters)
tr_word2cluster = word2cluster(y_tr, tr_clusters)

## Dump the word2cluster dictionaries to pickle files

In [142]:
# Pickle emits bytes, so the files must be opened in binary mode ("wb");
# text mode ("w") raises TypeError on Python 3. The context managers make
# sure each file is flushed and closed as soon as its dump finishes.
with open("datasets/en_word2cluster.pickle", "wb") as en_file:
    pickle.dump(en_word2cluster, en_file)
with open("datasets/tr_word2cluster.pickle", "wb") as tr_file:
    pickle.dump(tr_word2cluster, tr_file)

In [146]:
# Work on a copy of the frame whose index is rebuilt from an "index" column
# holding the current index values, so the resulting index carries that name.
# df itself is not modified.
df_vectorized = (
    df.copy()
    .assign(index=lambda frame: frame.index)
    .set_index("index")
)
df_vectorized.head(5)

Unnamed: 0_level_0,Language,Movie_ID,Review,Score,cleaned_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,en,-800777728,i love science fiction and i hate superheroes ...,9,"[love, science, fiction, hate, superheroes, bu..."
1,en,-800777728,the movie is absolutely incredible all the per...,10,"[the, movie, absolutely, incredible, all, the,..."
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8,"[cinematic, era, dominated, reboots, mindless,..."
3,en,-1018312192,movie review on rise of the planet of the apes...,4,"[movie, review, rise, the, planet, the, apes, ..."
4,en,-1018312192,during experiments to find a cure for alzheime...,7,"[during, experiments, find, cure, for, alzheim..."
