In [4]:
import warnings, numpy as np, re, json, gnumpy as gpu, pandas as pd
from TurkishStemmer import TurkishStemmer
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim, unidecode, math
from googletrans import Translator
from gensim.models import doc2vec
import  nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility

## Loading Reviews and Labels

In [3]:
## TODO: remove the random sampling below and run on the full dataset
import io, random
# Number of reviews drawn per language while prototyping. The fixed seed makes
# the drawn subset (and everything derived from it) repeatable across runs.
SAMPLE_SIZE = 500
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

with open('crawler/movies_data/en_reviews.json','r') as en:
    en_all = json.load(en) # [movie id, [review, score]]
with io.open('crawler/movies_data/tr_reviews.json','r', encoding='utf-8') as tr:
    tr_all = json.load(tr)

# random.sample raises ValueError when asked for more items than exist, so the
# sample size is capped at the number of reviews actually loaded.
en_revs = random.sample(en_all, min(SAMPLE_SIZE, len(en_all)))
tr_revs = random.sample(tr_all, min(SAMPLE_SIZE, len(tr_all)))

IOError: [Errno 2] No such file or directory: 'crawler/movies_data/en_reviews.json'

## Getting pretrained English Word Vectors

In [8]:
# Load the word vectors stored in word2vec binary format in
# "GoogleNews-vectors-negative300.bin" as a gensim KeyedVectors object.
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"GoogleNews-vectors-negative300.bin", binary=True)

In [9]:
# Length of the zero vector that paragraph_vector() starts accumulating from.
WORDS_DIMENSION = 300
# Stemmer applied by clean() to every word when the 'tr_vects' vocabulary is selected.
stemmer = TurkishStemmer()
# Translation client used by get_long_translation().
translator = Translator()

def document_vector(document):
    """Construct and return a gensim Doc2Vec model built from `document`.

    The model is created with size=300, window=300, min_count=10 and
    workers=4; `document` is passed through unchanged as the first argument.
    """
    model_options = dict(size=300, window=300, min_count=10, workers=4)
    return doc2vec.Doc2Vec(document, **model_options)
    
def paragraph_vector(document, vectDicts='en_vects', useGPU=False):
    """Average the word vectors of `document` into a single vector.

    Parameters
    ----------
    document : sequence of str
        Tokens of one review (for example the output of clean()).
    vectDicts : str
        Name of the module-level vector lookup to use, e.g. 'en_vects' or
        'tr_vects'. It is resolved through globals().
    useGPU : bool
        When true the result is wrapped with gpu.garray, otherwise np.array.

    Returns
    -------
    A vector of length WORDS_DIMENSION. Words missing from the lookup add
    nothing to the sum but still count in the divisor (len(document)).
    An empty document yields the zero vector.
    """
    # Resolve the lookup once instead of twice per word.
    vectors = globals()[vectDicts]
    total = np.zeros(WORDS_DIMENSION)
    for word in document:
        if word in vectors:
            total += vectors[word]
#     total += document_vector(document)
#     paragraph_vec = total / (len(document)+1)
    n_words = len(document)
    if n_words:
        paragraph_vec = total / n_words
    else:
        # Dividing by zero here would produce an all-NaN vector, which breaks
        # any model later fitted on it; fall back to the zero vector.
        paragraph_vec = total
    if useGPU:
        return gpu.garray(paragraph_vec)
    return np.array(paragraph_vec)

In [10]:
# nltk.download()

## Preparing Turkish Word Vectors

In [11]:
# Pull the review text (second-level element 0) out of every Turkish record.
turkish_corpus = [rev[1][0] for rev in tr_revs]

# Tokenize every review with NLTK's Turkish tokenizer after decoding it as UTF-8.
tok_corpus = []
for sent in turkish_corpus:
    tok_corpus.append(nltk.word_tokenize(sent.decode('utf-8'), language='turkish'))

# Train 300-dimensional word vectors on the tokenized reviews, keeping every
# word that appears at least once, and persist the model to disk.
tr_vects = gensim.models.Word2Vec(tok_corpus, min_count=1, size=300)
tr_vects.save("tr_vects.gnsm")

In [12]:
# Reload the Word2Vec model from "tr_vects.gnsm", the file the training cell saves to.
tr_vects = gensim.models.Word2Vec.load("tr_vects.gnsm")

In [13]:
def clean(text, vects='en_vects'):
    """Split `text` on single spaces and keep only tokens known to a vocabulary.

    Parameters
    ----------
    text : str
        Raw review text.
    vects : str
        Name of the module-level vector lookup to filter against, resolved
        through globals(). With 'tr_vects' each word is lowercased, UTF-8
        encoded and stemmed with the Turkish stemmer before the lookup;
        otherwise the word is only lowercased.

    Returns
    -------
    list
        The normalised tokens that are present in the chosen vocabulary.
    """
    vocab = globals()[vects]
    if vects == 'tr_vects':
        kept = []
        for word in text.split(" "):
            w = stemmer.stem(word.lower().encode("utf-8"))
            if w in vocab:
                kept.append(w)
        return kept
    # Test membership on the lowercased token, i.e. the same string that is
    # returned. The earlier version tested the original-case word, so it could
    # return tokens absent from the vocabulary.
    lowered = (word.lower() for word in text.split(" "))
    return [word for word in lowered if word in vocab]

In [14]:
# Number of English reviews currently loaded.
len(en_revs)

500

In [15]:
def sigm(x, deriv=False, useGPU=False):
    """Logistic sigmoid, or its derivative written in terms of the sigmoid output.

    Parameters
    ----------
    x : scalar or array
        Input value(s). With deriv=True, `x` is used as an already-computed
        sigmoid output s and the function returns s * (1 - s).
    deriv : bool
        Return the derivative expression instead of the sigmoid itself.
    useGPU : bool
        Compute the sigmoid with gpu.logistic instead of NumPy. It has no
        effect when deriv=True.

    Returns
    -------
    Same shape as `x`.
    """
    if deriv:
        # The original `x(1-x)` tried to *call* x and raised TypeError.
        return x * (1 - x)
    if useGPU:
        return gpu.logistic(x)
    return 1.0 / (1.0 + np.exp(-x))

In [16]:
def parse_revs(loaded_data):
    """Group review records into a nested {movie id: {score: [reviews]}} dict.

    Each record is indexed as record[0] for the movie id, record[1][0] for the
    review and record[1][1] for the score. Reviews sharing a movie id and a
    score are collected in input order.
    """
    grouped = {}
    for record in loaded_data:
        mov_id = record[0]
        details = record[1]
        rev, score = details[0], details[1]
        grouped.setdefault(mov_id, {}).setdefault(score, []).append(rev)
    return grouped

In [17]:
# Group the loaded review lists by movie id and then by score.
revs_dict_src = parse_revs(en_revs)  # {id: {score: [revs]}} >> For English
revs_dict_tgt = parse_revs(tr_revs)  # {id: {score: [revs]}} >> For Turkish

In [18]:
# Persist both grouped dictionaries as JSON files in the working directory.
# NOTE(review): json.dump writes dictionary keys as strings, so non-string
# movie ids / scores would not round-trip with their original type - confirm
# against whatever reads these files.
with open('revs_dict_src.json', 'w') as f:
    json.dump(revs_dict_src, f)
with open('revs_dict_tgt.json', 'w') as f:
    json.dump(revs_dict_tgt, f)

## Building a DataFrame from the review dictionaries

In [26]:
def fill_dfraw(lang, datadict):
    """Flatten a {movie: {score: [reviews]}} dict into a list of row tuples.

    Every review becomes one (lang, movie, review, score) tuple; rows follow
    the iteration order of the nested dictionaries.
    """
    return [
        (lang, movie, rev, score)
        for movie in datadict
        for score in datadict[movie]
        for rev in datadict[movie][score]
    ]
# Flatten both dictionaries into one list of (language, movie, review, score)
# rows: English rows first, Turkish rows appended after them.
df_raw = fill_dfraw("EN", revs_dict_src)
df_raw += fill_dfraw("TR", revs_dict_tgt)

In [28]:
# Build the combined review table, save it as CSV and preview the first rows.
# (A dangling `df.` line that made this cell a syntax error was removed.)
df = pd.DataFrame(df_raw, columns=["Language","Movie_ID","Review","Score"])
df.to_csv("movie_data.csv")
df.head(5)

Unnamed: 0,Language,Movie_ID,Review,Score
0,EN,-1018312192,good interpretation of the actors # good speci...,9
1,EN,-1112579067,before watching the pursuit of happiness i did...,8
2,EN,-1084746746,while this movie may at most times be highly f...,9
3,EN,-836740088,gattaca refers to a fictional aerospace compan...,10
4,EN,148997333,admittedly i read one or two comments from the...,3


In [14]:
def get_long_translation(text, src='en', dest='tr'):
    """Translate `text` in chunks and return the normalised translation.

    The text is cut into equally sized character chunks of at most 5000
    characters. Each chunk is translated with the module-level `translator`,
    transliterated to ASCII with unidecode, lowercased, and stripped of every
    character that is not a-z, 0-9, whitespace or '+'. The cleaned chunks are
    joined with single spaces.

    Parameters
    ----------
    text : str
        Text to translate. An empty value returns '' without calling the
        translator.
    src, dest : str
        Source and destination language codes passed to translator.translate.

    Returns
    -------
    str
        The cleaned, lowercased translation.
    """
    if not text:
        # With no characters the chunk count below is zero and the step
        # computation would raise ZeroDivisionError.
        return ''
    max_chunk = 5000.0
    n_chunks = int(math.ceil(len(text) / max_chunk))
    # Round the chunk length *up* so the text splits into at most n_chunks
    # pieces; rounding down could leave a tiny extra chunk at the end.
    step = int(math.ceil(len(text) / float(n_chunks)))
    pieces = []
    for x in range(0, len(text), step):
        translated = translator.translate(text[x:x+step], src=src, dest=dest).text
        normalised = unidecode.unidecode(translated).lower()
        pieces.append(re.sub(r'[^a-z0-9\s+]+', '', normalised))
    return ' '.join(pieces)

In [27]:
def _pack_sample(T, Tt, St, S, movie_indx, rev_indx, labels, useGPU):
    """Convert the accumulated lists to arrays and bundle them with the resume indices."""
    to_array = gpu.garray if useGPU else np.array
    return to_array(T), to_array(Tt), to_array(St), to_array(S), movie_indx, rev_indx, to_array(labels)


def pick_sample(n=2000, movie_indx=0, rev_indx=0, useGPU=False):
    """Build up to `n` aligned paragraph-vector samples from both review dicts.

    Movies are taken from revs_dict_tgt; only movies and scores that also
    exist in revs_dict_src are used. For such a movie/score pair, the r-th
    Turkish review is paired with the r-th English review, up to the shorter
    of the two lists. Each pair contributes one row to each output:

    * T  - Turkish review, vectorised with 'tr_vects'
    * Tt - Turkish review translated to English, vectorised with 'en_vects'
    * St - English review translated to Turkish, vectorised with 'tr_vects'
    * S  - English review, vectorised with 'en_vects'

    Parameters
    ----------
    n : int
        Stop as soon as this many samples have been collected.
    movie_indx : int
        Position (in revs_dict_tgt iteration order) of the movie to start from.
    rev_indx : int
        Review position to start from within the starting movie.
    useGPU : bool
        Return gpu.garray objects instead of NumPy arrays.

    Returns
    -------
    tuple
        (T, Tt, St, S, movie_indx, rev_indx, labels). The two indices are the
        position to pass back in to continue sampling; labels holds the score
        of every sample.
    """
    T, Tt, St, S, labels = [], [], [], [], []
    # Snapshot the keys once: indexing dict.keys() inside the loop rebuilt the
    # key list on every iteration and only works on Python 2.
    movie_ids = list(revs_dict_tgt)
    while movie_indx < len(movie_ids):
        mov_id = movie_ids[movie_indx]
        if mov_id not in revs_dict_src:
            movie_indx += 1
            # The review offset only applies to the movie we resumed in.
            rev_indx = 0
            continue
        # NOTE(review): rev_indx is applied to *every* score of the movie, so
        # resuming in a movie with several shared scores can repeat or skip
        # reviews. Fixing that needs a score position in the return value.
        for score in revs_dict_tgt[mov_id]:
            if score not in revs_dict_src[mov_id]:
                continue
            tgt_revs = revs_dict_tgt[mov_id][score]
            src_revs = revs_dict_src[mov_id][score]
            for r in range(rev_indx, min(len(tgt_revs), len(src_revs))):
                print(len(T))  # progress: samples collected so far
                T.append(paragraph_vector(clean(tgt_revs[r], 'tr_vects')))
                Tt.append(paragraph_vector(clean(get_long_translation(tgt_revs[r], src='tr', dest='en'))))
                St.append(paragraph_vector(clean(get_long_translation(src_revs[r], src='en', dest='tr'), 'tr_vects')))
                S.append(paragraph_vector(clean(src_revs[r])))
                labels.append(score)
                if len(T) == n:
                    rev_indx = r + 1
                    return _pack_sample(T, Tt, St, S, movie_indx, rev_indx, labels, useGPU)
        movie_indx += 1
        rev_indx = 0
    return _pack_sample(T, Tt, St, S, movie_indx, rev_indx, labels, useGPU)

In [5]:
# Collect up to 500 aligned samples; movie_indx / rev_indx are the positions
# pick_sample returns alongside the vectors and labels.
T, Tt, St, S, movie_indx, rev_indx, labels = pick_sample(500)

NameError: name 'pick_sample' is not defined

In [26]:
# pick_sample appends to all four lists together, so these lengths should match.
len(T), len(S), len(Tt), len(St)

(134, 134, 134, 134)

In [None]:
from sklearn import cross_validation, svm
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 10% of the English review vectors (S) and their scores for testing.
# NOTE(review): no random_state is passed, so the split presumably differs between runs - confirm.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(S, labels, test_size=0.1)

In [None]:
# Fit a linear regression that predicts the review score from the paragraph vector.
clf = LinearRegression()
clf.fit(x_train, y_train)

In [None]:
# Translate the single word 'hello' to Turkish and display the returned text.
translator.translate('hello', dest='tr').text

In [None]:
# Evaluate the fitted model on the held-out split.
clf.score(x_test, y_test)