In [None]:
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle

In [None]:
from utils.preprocess import Preprocess
from utils.loader import load_data

In [None]:
# Split identifiers for each corpus, listed as train / val / test.
# Only the movie corpus is used in the cells visible below.
movie_path = ['movie_train','movie_val','movie_test']
flight_path = ['flight_train','flight_val','flight_test']
laptop_path = ['laptop_train','laptop_val','laptop_test']

# Fixed: the argument was misspelled `moive_path`, which raised NameError
# on a fresh kernel before any data could be loaded.
Preprocess(movie_path)

# `movie` is expected to expose .train/.val/.test, each with .data and .labels
# (that is how the later cells access it).
movie = load_data(movie_path)

In [None]:
# Bag-of-words counts fitted on the training split only. Tokens are runs of
# word characters, case is preserved, and terms that occur in fewer than 10
# training documents are dropped.
vectorizer = CountVectorizer(token_pattern = r"\w+",min_df = 10,lowercase = False)
vectorized_train = vectorizer.fit_transform(movie.train.data)

# `dictionary[i]` is the term for column i of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement when it exists and fall back for older installations.
# The result is kept as a plain list so indexing and `in` behave as before.
if hasattr(vectorizer,"get_feature_names_out"):
    dictionary = list(vectorizer.get_feature_names_out())
else:
    dictionary = vectorizer.get_feature_names()

### 1. word2vec

In [None]:
# Train 50-dimensional word2vec embeddings on the whitespace-tokenised
# training texts (window of 5, words seen fewer than 10 times are ignored).
# gensim 4.0 renamed the `size` keyword to `vector_size`; an unknown keyword
# raises TypeError at construction time, so try the new name first and fall
# back to the old one for gensim < 4.0.
try:
    w2v = word2vec.Word2Vec([line.split() for line in movie.train.data],
                            vector_size = 50,window = 5,min_count = 10,workers = 8)
except TypeError:
    w2v = word2vec.Word2Vec([line.split() for line in movie.train.data],
                            size = 50,window = 5,min_count = 10,workers = 8)

In [None]:
def get_vec(src,size,vectors = None,vocab = None):
    """Embed each text as 100x the sum of its known word vectors.

    Parameters
    ----------
    src : iterable of str
        Texts; each one is tokenised with ``str.split()``.
    size : int
        Dimensionality of the word vectors (and of every output row).
    vectors : mapping from word to vector, optional
        Supports ``word in vectors`` and ``vectors[word]``. Defaults to the
        notebook-level ``w2v.wv``.
    vocab : iterable of str, optional
        Words allowed to contribute. Defaults to the notebook-level
        ``dictionary``.

    Returns
    -------
    numpy.ndarray of shape (number of texts, size)
        Texts with no usable word map to an all-zero row.
    """
    if vectors is None:
        vectors = w2v.wv
    if vocab is None:
        vocab = dictionary
    # A set gives O(1) membership tests; scanning the list for every token
    # was linear in the vocabulary size.
    known = set(vocab)

    rows = []
    for line in src:
        acc = np.zeros(size)
        for word in line.split():
            # The second test guards against a word that is in `vocab` but has
            # no embedding, which previously raised KeyError.
            if word in known and word in vectors:
                acc += vectors[word]
        # Same x100 rescaling as before, applied to the summed vector.
        rows.append(acc*100)

    if not rows:
        # Keep a 2-D shape for empty input so callers can rely on (n, size).
        return np.zeros((0,size))
    return np.array(rows)

# Embed the three splits in train / val / test order with 50-dimensional vectors.
vec_train,vec_val,vec_test = (
    get_vec(split.data,50) for split in (movie.train,movie.val,movie.test)
)

In [None]:
# Manual epoch loop: with max_iter=1 and warm_start=True every fit() call runs
# a single saga pass starting from the previous coefficients, which lets us
# watch validation accuracy as training proceeds.
logistic = LogisticRegression(max_iter = 1,solver='saga',warm_start = True)
for epoch in range(500):
    logistic.fit(vec_train,np.array(movie.train.labels))
    if (1+epoch)%50 == 0:
        # Predict only on reporting epochs; the result was unused on the other
        # 49 of every 50 iterations. The last epoch still reports, so
        # `pred_val` ends up holding the final model's predictions as before.
        pred_val = logistic.predict(vec_val)
        print(sum(pred_val == movie.val.labels)/len(movie.val.labels))

In [None]:
# Score the word2vec-feature classifier on the test split
# (per-class precision / recall / F1, printed to 5 decimals).
pred_test = logistic.predict(vec_test)
print(classification_report(y_true = movie.test.labels,
                            y_pred = pred_test,
                            digits = 5))

### 2. TF-IDF

In [None]:
# TF-IDF with the transformer's default settings: IDF statistics are learned
# from the training counts, then reused unchanged for validation and test.
transformer = TfidfTransformer()
tfidf_train = transformer.fit_transform(vectorized_train).toarray()
tfidf_val,tfidf_test = (
    transformer.transform(vectorizer.transform(split.data)).toarray()
    for split in (movie.val,movie.test)
)

In [None]:
# Manual epoch loop on the dense TF-IDF features: with max_iter=1 and
# warm_start=True every fit() call runs a single saga pass starting from the
# previous coefficients, so validation accuracy can be tracked during training.
logistic = LogisticRegression(max_iter = 1,solver='saga',warm_start = True)
for epoch in range(500):
    logistic.fit(tfidf_train,np.array(movie.train.labels))
    if (1+epoch)%50 == 0:
        # Predict only on reporting epochs; the result was unused on the other
        # 49 of every 50 iterations. The last epoch still reports, so
        # `pred_val` ends up holding the final model's predictions as before.
        pred_val = logistic.predict(tfidf_val)
        print(sum(pred_val == movie.val.labels)/len(movie.val.labels))

In [None]:
# Score the TF-IDF-feature classifier on the test split
# (per-class precision / recall / F1, printed to 5 decimals).
pred_test = logistic.predict(tfidf_test)
print(classification_report(y_true = movie.test.labels,
                            y_pred = pred_test,
                            digits = 5))

### 3. TFIDF2vec (TF-IDF-weighted word2vec)

In [None]:
# Recompute TF-IDF with norm=None so rows keep their raw tf*idf weights;
# these weights are what scale the word vectors in the next cell.
# Note: this rebinds transformer / tfidf_* from the previous section.
transformer = TfidfTransformer(norm = None)
tfidf_train = transformer.fit_transform(vectorized_train).toarray()
tfidf_val,tfidf_test = (
    transformer.transform(vectorizer.transform(split.data)).toarray()
    for split in (movie.val,movie.test)
)

In [None]:
def get_tfidf_vec(src,size,vectors = None,vocab = None):
    """Embed each row of term weights as a weighted sum of word vectors.

    Output row r is ``sum_i src[r][i] * vectors[vocab[i]]``.

    Parameters
    ----------
    src : 2-D array-like of shape (number of documents, number of terms)
        Dense term weights with one column per entry of ``vocab``.
    size : int
        Dimensionality of the word vectors (and of every output row).
    vectors : mapping from word to vector, optional
        Supports ``word in vectors`` and ``vectors[word]``. Defaults to the
        notebook-level ``w2v.wv``.
    vocab : sequence of str, optional
        Term for each column of ``src``. Defaults to the notebook-level
        ``dictionary``.

    Returns
    -------
    numpy.ndarray of shape (number of documents, size)
    """
    if vectors is None:
        vectors = w2v.wv
    if vocab is None:
        vocab = dictionary

    weights = np.asarray(src,dtype = float)
    if weights.size == 0:
        # No documents, or no terms: nothing to sum, so every row is zero.
        return np.zeros((len(weights),size))

    # Row i holds the embedding of vocab[i]. A term without an embedding keeps
    # a zero row and therefore contributes nothing (it used to raise KeyError).
    embedding = np.zeros((len(vocab),size))
    for idx,word in enumerate(vocab):
        if word in vectors:
            embedding[idx] = vectors[word]

    # One matrix product replaces the per-document, per-term Python loop.
    # Results can differ from the loop in the last floating-point bits only.
    return np.dot(weights,embedding)

# Turn each split's TF-IDF matrix into 50-dimensional weighted embeddings,
# in train / val / test order.
tfidf_vec_train,tfidf_vec_val,tfidf_vec_test = (
    get_tfidf_vec(weights,50) for weights in (tfidf_train,tfidf_val,tfidf_test)
)

In [None]:
# Manual epoch loop on the TF-IDF-weighted embeddings: with max_iter=1 and
# warm_start=True every fit() call runs a single saga pass starting from the
# previous coefficients, so validation accuracy can be tracked during training.
logistic = LogisticRegression(max_iter = 1,solver='saga',warm_start = True)
for epoch in range(500):
    logistic.fit(tfidf_vec_train,np.array(movie.train.labels))
    if (1+epoch)%50 == 0:
        # Predict only on reporting epochs; the result was unused on the other
        # 49 of every 50 iterations. The last epoch still reports, so
        # `pred_val` ends up holding the final model's predictions as before.
        pred_val = logistic.predict(tfidf_vec_val)
        print(sum(pred_val == movie.val.labels)/len(movie.val.labels))

In [None]:
# Score the TFIDF2vec-feature classifier on the test split
# (per-class precision / recall / F1, printed to 5 decimals).
pred_test = logistic.predict(tfidf_vec_test)
print(classification_report(y_true = movie.test.labels,
                            y_pred = pred_test,
                            digits = 5))