In [1]:
import json
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [56]:
# Download the NLTK stop-word corpus that `stopwords.words("english")` reads
# when review embeddings are computed further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikhail/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Preprocessing

In [2]:
# Location of the review dataset (JSON), relative to the notebook's working
# directory. NOTE(review): the name is spelled "prepaired"; it is kept as-is
# because later cells reference it.
path_data_prepaired = '../dataset/dataset.json'

In [3]:
# Load the raw dataset. The encoding is given explicitly so the result does not
# depend on the platform's default text encoding (pd.read_json, used on the same
# file later, also decodes it as UTF-8 by default).
with open(path_data_prepaired, encoding='utf-8') as file_data:
    data = json.load(file_data)

In [4]:
from preprocessing import clear_sentences

In [5]:
%time sentences = clear_sentences(data)

CPU times: user 4.41 s, sys: 291 ms, total: 4.7 s
Wall time: 4.7 s


## 1) Word Embeddings as Matrix Factorization

In [6]:
from models import Word2Vec

In [7]:
model = Word2Vec(sentences)

In [8]:
# Build the model in three stages; the output below shows one progress line
# per stage.
model.create_vocabulary()
model.create_corpus_matrix()
# NOTE(review): the meaning of the argument 5 is not visible in this notebook;
# confirm against models.Word2Vec.compute_embeddings and name the constant.
model.compute_embeddings(5)

Creating vocabulary
Creating corpus matrix
Computing of words embeddings


In [9]:
model.W.shape

(3723, 200)

In [10]:
##### Compute a single review embedding #####
@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words("english"))


def get_review_embedding(model, review, stops=None):
    """
    Embed one review as the mean embedding of its usable words.

    A word is used when it is in `model.vocab` and is not a stop word.

    model -- word2vec model instance, which is used; it must provide `d`,
             `vocab` and `get_word_embedding(word)`
    review -- current review to be embedded (an iterable of words)
    stops -- optional collection of words to ignore; when omitted, the NLTK
             English stop-word list is used (cached across calls)

    Returns an array of length `model.d`. When no word of the review is
    usable (empty review, or only stop / out-of-vocabulary words) the array
    is filled with NaN, so callers can detect and drop such reviews.
    """
    if stops is None:
        stops = _english_stopwords()

    review_vec = np.zeros(model.d)
    words_count = 0
    for word in review:
        if word in model.vocab and word not in stops:
            review_vec += model.get_word_embedding(word)
            words_count += 1

    if words_count == 0:
        # Avoid computing 0/0: return the same all-NaN result the division
        # would give, but without NumPy's invalid-value RuntimeWarning.
        return np.full(model.d, np.nan)
    return review_vec / words_count

In [11]:
##### Compute the features matrix for all reviews #####
def get_features_matrix(model, reviews):
    """
    Stack the embeddings of all reviews into one matrix, one row per review.

    model -- word2vec model instance, which is used
    reviews -- the whole collection of reviews

    Returns an array of shape (len(reviews), model.d).
    """
    n_reviews = len(reviews)
    features = np.zeros((n_reviews, model.d))
    for row, tokens in enumerate(reviews):
        # Row `row` holds the embedding of the review at the same position.
        features[row] = get_review_embedding(model, tokens)
    return features

In [12]:
X = get_features_matrix(model, sentences)
X.shape

  # This is added back by InteractiveShellApp.init_path()


(194439, 200)

### Comparison models of embeddings

In [43]:
# import classifiers and necessary functions
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score

In [15]:
# create dataframe from the json file
df = pd.read_json(path_data_prepaired)

In [23]:
# get labels: 1 when the 'overall' rating is above 3, else 0
# (vectorised cast instead of a per-element Python call)
y = (df['overall'] > 3).astype('int64')

In [24]:
# get indices of rows which contain NaNs; each row index appears exactly once
# (argwhere on the element mask would repeat a row once per NaN column)
del_idx = np.flatnonzero(np.isnan(X).any(axis=1))

In [27]:
# delete rows with NaNs
# One boolean row mask is applied to both X and y, so they stay aligned and the
# cell can be re-run safely (a second run finds no NaN rows and changes
# nothing, instead of failing on labels that were already dropped).
nan_rows = np.isnan(X).any(axis=1)
X = X[~nan_rows]
y = y[~nan_rows]

In [30]:
y.shape, X.shape

((194286,), (194286, 200))

In [33]:
# Hold out 33% of the rows for the final evaluation; the fixed random_state
# makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

Try RF classifier out of the box

In [34]:
rf_clf = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(rf_clf, X_train, y_train, n_jobs=-1, cv=5)

In [35]:
print(f'Accuracy score: {cv_scores.mean()} +/- {cv_scores.std()}')

Accuracy score: 0.7506433531930204 +/- 0.0014032604068035476


Evaluate model on the holdout set

In [38]:
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(f'Accuracy on holdout set: {accuracy_score(y_test, y_pred)}')



Accuracy on holdout set: 0.7517741558137722


In [44]:
# Search space for the random forest. `min_samples_split` starts at 2 because
# scikit-learn rejects an integer value of 1 (a split needs at least 2 samples).
params = {'n_estimators': list(range(5, 51, 5)), 'max_depth': list(range(5, 100, 5)),
          'min_samples_split': list(range(2, 11, 1))}

# Seed the estimator as well as the search, so both the sampled candidates and
# the fitted forests are reproducible (the baseline forest above is seeded too).
clf = RandomizedSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=-1,
                         cv=5, verbose=1, random_state=42)

In [45]:
clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

XGBoost is very slow on macOS, so the cross-validation below was interrupted before it finished.

In [41]:
xgbm_cls = XGBClassifier(random_state=42)
cv_scores = cross_val_score(xgbm_cls, X_train, y_train, n_jobs=-1, cv=5)

KeyboardInterrupt: 

In [None]:
# NOTE(review): the XGBoost cross-validation above was interrupted, so on that
# run `cv_scores` still holds the random-forest scores from the earlier cell.
print(f'Accuracy score: {cv_scores.mean()} +/- {cv_scores.std()}')

In [46]:
# Reuse y's index so each feature row carries the same label as its target.
# y keeps its original labels after the NaN rows were dropped, whereas a
# default RangeIndex would make the index columns of the two saved CSVs differ.
X_df = pd.DataFrame(X, index=y.index)

In [49]:
X_df.to_csv('../dataset/X_1.csv')
y.to_csv('../dataset/y_1.csv')