# 1.Import necessary modules and set constant variables

In [None]:
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import nltk
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from joblib import dump
import seaborn as sns
# The WordNet corpus is required by the WordNetLemmatizer used in the preprocessing step.
nltk.download('wordnet')

# The dataset file is looked up one directory above the working directory.
project_path = os.pardir
json_file = os.path.join(project_path, "Apps_for_Android_5.json")
# When True, trained models are written to disk after fitting.
SAVE_MODELS = False

# 2.Exploratory analysis
## Load the json and explore the form of it.

In [None]:
# The file holds one JSON object per line, hence lines=True.
df = pd.read_json(json_file, lines=True)

n_observations, n_columns = df.shape
print("There are " + str(n_observations) + " observations with " + str(n_columns) + " columns each.")
print(df.head(10))

# Missing values per column; the grand total is the sum over all columns.
nan_per_column = df.isna().sum()
print("Total NaN values: " + str(nan_per_column.sum()))
print(nan_per_column)

## Get only relevant columns: "reviewText", "overall"

In [None]:
# Keep only the review text and its star rating.
data = df[["reviewText", "overall"]]
# The NaN check above found no missing values in these columns, so no rows are dropped
# for that reason. Only 0-length reviews are excluded from the rest of the process.
has_text = data["reviewText"].map(len) > 0
data = data[has_text].reset_index(drop=True)
print(data.head())

## Check class distribution

In [None]:
# Count the reviews per score once; the result drives both the bar heights and the labels.
score_counts = data.overall.value_counts().sort_index()
# Share of every score in percent, rounded to two decimals.
score_shares = round(data.overall.value_counts(normalize = True).sort_index() * 10000)/100

graph = plt.bar(score_counts.index, score_counts/1000)
plt.title("Distribution of review scores")
plt.ylabel("Reviews (thousands)")

# Bars are created in the same sorted order as score_shares, so they are paired by
# position instead of assuming that the score labels are exactly 1..5.
# The bar origin is unpacked into bar_x/bar_y so the notebook-level names x and y
# (y later holds the label array) are never overwritten by this cell.
for bar, share in zip(graph, score_shares):
    bar_x, bar_y = bar.get_xy()
    plt.text(bar_x + bar.get_width()/2,
             bar_y + bar.get_height()*1.01,
             str(share)+'%',
             ha='center',
             weight='bold')
plt.show()

In [None]:
#So we see that the classes are quite imbalanced, with "overall" = 5 dominating, with over half of the dataset's observations.
#What about reviewText lengths? (The lengths are meant for the Word2vec approach, which was not followed after all.)
# Character count of every review, in row order.
lengths = [len(text) for text in data.reviewText]
# DataFrame.insert raises if the column already exists, so overwrite it when the cell is rerun.
if "lengths" in data.columns:
    data["lengths"] = lengths
else:
    data.insert(1, column = "lengths", value = lengths)

length_stats = data.loc[:, "lengths"]
print("About review lengths\n Mean: " + str(length_stats.mean())
      + "\n Min: " + str(length_stats.min())
      + "\n Max: " + str(length_stats.max())
      + "\n Median: " + str(length_stats.median()))

# 3.Preprocessing

In [None]:
wnl = nltk.stem.WordNetLemmatizer()
# reviews: one flat list of lemmas per review (same order as the rows of data).
reviews = []
# cleaned_sentences: one list of lemmas per sentence, across all reviews.
cleaned_sentences = []

# Both patterns are applied BEFORE the text is lowercased, so they must ignore case;
# otherwise addresses containing capital letters would be left in the text.
email_pattern = re.compile(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', re.IGNORECASE)
url_pattern = re.compile(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', re.IGNORECASE)

#Removing e-mails
data.reviewText = data.reviewText.apply(lambda x: email_pattern.sub('', x))
#Removing URLs
data.reviewText = data.reviewText.apply(lambda x: url_pattern.sub('', x))
#Removing tags
data.reviewText = data.reviewText.apply(lambda x: gensim.parsing.preprocessing.strip_tags(x))
#Lowercasing and removing stop-words
data.reviewText = data.reviewText.apply(lambda x: gensim.parsing.preprocessing.remove_stopwords(x.lower()))

for rev in data.reviewText:
    temp_review = []
    # Sentences are approximated by splitting on full stops.
    for sent in rev.split('.'):
        #Skip empty fragments (e.g. produced by consecutive dots)
        if len(sent) == 0:
            continue
        #Removing non alphanumeric chars
        temp = gensim.parsing.preprocessing.strip_non_alphanum(sent)
        #Removing numeric chars (keeping only letters)
        temp = gensim.parsing.preprocessing.strip_numeric(temp)
        #Removing words with 1 or 2 letters
        temp = gensim.parsing.preprocessing.strip_short(temp, minsize = 3)
        #Trimming possible multiple whitespaces
        temp = gensim.parsing.preprocessing.strip_multiple_whitespaces(temp)
        #Lemmatizing the sentence; every word is lemmatized only once and the result
        #is shared between the per-sentence and the per-review collections
        lemma_sent = [wnl.lemmatize(word) for word in temp.split()]
        temp_review.extend(lemma_sent)
        cleaned_sentences.append(lemma_sent)
    #Store all lemmas of the review as one flat list of tokens
    reviews.append(temp_review)

# 4.Training Embeddings
## Word2vec model 

In [None]:
#If there is time, I will try this method as well.
#Uncomment to use.
# NOTE(review): while disabled, the code below is only a bare string literal, so no
# Word2Vec model is trained. `temporary_file` is not defined in the visible part of this
# notebook - define it (or use a plain path) before enabling the cell.
"""model_w2v = gensim.models.Word2Vec(sentences = cleaned_sentences, vector_size = 200, workers = 8, min_count = 5)
vectors = np.asarray(model_w2v.wv.vectors)
labels = np.asarray(model_w2v.wv.index_to_key)
model_name = temporary_file("model_word2vec")
if SAVE_MODELS:
    model_w2v.save(model_name)"""

## Doc2vec model

In [None]:
#Uncomment to train embeddings. If the trained model is available only run the last line to load it.
# The disabled code tags every review in `reviews` with its position, trains a
# 200-dimensional Doc2Vec model for 30 epochs and saves it only when SAVE_MODELS is True.
"""def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

data_for_training = list(tagged_document(reviews))
model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=5, epochs=30, workers = 8)
model_d2v.build_vocab(data_for_training)
model_d2v.train(data_for_training, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)
if SAVE_MODELS:
   model_d2v.save("model_doc2vec_30epochs")"""
# Load a previously saved model; the relative file name is the same one the disabled
# save call above uses, so the file must exist in the current working directory.
model_d2v = gensim.models.doc2vec.Doc2Vec.load("model_doc2vec_30epochs")

# 5.Training Classifiers
## Preparing data

In [None]:
# Feature matrix: the learned document vectors, read by integer tag 0..len-1.
# NOTE(review): this presumes tag i belongs to row i of `data` - confirm the loaded model
# was trained on the same, identically filtered reviews.
doc_vectors = model_d2v.dv
X = np.array([doc_vectors[tag] for tag in range(len(doc_vectors))])
# Target: the review scores stored as 8-bit integers.
y = np.int8(np.array(data.overall))
#Splitting Training and Testing (stratified; random_state set for reproducibility)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=.15,
    shuffle=True,
    random_state=42670,
    stratify=y,
)

## Dealing with the imbalance of the classes

In [None]:
# Trying to deal with imbalance by cutting "overall" == 5 observations in half.
# NOTE(review): all target sizes below are derived from the full dataset (`data`), while
# the resampling is applied to the training split only, so the effective ratios relative
# to the training set differ from the nominal factors - confirm this is intended.
population_5 = round(data.loc[data.overall == 5,:].shape[0]/2)
# Seeded with the same value as the train/test split so the resampled training set
# (and every result computed from it) is reproducible between runs.
undersample = RandomUnderSampler(sampling_strategy={5: population_5}, random_state=42670)
train_texts_under, train_labels_under = undersample.fit_resample(train_texts, train_labels)
# Oversampling the classes for which I have less data
population_1 = round(data.loc[data.overall == 1,:].shape[0] * 1.5)
population_2 = round(data.loc[data.overall == 2,:].shape[0] * 2.2)
population_3 = round(data.loc[data.overall == 3,:].shape[0] * 1.5)
oversample = RandomOverSampler(sampling_strategy={1: population_1, 2: population_2, 3: population_3}, random_state=42670)
train_texts_balanced, train_labels_balanced = oversample.fit_resample(train_texts_under, train_labels_under)

### Visualizing balanced classes

In [None]:
# Class counts of the resampled training labels. A dedicated Series is used so the raw
# dataset held in `df` is not replaced, which keeps the earlier cells rerunnable.
balanced_labels = pd.Series(train_labels_balanced, name="overall")
balanced_counts = balanced_labels.value_counts().sort_index()
# Share of every class in percent, rounded to two decimals.
balanced_shares = round(balanced_labels.value_counts(normalize = True).sort_index() * 10000)/100

graph = plt.bar(balanced_counts.index, balanced_counts/1000)
plt.title("Balanced distributions")
plt.ylabel("Reviews (thousands)")

# Bars are created in the same sorted order as balanced_shares, so they are paired by
# position instead of assuming that the class labels are exactly 1..5.
# The bar origin is unpacked into bar_x/bar_y so the label array `y` built for the
# train/test split is not overwritten by this cell.
for bar, share in zip(graph, balanced_shares):
    bar_x, bar_y = bar.get_xy()
    plt.text(bar_x + bar.get_width()/2,
             bar_y + bar.get_height()*1.01,
             str(share)+'%',
             ha='center',
             weight='bold')
plt.show()

## Defining and training the classifiers
### Using 5-fold cross validation training

In [None]:
# Number of cross-validation folds shared by all classifiers below.
n_folds = 5
# Stratified splitting is used because it helps with the imbalanced classes.
kf = StratifiedKFold(n_splits=n_folds)

### Linear SVM Classifier

In [None]:
# Cross-validated search over the regularization strength C of a linear SVM.
# accuracies_svm / reports_svm collect one entry per (C, fold), in iteration order.
accuracies_svm = []
reports_svm = []
for c_arg in [0.01, 0.1, 1, 10]:
    print("C=" +str(c_arg))
    # Accuracies of the current C only, so the average cannot pick up folds of another
    # setting and does not depend on n_folds matching the splitter.
    fold_accuracies = []
    fold_splits = kf.split(train_texts_balanced, train_labels_balanced)
    for fold_count, (train_index, test_index) in enumerate(fold_splits):
        print("kfold:"+str(fold_count))
        X_train, X_valid = train_texts_balanced[train_index], train_texts_balanced[test_index]
        y_train, y_valid = train_labels_balanced[train_index], train_labels_balanced[test_index]

        # A fresh model per fold so that no state leaks between folds.
        svm_model = LinearSVC(C=c_arg, class_weight="balanced", random_state=42670)
        svm_model.fit(X_train, y_train)
        val_labels_pred = svm_model.predict(X_valid)

        # scikit-learn metrics take (y_true, y_pred) in this order.
        fold_accuracies.append(accuracy_score(y_valid, val_labels_pred))
        reports_svm.append(classification_report(y_valid, val_labels_pred))
    accuracies_svm.extend(fold_accuracies)
    avg_acc_score = sum(fold_accuracies)/len(fold_accuracies)
    print("accuracy of each fold - {}".format(fold_accuracies))
    print("Avg accuracy : {}".format(avg_acc_score))


In [None]:
# Refit the best configuration (C=0.01) on the whole balanced training set.
svm_model = LinearSVC(
    C=0.01,
    class_weight="balanced",
    random_state=42670,
).fit(train_texts_balanced, train_labels_balanced)
# Persist the fitted model only when requested.
if SAVE_MODELS:
    dump(svm_model, 'svm_model.joblib')

### Logistic Regression Classifier

In [None]:
# Cross-validated search over the inverse regularization strength C of logistic regression.
# accuracies_lr / reports_lr collect one entry per (C, fold), in iteration order.
accuracies_lr = []
reports_lr = []
for c_arg in [0.01, 0.1, 1, 10]:
    print("C=" +str(c_arg))
    # Accuracies of the current C only, so the average cannot pick up folds of another
    # setting and does not depend on n_folds matching the splitter.
    fold_accuracies = []
    fold_splits = kf.split(train_texts_balanced, train_labels_balanced)
    for fold_count, (train_index, test_index) in enumerate(fold_splits):
        print("kfold:"+str(fold_count))
        X_train, X_valid = train_texts_balanced[train_index], train_texts_balanced[test_index]
        y_train, y_valid = train_labels_balanced[train_index], train_labels_balanced[test_index]

        # A fresh model per fold so that no state leaks between folds.
        lr_model = LogisticRegression(C=c_arg, class_weight="balanced", random_state=42670)
        lr_model.fit(X_train, y_train)
        val_labels_pred = lr_model.predict(X_valid)

        # scikit-learn metrics take (y_true, y_pred) in this order.
        fold_accuracies.append(accuracy_score(y_valid, val_labels_pred))
        reports_lr.append(classification_report(y_valid, val_labels_pred))
    accuracies_lr.extend(fold_accuracies)
    avg_acc_score = sum(fold_accuracies)/len(fold_accuracies)
    print("accuracy of each fold - {}".format(fold_accuracies))
    print("Avg accuracy : {}".format(avg_acc_score))

In [None]:
# Refit the best configuration (C=0.01) on the whole balanced training set.
lr_model = LogisticRegression(
    C=0.01,
    class_weight="balanced",
    random_state=42670,
).fit(train_texts_balanced, train_labels_balanced)
# Persist the fitted model only when requested.
if SAVE_MODELS:
    dump(lr_model, 'linReg_model.joblib')

### Random Forest Classifier

In [None]:
# Cross-validated search over min_samples_leaf of a 1000-tree random forest.
# accuracies_rf / reports_rf collect one entry per (min_samples_leaf, fold), in iteration order.
accuracies_rf = []
reports_rf = []
for msl in [1, 2, 3, 4, 5]:
    print("Min leaf="+str(msl))
    # Accuracies of the current setting only, so the average cannot pick up folds of
    # another setting and does not depend on n_folds matching the splitter.
    fold_accuracies = []
    fold_splits = kf.split(train_texts_balanced, train_labels_balanced)
    for fold_count, (train_index, test_index) in enumerate(fold_splits):
        print("kfold:"+str(fold_count))
        X_train, X_valid = train_texts_balanced[train_index], train_texts_balanced[test_index]
        y_train, y_valid = train_labels_balanced[train_index], train_labels_balanced[test_index]

        # A fresh model per fold so that no state leaks between folds.
        rf_model = RFC(min_samples_leaf = msl, n_estimators = 1000, n_jobs = 7, random_state = 42670)
        rf_model.fit(X_train, y_train)
        val_labels_pred = rf_model.predict(X_valid)

        # scikit-learn metrics take (y_true, y_pred) in this order.
        fold_accuracies.append(accuracy_score(y_valid, val_labels_pred))
        reports_rf.append(classification_report(y_valid, val_labels_pred))
    accuracies_rf.extend(fold_accuracies)
    avg_acc_score = sum(fold_accuracies)/len(fold_accuracies)
    print("accuracy of each fold - {}".format(fold_accuracies))
    print('Avg accuracy : {}'.format(avg_acc_score))

In [None]:
# Refit the best configuration (min_samples_leaf = 5) on the whole balanced training set.
rf_model = RFC(
    min_samples_leaf=5,
    n_estimators=1000,
    n_jobs=8,
    random_state=42670,
).fit(train_texts_balanced, train_labels_balanced)
# Persist the fitted model only when requested.
if SAVE_MODELS:
    dump(rf_model, 'RF_model.joblib')

# 6.Test best models
### From the previous training process and the reported metrics, the best performing models selected were:
Linear SVM: c=0.01<br>
Logistic Regression: c=0.01<br>
Random Forest: min_samples_leaf = 5<br><br>
Below we are performing predictions on test data, evaluating and visualizing results.

### SVM

In [None]:
# Score the final SVM on the held-out test split and print the per-class metrics.
svm_preds = svm_model.predict(test_texts)
report_svm = classification_report(test_labels, svm_preds)
print(report_svm)

# Confusion matrix as an annotated heatmap (rows: actual score, columns: predicted score).
conf_mat = confusion_matrix(test_labels, svm_preds)
score_ticks = [1,2,3,4,5]
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=score_ticks, yticklabels=score_ticks, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

### Logistic Regression

In [None]:
# Score the final logistic regression model on the held-out test split and print the per-class metrics.
lr_preds = lr_model.predict(test_texts)
report_lr = classification_report(test_labels, lr_preds)
print(report_lr)

# Confusion matrix as an annotated heatmap (rows: actual score, columns: predicted score).
conf_mat = confusion_matrix(test_labels, lr_preds)
score_ticks = [1,2,3,4,5]
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=score_ticks, yticklabels=score_ticks, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

### Random Forest

In [None]:
# Score the final random forest on the held-out test split and print the per-class metrics.
rf_preds = rf_model.predict(test_texts)
report_rf = classification_report(test_labels, rf_preds)
print(report_rf)

# Confusion matrix as an annotated heatmap (rows: actual score, columns: predicted score).
conf_mat = confusion_matrix(test_labels, rf_preds)
score_ticks = [1,2,3,4,5]
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=score_ticks, yticklabels=score_ticks, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()