In [82]:
%matplotlib inline

import os
import re
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

import sklearn
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /Users/laks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
os.getcwd() # can help to obtain the good paths 

'/Users/laks/Desktop/Polytechnique/Cours/Datacamp Capgemini (MAP540)/Day7'

In [3]:
# your path to word2vec google vectors
path_to_google_vectors = '/Users/laks/Desktop/Polytechnique/Cours/Datacamp Capgemini (MAP540)/Day5/'

In [101]:
# If your data are not in the current directory, don't forget : PATH + 'X_HEC_label.csv'
labeled_data = pd.read_csv('X_HEC_label.csv', sep = ",", index_col = False)
labeled_data.head(3)

Unnamed: 0,group_number,review,service,service_sentiment,activities,activities_sentiment,cost,cost_sentiment,family,family_sentiment,food,food_sentiment,infrastructure,infra_sentiment
0,groupe_1,We are veterans to the Center Parcs holiday an...,1.0,-1.0,1.0,1.0,1.0,-1.0,0.0,,1.0,-1.0,1.0,-1.0
1,groupe_1,"Lodge was fine, nice setting, centre is well l...",1.0,-1.0,1.0,1.0,1.0,-1.0,0.0,,0.0,,1.0,1.0
2,groupe_1,"After the madness of Christmas, it was a well ...",1.0,1.0,0.0,,0.0,,1.0,1.0,0.0,,0.0,


In [102]:
labeled_data.shape

(2551, 14)

In [5]:
unlabeled_data = pd.read_json('english_reviews.json') #, sep = ",", index_col = False)
unlabeled_data.head(3)

Unnamed: 0,detected_language,hotel_name,inflected,published_date,rating,review,review_id,review_language,reviewer_id,title,tokens,trip_date
0,en,Center Parcs Sherwood Forest,"[clean, down, hill, look, corner]",31 octobre 2011,4,cleaness downing down hill don t look in corn...,119968750,fr,5EB20A7D530F745C9237B7E5D60B61AB,centre parcs sherwood,"[cleaness, downing, hill, look, corners]",octobre 2011
1,en,Center Parcs Sherwood Forest,"[centre, parcs, sit, numerous, occasion, quite...","February 14, 2019",3,we ve been to centre parcs sites on numerous o...,652429761,en,E8B0404BB28844D394A367DD969BA417,"Nice location, shame about the selfish guest c...","[centre, parcs, sites, numerous, occasions, qu...",February 2019
2,en,Center Parcs Sherwood Forest,"[little, apprehensive, read, recent, review, n...","February 16, 2019",5,i was a little apprehensive reading some of th...,652801542,en,1C083AD19649761DEEF510997AFD804D,Amazing stay!,"[little, apprehensive, reading, recent, review...",February 2019


In [6]:
unlabeled_data['tokens'][0]

['cleaness', 'downing', 'hill', 'look', 'corners']

Just cleaning out the reviews that were badly annotated:

In [7]:
labeled_data = labeled_data[labeled_data.infrastructure != -1]
labeled_data = labeled_data[labeled_data.cost != -1]
labeled_data = labeled_data[labeled_data.family != 9]
labeled_data = labeled_data.reset_index(drop = True)

In [8]:
english_stopwords = ["a", "about", "above", "above", "across", "after", "afterwards",
                     "again", "against", "all", "almost", "alone", "along", "already",
                     "also","although","always","am","among", "amongst", "amoungst",
                     "amount",  "an", "and", "another", "any","anyhow","anyone",
                     "anything","anyway", "anywhere", "are", "around", "as",  "at",
                     "back","be","became", "because","become","becomes", "becoming",
                     "been", "before", "beforehand", "behind", "being", "below",
                     "beside", "besides", "between", "beyond", "bill", "both",
                     "bottom","but", "by", "call", "can", "cannot", "cant", "co",
                     "con", "could", "couldnt", "cry", "de", "describe", "detail",
                     "do", "done", "down", "due", "during", "each", "eg", "eight", 
                     "either", "eleven","else", "elsewhere", "empty", "enough", "etc",
                     "even", "ever", "every", "everyone", "everything", "everywhere",
                     "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", 
                     "five", "for", "former", "formerly", "forty", "found", "four", "from",
                     "front", "full", "further", "get", "give", "good", "great", "woburn", "go",
                     "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", 
                     "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
                     "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest",
                     "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly",
                     "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might",
                     "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must",
                     "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next",
                     "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", 
                     "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", 
                     "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps",
                     "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", 
                     "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty",
                     "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
                     "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them",
                     "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", 
                     "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three",
                     "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards",
                     "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", 
                     "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter",
                     "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
                     "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would",
                     "yet", "you", "your", "yours", "yourself", "yourselves", "the"]

In [9]:
def character_replacement(input_string):
    character_mapping = {"\\u00e9": "é",
                        "\\u2019": "'",
                        "\\": "",
                        "\\u00fb": "û",
                        "u00e8": "è",
                        "u00e0": "à",
                        "u00f4": "ô",
                        "u00ea": "ê",
                        "u00ee": "i",
                        "u00fb": "û",
                        "u2018": "'",
                        "u00e2": "a",
                        "u00ab": "'",
                        "u00bb": "'",
                        "u00e7": "ç",
                        "u00e2": "â",
                        "u00f9": "ù",
                        "u00a3": "£",
                        }


    for character in character_mapping:
        input_string = input_string.replace(character, character_mapping[character])

    input_string = input_string.lower()

    characters_to_remove = ["@", "/", "#", ".", ",", "!", "?", "(", ")", "-", "_", "’", "'", "\"", ":", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]
    transformation_dict = {initial: " " for initial in characters_to_remove}
    no_punctuation_reviews = input_string.translate(str.maketrans(transformation_dict))

    return no_punctuation_reviews

In [10]:
def tokenize(input_string):
    return word_tokenize(input_string)

def remove_stop_words(input_tokens, english_stopwords = english_stopwords):
    return [token for token in input_tokens if token not in english_stopwords]

lemmatizer = WordNetLemmatizer()
def lemmatize(tokens, lemmatizer = lemmatizer):
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return tokens

In [11]:
labeled_data['review'] = labeled_data['review'].apply(lambda x: character_replacement(x))
labeled_data['tokens'] = labeled_data['review'].apply(lambda x: tokenize(x))
labeled_data['tokens'] = labeled_data['tokens'].apply(lambda token_list: [meaningful_word for meaningful_word in token_list if len(meaningful_word) > 3])
labeled_data['tokens'] = labeled_data['tokens'].apply(lambda x: remove_stop_words(x))

# COST LABEL

Train test split on manually labeled data:

In [12]:
training_set = {'tokens' : list(labeled_data['tokens'])[:2000],
                'labels' : list(labeled_data['cost'])[:2000]}
training_set = pd.DataFrame(training_set)


test_set = {'tokens' : list(labeled_data['tokens'])[2000:],
            'labels' : list(labeled_data['cost'])[2000:]}
test_set = pd.DataFrame(test_set)

Re-arranging training set to take into account unlabeled tokens and their missing label

In [13]:
semi_supervised_data = {'tokens' : list(training_set['tokens']) + list(unlabeled_data['tokens']),
                        'labels' : list(training_set['labels']) + [-1]*len(unlabeled_data)}

# We use -1 to encode unlabeled samples

semi_supervised_data = pd.DataFrame(semi_supervised_data)
semi_supervised_data.head()

Unnamed: 0,tokens,labels
0,"[veterans, center, parcs, holiday, visited, on...",1.0
1,"[lodge, fine, nice, setting, centre, laid, eas...",1.0
2,"[madness, christmas, earned, enjoyable, break,...",0.0
3,"[visited, centre, parcs, times, past, years, t...",0.0
4,"[stayed, family, group, january, january, days...",1.0


In [19]:
labeled_data.shape
semi_supervised_data.shape

(2543, 15)

In [22]:
labeled_data.head(1)

Unnamed: 0,group_number,review,service,service_sentiment,activities,activities_sentiment,cost,cost_sentiment,family,family_sentiment,food,food_sentiment,infrastructure,infra_sentiment,tokens
0,groupe_1,we are veterans to the center parcs holiday an...,1.0,-1.0,1.0,1.0,1.0,-1.0,0.0,,1.0,-1.0,1.0,-1.0,"[veterans, center, parcs, holiday, visited, on..."


In [23]:
w2v = KeyedVectors.load_word2vec_format(path_to_google_vectors + 'GoogleNews-vectors-negative300.bin', binary = True)

In [24]:
def my_vector_getter(word, wv = w2v) :
    # returns the vector of a word
    try:
        word_array = wv[word].reshape(1,-1)
        return word_array
    except :
        # if word not in google word2vec vocabulary, return vector with low norm
        return np.zeros((1,300))

In [25]:
def document_embedding(text, wv = w2v) :
    # returns naïve document embedding
    embeddings = np.concatenate([my_vector_getter(token) for token in text])
    centroid = np.mean(embeddings, axis=0).reshape(1,-1)
    return centroid

In [26]:
document_embedding(semi_supervised_data['tokens'][0]).shape

(1, 300)

Train embedding:

In [78]:
X = np.zeros((len(semi_supervised_data), 300))

for i in range(len(semi_supervised_data)) :
    X[i] = document_embedding(semi_supervised_data['tokens'][i])
    #X_values = X.values
X_train = X[:2000]
Y_train = training_set['labels'].values

Test embedding:

In [76]:
X_test = np.zeros((len(test_set), 300))

for i in range(len(test_set)) :
    X_test[i] = document_embedding(test_set['tokens'][i])
    
#X_test_pca = pca.transform(X_test)
Y_test = test_set['labels'].values

Fitting the model

In [75]:
label_spreading_model = LabelSpreading()
model_s = label_spreading_model.fit(X_train, Y_train)

In [80]:
pred = model_s.predict(X_test)

In [83]:
print("\n")
print("Using count vectorization")
print("\n")
acc_count = accuracy_score(Y_test,pred)
prec_count = precision_score(Y_test, pred)
sens_count = recall_score(Y_test,pred)

print("Accuracy :", acc_count)
print("Precision :", prec_count)
print("Sensitivity :", sens_count)



Using count vectorization


Accuracy : 0.5561694290976059
Precision : 0.5421455938697318
Sensitivity : 0.9929824561403509


## Propagating:

In [100]:
Y_train.shape

(2000,)

In [103]:
X_train = np.concatenate((X_train,X_test), axis=0)
Y_train = np.concatenate((Y_train,pred), axis=0)

In [106]:
X_train

array([[-0.01672081,  0.07565586,  0.00013237, ..., -0.04164483,
         0.03256852, -0.0333458 ],
       [ 0.0295059 ,  0.04058451, -0.01116494, ..., -0.02598216,
         0.02268141, -0.01136686],
       [ 0.03492011,  0.01960308, -0.07121519, ..., -0.08272236,
        -0.00484411, -0.01405074],
       ...,
       [-0.01097072,  0.02156581, -0.01297506, ..., -0.02637692,
         0.06335892,  0.01691702],
       [ 0.01863052,  0.10220649,  0.00900176, ..., -0.05984081,
         0.10680413, -0.00205485],
       [-0.00565216,  0.06269084,  0.00430571, ..., -0.03780914,
         0.036409  ,  0.00056732]])