# Making all necessary imports

In [1]:
import json
import re
from string import punctuation
from stopwords import french_stopwords, english_stopwords
import pandas as pd
from textblob import TextBlob
import gensim
from gensim import corpora

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /Users/laks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/laks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading data

In [2]:
# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# accented characters in the reviews intact on platforms whose default locale
# encoding is something else.
with open("full2.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# Functions for text processing

In [3]:
# Ordered (pattern, replacement) pairs that repair escape sequences left as
# literal text in the reviews. Order matters: the two fully escaped forms must
# run before the bare backslash is stripped, because their backslash-less
# forms ("u00e9", "u2019") have no entry of their own. Every other escape is
# matched after the backslashes are gone.
_ESCAPE_REPLACEMENTS = (
    ("\\u00e9", "é"),
    ("\\u2019", "'"),
    ("\\", ""),
    ("u00e8", "è"),
    ("u00e0", "à"),
    ("u00f4", "ô"),
    ("u00ea", "ê"),
    ("u00ee", "î"),  # U+00EE is "î"; it was previously flattened to a plain "i"
    ("u00fb", "û"),
    ("u2018", "'"),
    ("u00e2", "â"),
    ("u00ab", "'"),
    ("u00bb", "'"),
    ("u00e7", "ç"),
    ("u00f9", "ù"),
    ("u00a3", "£"),
)

# Punctuation and digits that are turned into a single space each, so that
# words on either side of them stay separated.
_CHARACTERS_TO_BLANK = "@/#.,!?()-_’'\":1234567890"
_BLANKING_TABLE = str.maketrans(_CHARACTERS_TO_BLANK, " " * len(_CHARACTERS_TO_BLANK))


def character_replacement(input_string):
    """Normalize a raw review string.

    Steps, in order:
      1. Replace literal unicode escape sequences (e.g. ``\\u00e9``) with the
         characters they encode and drop any remaining backslashes.
      2. Lower-case the text.
      3. Replace a fixed set of punctuation marks and all digits with spaces.

    Args:
        input_string: The review text to clean (a ``str``).

    Returns:
        The cleaned, lower-cased string. Its length may differ from the input
        because escape sequences collapse to single characters.
    """
    for pattern, replacement in _ESCAPE_REPLACEMENTS:
        input_string = input_string.replace(pattern, replacement)

    return input_string.lower().translate(_BLANKING_TABLE)

def tokenize(input_string):
    """Split ``input_string`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(input_string)
    return tokens

def remove_stop_words_french(input_tokens):
    """Return the tokens of ``input_tokens`` that are not in ``french_stopwords``.

    The relative order of the remaining tokens is preserved.
    """
    kept_tokens = []
    for word in input_tokens:
        if word in french_stopwords:
            continue
        kept_tokens.append(word)
    return kept_tokens

def remove_stop_words_english(input_tokens):
    """Return the tokens of ``input_tokens`` that are not in ``english_stopwords``.

    The relative order of the remaining tokens is preserved.
    """
    kept_tokens = []
    for word in input_tokens:
        if word in english_stopwords:
            continue
        kept_tokens.append(word)
    return kept_tokens

# Handling inflected words (lemmatization and stemming)

In [4]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    """Lemmatize every token as an adjective, then a verb, then a noun.

    Each pass feeds its result into the next one, so a token is reduced by
    whichever part-of-speech rules apply to it. Returns a new list with one
    lemma per input token, in the same order.
    """
    lemmas = []
    for word in tokens:
        # The order of the passes ('a' -> 'v' -> 'n') is significant.
        for part_of_speech in ("a", "v", "n"):
            word = lemmatizer.lemmatize(word, pos=part_of_speech)
        lemmas.append(word)
    return lemmas

# Stemming
frenchStemmer = SnowballStemmer("french")
def stem(tokens):
    """Stem every token with the French Snowball stemmer.

    Returns a new list with one stem per input token, in the same order.
    """
    return list(map(frenchStemmer.stem, tokens))

# Processing data

In [5]:
# Creating a DataFrame with all reviews
reviews = pd.DataFrame.from_dict(data)

# Basic cleaning: normalize the raw text, tokenize it, then drop short tokens.
MIN_TOKEN_LENGTH = 4  # tokens of 3 characters or fewer are discarded
reviews["review"] = reviews["review"].apply(character_replacement)
reviews["tokens"] = reviews["review"].apply(tokenize)
reviews["tokens"] = reviews["tokens"].apply(
    lambda token_list: [word for word in token_list if len(word) >= MIN_TOKEN_LENGTH]
)

# Splitting reviews by language.
# `.copy()` makes each subset an independent DataFrame, so the column
# assignments below do not write to a slice of `reviews` (the pattern pandas
# flags with SettingWithCopyWarning).
french_reviews = reviews[reviews["review_language"] == "fr"].copy()
english_reviews = reviews[reviews["review_language"] == "en"].copy()

# Removing stopwords
french_reviews["tokens"] = french_reviews["tokens"].apply(remove_stop_words_french)
english_reviews["tokens"] = english_reviews["tokens"].apply(remove_stop_words_english)

# Reducing inflected forms: English tokens are lemmatized, French tokens are stemmed
english_reviews["inflected"] = english_reviews["tokens"].apply(lemmatize)
french_reviews["inflected"] = french_reviews["tokens"].apply(stem)

# Preparing LDA inputs (built from the English reviews only)
dictionary = corpora.Dictionary(english_reviews["inflected"])
doc_term_matrix = [dictionary.doc2bow(rev) for rev in english_reviews["inflected"]]


# LDA topic modelling (English reviews)

In [6]:
# Alias for the gensim LDA model class
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
num_topics = 5
topic_prior = [0.0001] * num_topics      # one alpha value per topic
word_prior = [0.0001] * len(dictionary)  # one eta value per dictionary entry
lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=num_topics,
    alpha=topic_prior,
    eta=word_prior,
    chunksize=2000,
    passes=6,
    random_state=100,  # fixed seed so the fitted topics are reproducible
)

topics = lda_model.print_topics(num_words=8)
print(topics)

[(0, '0.022*"time" + 0.017*"stay" + 0.015*"staff" + 0.015*"lodge" + 0.014*"year" + 0.013*"parcs" + 0.012*"visit" + 0.012*"family"'), (1, '0.018*"lodge" + 0.015*"pool" + 0.013*"bike" + 0.012*"kid" + 0.012*"area" + 0.011*"time" + 0.011*"activity" + 0.010*"child"'), (2, '0.021*"pool" + 0.015*"small" + 0.015*"area" + 0.012*"parcs" + 0.012*"room" + 0.011*"woburn" + 0.010*"clean" + 0.009*"swim"'), (3, '0.018*"service" + 0.017*"lodge" + 0.013*"book" + 0.012*"staff" + 0.011*"go" + 0.010*"time" + 0.010*"leave" + 0.010*"guest"'), (4, '0.021*"price" + 0.015*"food" + 0.015*"activity" + 0.014*"parcs" + 0.014*"time" + 0.012*"holiday" + 0.012*"expensive" + 0.012*"center"')]
