## Load libraries

In [1]:
## Some basic stuff:

import pandas as pd
import nltk
import re
from nltk import TrigramAssocMeasures, BigramAssocMeasures
from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from sklearn.utils import shuffle
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve

# Loads data

In [2]:
# Read the raw airport reviews; the path is relative to the notebook's working directory.
reviewsdf = pd.read_csv("airportquality.csv")
# Shuffle the rows so that the positional train/test slices taken further down
# are random samples. The fixed seed keeps those splits, and every score that
# depends on them, reproducible across kernel restarts.
reviewsdf = shuffle(reviewsdf, random_state=42)
# Show two rows as a quick sanity check (seeded so the displayed rows are stable).
reviewsdf.sample(2, random_state=42)

Unnamed: 0,airport_name,link,title,author,author_country,date,content,experience_airport,date_visit,type_traveller,overall_rating,queuing_rating,terminal_cleanliness_rating,terminal_seating_rating,terminal_signs_rating,food_beverages_rating,airport_shopping_rating,wifi_connectivity_rating,airport_staff_rating,recommended
17562,yangon-airport,/airport-reviews/yangon-airport,Yangon Airport customer review,Pablo Kleinman,,2/16/2005,Yangon International looks like an airport stu...,,,,,,,,,,,,,0
15305,sao-paulo-guarulhos-airport,/airport-reviews/sao-paulo-guarulhos-airport,Sao Paulo Guarulhos Airport customer review,M Robertson,United Kingdom,1/8/2013,Transited through GRU 3 times in 2.5 weeks Dec...,,,,4.0,2.0,3.0,,,,2.0,,,0


In [3]:
# Pull the review texts and their ratings out of the frame as plain lists.
# Column-wise boolean masks replace five separate row-by-row iterrows() passes;
# the resulting lists hold the same values in the same (shuffled) row order.
allContent = reviewsdf["content"].tolist()

# Reviews that carry a terminal cleanliness rating, aligned with those ratings.
hasCleanRating = reviewsdf["terminal_cleanliness_rating"].notnull()
reviewsClean = reviewsdf.loc[hasCleanRating, "content"].tolist()
Cleanrating = reviewsdf.loc[hasCleanRating, "terminal_cleanliness_rating"].tolist()

# Reviews that carry an overall rating, aligned with those ratings.
hasOverallRating = reviewsdf["overall_rating"].notnull()
overallRatingContent = reviewsdf.loc[hasOverallRating, "content"].tolist()
overallRating = reviewsdf.loc[hasOverallRating, "overall_rating"].tolist()

# Sanity check: a few texts, and both cleanliness lists must have equal length.
print(reviewsClean[2:5])
print(len(reviewsClean))
print(len(Cleanrating))

["This has to be the worst airport experience I've had. Check in queues are ludicrous - the airport simply cannot cope at busy times and you genuinely need to arrive about 3 hours before your flight. If not you'll queue for about 90 minutes then get pulled out into a fast track queue for those flights that are leaving in 60 minutes or less. Then this will take another 20 minutes which will leave you with no time to get through the horrendous security queues. At least the security people were smiling and trying to lighten the mood when we were there. Once in departures there is a depressing lack of any shops or decent food outlets and if you end up at one of the island gate areas there are not enough seats and barely any amenities. Luckily by this point you will probably not have any time to notice this as you will be running to catch your flight. I better not mention that Geneva also has a unique baggage priority handling system which is a polite way of saying they allow people to chec

# Prepares data

In [4]:
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer; Cleaner.stem_tokens below reads this
# module-level name, so it must be defined before any Cleaner is used.
stemmer = SnowballStemmer("english")

class Cleaner:
    """Tokenize and stem a text, optionally adding collocation tokens.

    Parameters
    ----------
    in_bigrams : iterable of 2-item sequences of str, optional
        Word pairs, expressed as stemmed tokens, that should be emitted as an
        extra ``"first_second"`` token whenever they occur next to each other.
    in_trigrams : iterable of 3-item sequences of str, optional
        Same idea for word triples, emitted as ``"first_second_third"``.

    Attributes
    ----------
    bigrams, trigrams : list
        Private copies of the collocations given to the constructor.
    """

    def __init__(self, in_bigrams=(), in_trigrams=()):
        # Copy into fresh lists: immutable defaults plus a copy mean instances
        # never share a container with each other or with the caller.
        self.bigrams = list(in_bigrams or ())
        self.trigrams = list(in_trigrams or ())

    def stem_tokens(self, text):
        """Return the stemmed tokens of ``text`` plus any known collocations.

        The text is lower-cased, split into runs of word characters, and every
        token containing at least one ASCII letter is stemmed with the
        module-level ``stemmer``. Collocations are then looked up on that
        stemmed sequence, which is the same representation this method
        returns when no collocations are configured. Collocations learned
        from this method's own output therefore match here. Each hit is
        appended to the end of the result as one underscore-joined token.

        Parameters
        ----------
        text : str
            Raw text of one document.

        Returns
        -------
        list of str
            Stemmed tokens in document order, followed by matched bigram
            tokens and then matched trigram tokens.
        """
        tokenizer = RegexpTokenizer(r'\w+')
        # Stop-word removal is currently switched off; swap in
        # nltk.corpus.stopwords.words('english') to enable it.
        stopwords = []
        words = [word for word in tokenizer.tokenize(text.lower())
                 if word not in stopwords]

        # Tokens without any letter (pure numbers, underscores) are dropped.
        stems = [stemmer.stem(word) for word in words
                 if re.search('[a-zA-Z]', word)]

        features = list(stems)

        # Use the instance's own collocations (not same-named globals) and a
        # set for constant-time membership tests. Both n-gram windows slide
        # over the plain stem sequence, so appended bigram tokens can never
        # leak into a trigram.
        if self.bigrams:
            known_bigrams = set(map(tuple, self.bigrams))
            features.extend("_".join(gram) for gram in nltk.bigrams(stems)
                            if gram in known_bigrams)

        if self.trigrams:
            known_trigrams = set(map(tuple, self.trigrams))
            features.extend("_".join(gram) for gram in nltk.trigrams(stems)
                            if gram in known_trigrams)

        return features

# Smoke test: stem the first cleanliness-rated review with a collocation-free
# Cleaner and show tokens 2 through 9 of the result.
fclean = Cleaner()
sample_stems = fclean.stem_tokens(reviewsClean[0])
print(sample_stems[2:10])

['travel', 'extens', 'over', 'the', 'last', 'year', 'through', 'busi']


In [5]:
# Stem every review once, without collocations, to build the token corpus the
# collocation finders learn from.
fcleaner = Cleaner()
texts = list(map(fcleaner.stem_tokens, allContent))

# Bigrams: discard pairs seen fewer than 10 times, rank the rest by likelihood
# ratio, keep the best 1000, then drop pairs that repeat the same token.
bigramfinder = BigramCollocationFinder.from_documents(texts)
bigramfinder.apply_freq_filter(10)
bigrams = [
    (first, second)
    for first, second in bigramfinder.nbest(BigramAssocMeasures.likelihood_ratio, 1000)
    if first != second
]
print(bigrams[:5])

# Trigrams: same procedure; only triples made of one token three times are dropped.
trigramfinder = TrigramCollocationFinder.from_documents(texts)
trigramfinder.apply_freq_filter(10)
trigrams = [
    (first, second, third)
    for first, second, third in trigramfinder.nbest(TrigramAssocMeasures.likelihood_ratio, 1000)
    if not (first == second and first == third and second == third)
]
print(trigrams[:5])

[('check', 'in'), ('duti', 'free'), ('passport', 'control'), ('if', 'you'), ('don', 't')]
[('check', 'in', 'the'), ('secur', 'check', 'in'), ('check', 'in', 'desk'), ('check', 'in', 'counter'), ('check', 'in', 'area')]


In [6]:
## Final data prep functions
# A single Cleaner that knows the learned collocations tokenizes for both vectorizers.
reviewCleaner = Cleaner(bigrams, trigrams)

# Raw term counts; terms appearing in fewer than 1% of the documents are ignored.
vectorizer = CountVectorizer(
    tokenizer=reviewCleaner.stem_tokens,
    min_df=0.01,
)

# TF-IDF weights; additionally ignores terms appearing in more than half of the documents.
tfidf = TfidfVectorizer(
    tokenizer=reviewCleaner.stem_tokens,
    use_idf=True,
    min_df=0.01,
    max_df=0.5,
)

# Predict score Cleanliness MLP Regressor

In [7]:
import sys
import logging

# Send log records of level DEBUG and above to stdout, printing only the
# message text, so they show up in the notebook output.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format="%(message)s")

In [8]:
# Count features for the reviews that have an overall rating.
vectorCorpus = vectorizer.fit_transform(raw_documents=overallRatingContent)
# TF-IDF features for the reviews that have a cleanliness rating.
vectorCorpusIDF = tfidf.fit_transform(raw_documents=reviewsClean)

In [9]:
# One hidden layer of 50 logistic units, fit on the first 10000 count vectors;
# the remaining rows are scored in the next cell.
# NOTE(review): the negative tol presumably keeps training running for all
# max_iter epochs instead of stopping on convergence - confirm this is intended
# and still accepted by the installed scikit-learn version.
MLPpredictorC = MLPRegressor(
    hidden_layer_sizes=50,
    activation="logistic",
    learning_rate="invscaling",
    learning_rate_init=0.01,
    max_iter=200,
    tol=-1,
    verbose=True,
)

fitFeaturesC, fitTargetsC = vectorCorpus[0:10000], overallRating[0:10000]
MLPpredictorC.fit(fitFeaturesC, fitTargetsC)
print("ok")

Iteration 1, loss = 4.03602917
Iteration 2, loss = 2.82872986
Iteration 3, loss = 2.07628572
Iteration 4, loss = 1.79450632
Iteration 5, loss = 1.62605180
Iteration 6, loss = 1.50470052
Iteration 7, loss = 1.41347211
Iteration 8, loss = 1.31871298
Iteration 9, loss = 1.23895939
Iteration 10, loss = 1.17061733
Iteration 11, loss = 1.10651235
Iteration 12, loss = 1.04230537
Iteration 13, loss = 0.98888145
Iteration 14, loss = 0.92242927
Iteration 15, loss = 0.85154767
Iteration 16, loss = 0.78196845
Iteration 17, loss = 0.70571522
Iteration 18, loss = 0.62717271
Iteration 19, loss = 0.55764893
Iteration 20, loss = 0.49603682
Iteration 21, loss = 0.43702765
Iteration 22, loss = 0.38990602
Iteration 23, loss = 0.33475699
Iteration 24, loss = 0.29707693
Iteration 25, loss = 0.26553500
Iteration 26, loss = 0.23938015
Iteration 27, loss = 0.21321647
Iteration 28, loss = 0.19134935
Iteration 29, loss = 0.17239660
Iteration 30, loss = 0.15637605
Iteration 31, loss = 0.14076512
Iteration 32, los



In [10]:
# Score the regressor on the rows it was not fit on, then compare one
# individual prediction with its true overall rating.
heldOutScoreC = MLPpredictorC.score(vectorCorpus[10000:], overallRating[10000:])
print(heldOutScoreC)

spotRowC = 11000
spotPredictionC = MLPpredictorC.predict(vectorCorpus[spotRowC])
print("Prediction: " + str(spotPredictionC) + " Actual: " + str(overallRating[spotRowC]))

0.235467323628
Prediction: [ 2.92574405] Actual: 3.0


In [11]:
# Deeper regressor with early stopping on a 10% validation split.
# Despite the "IDF" in its name it is fit on the count matrix vectorCorpus,
# using the first 8000 rows for training and the rest for scoring.
MLPpredictorIDF = MLPRegressor(
    hidden_layer_sizes=(8, 4, 10, 4, 20),
    activation="logistic",
    early_stopping=True,
    validation_fraction=0.1,
    max_iter=2000,
    learning_rate="invscaling",
    learning_rate_init=0.001,
)

fitFeaturesIDF, fitTargetsIDF = vectorCorpus[0:8000], overallRating[0:8000]
MLPpredictorIDF.fit(fitFeaturesIDF, fitTargetsIDF)

heldOutScoreIDF = MLPpredictorIDF.score(vectorCorpus[8000:], overallRating[8000:])
print(heldOutScoreIDF)

spotRowIDF = 10000
spotPredictionIDF = MLPpredictorIDF.predict(vectorCorpus[spotRowIDF])
print("Prediction: " + str(spotPredictionIDF) + " Actual: " + str(overallRating[spotRowIDF]))

0.347027041479
Prediction: [ 6.26488538] Actual: 10.0


# Predict recommended

In [12]:
## Collect data
# Every review is used here: the text is the input and the `recommended`
# column is the label. Reading the columns directly replaces two row-by-row
# iterrows() passes and yields the same values in the same order.
reviewsRec = reviewsdf["content"].tolist()
ratingRec = reviewsdf["recommended"].tolist()
# Note: this refits the shared count vectorizer on the full corpus, so its
# vocabulary no longer matches matrices produced by earlier fit_transform calls.
vcorpRec = vectorizer.fit_transform(reviewsRec)
#vcorpRecIDF = tfidf.fit_transform(reviewsRec)
#vcorpRecIDF

In [18]:
# Classifier for the count features: two hidden layers of 40 ReLU units,
# early stopping on a 10% validation split, progress printed per iteration.
RecPrecC = MLPClassifier(
    hidden_layer_sizes=(40, 40),
    activation="relu",
    early_stopping=True,
    validation_fraction=0.1,
    learning_rate="invscaling",
    max_iter=2000,
    tol=-1,
    verbose=True,
)

# Deeper, funnel-shaped classifier intended for the TF-IDF features.
RecPrecIDF = MLPClassifier(
    hidden_layer_sizes=(120, 60, 30, 15, 7, 3),
    activation="relu",
    early_stopping=True,
    validation_fraction=0.1,
    learning_rate="invscaling",
    max_iter=300,
    tol=-0.01,
)

In [19]:
from sklearn.metrics import confusion_matrix

# Train on the first 13000 reviews and evaluate on the remainder.
RecPrecC.fit(vcorpRec[:13000], ratingRec[:13000])
recTestX, recTestY = vcorpRec[13000:], ratingRec[13000:]

print("Pure count:")
print(RecPrecC.score(recTestX, recTestY))
# Share of recommended reviews in the whole data set, as a baseline for the accuracy above.
print(reviewsdf["recommended"].mean(axis=0))
# confusion_matrix takes (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing the predictions first transposes it.
print(confusion_matrix(recTestY, RecPrecC.predict(recTestX)))

#RecPrecIDF.fit(vcorpRecIDF[:13000],ratingRec[:13000])
#print("IDF:")
#print(RecPrecIDF.score(vcorpRecIDF[13000:], ratingRec[13000:]))
#print(reviewsdf["recommended"].mean(axis=0))
#print(confusion_matrix(ratingRec[13000:], RecPrecIDF.predict(vcorpRecIDF[13000:])))

Iteration 1, loss = 0.51131412
Validation score: 0.793077
Iteration 2, loss = 0.42641728
Validation score: 0.799231
Iteration 3, loss = 0.37768721
Validation score: 0.797692
Iteration 4, loss = 0.33190520
Validation score: 0.797692
Iteration 5, loss = 0.27524740
Validation score: 0.793077
Iteration 6, loss = 0.21338111
Validation score: 0.790769
Iteration 7, loss = 0.14267393
Validation score: 0.800000
Iteration 8, loss = 0.08111738
Validation score: 0.787692
Iteration 9, loss = 0.04349513
Validation score: 0.795385
Iteration 10, loss = 0.02235588
Validation score: 0.786923
Iteration 11, loss = 0.01200979
Validation score: 0.791538
Iteration 12, loss = 0.00794181
Validation score: 0.790000
Iteration 13, loss = 0.00533505
Validation score: 0.793846
Iteration 14, loss = 0.00524413
Validation score: 0.788462
Iteration 15, loss = 0.00429151
Validation score: 0.790769
Iteration 16, loss = 0.00315943
Validation score: 0.793846
Iteration 17, loss = 0.00322398
Validation score: 0.790769
Iterat



In [16]:
# Two ratios of the form a / (a + b) built from hand-typed counts.
# NOTE(review): the counts were presumably copied from a confusion matrix
# printed by an earlier cell - confirm which cells they came from before
# labelling these values as precision or recall.
test, test2 = (
    count / (count + other)
    for count, other in ((3395, 295), (329, 702))
)
print(test)
print(test2)

0.9200542005420054
0.31910766246362754


In [17]:
# 10-fold cross-validation of the count-feature classifier over all reviews.
scores = cross_val_score(
    estimator=RecPrecC,
    X=vcorpRec,
    y=ratingRec,
    cv=10,
)



In [None]:
# Show the per-fold scores returned by the 10-fold cross_val_score call above.
print(scores)

[ 0.79526227  0.78950339  0.78893905  0.80248307  0.7765237   0.79288939
  0.78781038  0.78668172  0.80417607  0.8013544 ]


In [None]:
# Rebuild the count-feature classifier without early stopping, so that
# max_iter alone decides how long each fit in the sweep below trains.
RecPrecC = MLPClassifier(hidden_layer_sizes=(40, 40), activation="relu",
                         validation_fraction=0.1, learning_rate="invscaling", tol=-1)

# Sweep max_iter from 1 to 499 and record train/validation scores for each value.
# param_name and param_range are passed by keyword: they are keyword-only in
# current scikit-learn releases, and the keyword form works on older ones too.
# This refits the network once per value and fold, so expect a long run time.
train_scores, valid_scores = validation_curve(
    RecPrecC, vcorpRec, ratingRec,
    param_name="max_iter",
    param_range=list(range(1, 500)),
    verbose=True,
    n_jobs=2,
)