In [1]:
import pandas as pd
import nltk
from nltk import TrigramAssocMeasures, BigramAssocMeasures
from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from sklearn import feature_extraction
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
import re

# Loads data 

In [2]:
reviewsdf = pd.read_csv("airportquality.csv")
reviewsdf.sample(2)

Unnamed: 0,airport_name,link,title,author,author_country,date,content,experience_airport,date_visit,type_traveller,overall_rating,queuing_rating,terminal_cleanliness_rating,terminal_seating_rating,terminal_signs_rating,food_beverages_rating,airport_shopping_rating,wifi_connectivity_rating,airport_staff_rating,recommended
1971,bilbao-airport,/airport-reviews/bilbao-airport,Bilbao Airport customer review,Borja Diez,,9/8/2004,The airport designed by Santiago Calatrava it'...,,,,,,,,,,,,,0
17220,vienna-airport,/airport-reviews/vienna-airport,Vienna Airport customer review,Simon Smith,,5/29/2008,I used this airport in transit from LHR to Sof...,,,,2.0,,,,,,,,,0


In [3]:
reviews = [row["content"] for index, row in reviewsdf.iterrows()]
ratings = [row["terminal_cleanliness_rating"] for index, row in reviewsdf.iterrows()]
recommended = [row["recommended"] for index, row in reviewsdf.iterrows()]
print(recommended[2:5])

[1, 0, 0]


#  Set up functions

## Stemming

In [4]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")


class Cleaner:
    def __init__(self, in_bigrams = [], in_trigrams= []):
        self.bigrams = in_bigrams
        self.trigrams = in_trigrams
        
    
    def stem_tokens(self, text):
        tokenizer = RegexpTokenizer(r'\w+')
        stopwords = nltk.corpus.stopwords.words('english')
        tokens = [word for word in tokenizer.tokenize(text.lower())]

        
        stemmed_tokens = [term for term in tokens if term not in stopwords]
        
        if len(self.bigrams) > 0:
            doc_bigrams = list(nltk.bigrams (stemmed_tokens))
            for term in doc_bigrams:
                if term in bigrams:
                    stemmed_tokens.append("_".join(term))
        
        if len(self.trigrams) > 0:
            doc_trigrams = list(nltk.trigrams (stemmed_tokens))
            for term in doc_trigrams:
                if term in trigrams:
                    stemmed_tokens.append("_".join(term))
        
        stemmed_tokens = [stemmer.stem(word) for word in stemmed_tokens 
                          if re.search('[a-zA-Z]', word)]
        
        return stemmed_tokens

cl = Cleaner()
print(cl.stem_tokens(reviews[1])[2:10])

['airport', 'moment', 'expand', 'airport', 'lot', 'build', 'go', 'departur']


## Finding ngram and prepare text

In [5]:
text_cleaner =  Cleaner()
texts = [text_cleaner.stem_tokens(text) for text in reviews]

bigramfinder = BigramCollocationFinder.from_documents(texts)
bigramfinder.apply_freq_filter (30)
bigrams = bigramfinder.nbest(BigramAssocMeasures.likelihood_ratio,200)
bigrams = [(x,y) for x,y in bigrams if x!=y]
print(bigrams[0:5])
trigramfinder = TrigramCollocationFinder.from_documents(texts)
trigramfinder.apply_freq_filter (30)
trigrams = trigramfinder.nbest(TrigramAssocMeasures.likelihood_ratio,200)
trigrams = [(x,y,z) for x,y,z in trigrams if x!=y or x!=z or y!=z]
print(trigrams[0:5])

[('duti', 'free'), ('passport', 'control'), ('x', 'ray'), ('car', 'park'), ('baggag', 'claim')]
[('duti', 'free', 'shop'), ('duti', 'free', 'store'), ('duti', 'free', 'area'), ('duti', 'free', 'expens'), ('small', 'duti', 'free')]


In [6]:
text_cleaner =  Cleaner(bigrams, trigrams)

## Tfidf - Vectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define transformer 
tfidf = TfidfVectorizer(max_df=0.5, max_features=200000, 
                        min_df = 0.01, stop_words="english", 
                        use_idf = True, tokenizer=text_cleaner.stem_tokens)


In [8]:
tfidf_reviews = tfidf.fit_transform(reviews)

# Clustering

In [9]:
from sklearn.metrics.pairwise import cosine_distances

def cosine_distance (X, Y=None, Y_norm_squared=None, squared=False):
    return cosine_distances(X, Y)

from sklearn.cluster import k_means_
k_means_.euclidean_distances = cosine_distance

In [10]:
from sklearn.cluster import KMeans
number = 6
km = KMeans(n_clusters=number)
km.fit(tfidf_reviews)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
clusters = km.labels_.tolist()

In [12]:
ap_reviews = {"content":reviews, "rating" : ratings, "cluster" : clusters, "recommended" : recommended}
frame = pd.DataFrame(ap_reviews, index = [clusters], 
                     columns=['content' , 'rating', 'cluster', 'recommended'])
frame['cluster'].value_counts()
frame.sample(2)

Unnamed: 0,content,rating,cluster,recommended
1,San Juan airport is the best airport in the ca...,,1,0
5,Long queues at immigration and security check ...,3.0,5,0


In [13]:
grouped = frame['recommended'].groupby(frame['cluster'])
print(grouped.mean())
print(frame['cluster'].value_counts())

cluster
0    0.159881
1    0.274449
2    0.195105
3    0.348183
4    0.187166
5    0.158443
Name: recommended, dtype: float64
0    5035
5    4033
3    4018
1    2270
2    1430
4     935
Name: cluster, dtype: int64


In [14]:
texts_by_cluster = []
for i in range(0,number):
    texts_by_cluster.append([])

for index, row in frame.iterrows():
    for i in range (0, number):
        if i == row["cluster"]:
            test = row["content"]
            tokens = text_cleaner.stem_tokens(test)
            texts_by_cluster[i].append(tokens)
            


In [21]:
frame.to_csv("Cluster raing and rec.csv", index= False)

In [15]:
def count_in_cat(texts):
    words_inclass = nltk.TextCollection(texts)
    WordDist = nltk.FreqDist(words_inclass)
    return (WordDist.most_common())

## Keywords by word count

In [16]:
tp = 0 
for cl in texts_by_cluster:
    print(str(tp) + ":" + str(count_in_cat(cl)[0:8]))
    print("")
    tp += 1

0:[('airport', 8751), ('staff', 2770), ('secur', 2466), ('time', 2284), ('one', 1890), ('use', 1836), ('get', 1775), ('travel', 1655)]

1:[('termin', 6401), ('airport', 3423), ('flight', 1539), ('arriv', 1388), ('secur', 1332), ('time', 1163), ('one', 1105), ('check', 1100)]

2:[('passport', 2406), ('control', 1977), ('airport', 1924), ('passport_control', 1696), ('arriv', 1241), ('queue', 1223), ('check', 1070), ('flight', 1057)]

3:[('airport', 6998), ('shop', 2775), ('check', 2146), ('good', 2130), ('secur', 2106), ('area', 1929), ('free', 1879), ('departur', 1674)]

4:[('airport', 1735), ('car', 1520), ('park', 1259), ('termin', 620), ('get', 584), ('arriv', 553), ('minut', 510), ('time', 503)]

5:[('airport', 6202), ('flight', 5764), ('arriv', 4253), ('check', 4133), ('time', 3614), ('secur', 3599), ('hour', 2891), ('immigr', 2886)]



## Keyword by relevance

In [17]:
import math
from operator import itemgetter
# I now create a function to compute relevancy of a word given a topic.
# Formula used is from https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
# now the actual funtion definiton AllW: a list of all tokens, WinT: A list of all tokens in topic, l: constant
def computeRelevance (AllW , WinT, l):
    #First we count word in AllW
    AllCol = nltk.TextCollection(AllW)
    AllDist = nltk.FreqDist(AllCol)
    #Next we do the same for the topic
    TCol = nltk.TextCollection(WinT)
    TDist = nltk.FreqDist(TCol)
    
    wordRelv = []
    for w, c in TDist.items():
        relevance = math.exp((l * math.log(c/len(TCol)) + ((1-l) * math.log((c/len(TCol))/(AllDist[w]/len(AllCol))))))
        wordRelv.append((w,relevance))

    return sorted(wordRelv, key=itemgetter(1), reverse=True)

In [18]:
alltext = []
for cl in texts_by_cluster:
    alltext = alltext + cl 

tp = 0 
for cl in texts_by_cluster:
    print(str(tp) + ":" + str(computeRelevance(alltext, cl, 0.6)[0:10]))
    print("")
    tp +=1

0:[('airport', 0.14582586726181873), ('staff', 0.07955501060934923), ('secur', 0.060820750333789586), ('travel', 0.059214093267843444), ('time', 0.05849654062816647), ('use', 0.056712869016471684), ('one', 0.05315402207338747), ('get', 0.04895570308155398), ('go', 0.04568842556848302), ('year', 0.04427868376136235)]

1:[('termin', 0.264357898069231), ('airport', 0.09917191804106196), ('new', 0.07123276141982192), ('intern', 0.06964515976635742), ('domest', 0.06441081018702124), ('flight', 0.06387705808255645), ('arriv', 0.06088276669782224), ('secur', 0.0571173244780639), ('use', 0.056980893463789935), ('one', 0.05403081715749039)]

2:[('passport', 0.23600464721988956), ('control', 0.21876116758297146), ('passport_control', 0.2157978603143806), ('queue', 0.09760589868737307), ('airport', 0.08452982460030525), ('arriv', 0.08254671248589221), ('check', 0.07181008089945155), ('flight', 0.06652798350927772), ('get', 0.06420866706132745), ('secur', 0.05936869393781771)]

3:[('airport', 0.14

In [25]:
topic4 = computeRelevance(alltext, texts_by_cluster[3], 0.6)
word4 =[]
rel4 = []
for x, y in topic4:
    word4.append(x)
    rel4.append(y)

topic5w = {"word": word4, "relevance" : rel4}
topic5Words = pd.DataFrame(topic4, 
                     columns=['word' , 'relevance'])


In [26]:
topic5Words.to_csv("topic4words.csv", index= False)

In [None]:
topic0 = computeRelevance(alltext, texts_by_cluster[0], 0.6)
word0 =[]
rel0 = []
for x, y in topic0:
    word0.append(x)
    rel0.append(y)

topic0w = {"word": word0, "relevance" : rel0}
topic0Words = pd.DataFrame(topic0w, 
                     columns=['word' , 'relevance'])


In [None]:
topic0Words.to_csv("topic0words.csv", index= False)

## Predicting  in each cluster

In [None]:
frameShuffel = shuffle(frame)
frame.sample(1)

In [None]:
cReviews = [row["content"] for index, row in frameShuffel.iterrows()]
cCluster = [row["cluster"] for index, row in frameShuffel.iterrows()]
cRecommend = [row["recommended"] for index, row in frameShuffel.iterrows()]
print(len(cReviews))
print(len(cCluster))
print(len(cRecommend))

In [None]:
review_by_Cluster = []
for i in range(0,10):
    review_by_Cluster.append([])
recommend_by_Cluster = []
for i in range(0,10):
    recommend_by_Cluster.append([])
    
for x in range(0, 15000):
    review_by_Cluster[cCluster[x]].append(cReviews[x])
    recommend_by_Cluster[cCluster[x]].append(cRecommend[x])

print(review_by_Cluster[1][1])
print(recommend_by_Cluster[1][1])

In [None]:
vectorizer = CountVectorizer(max_df=0.5, min_df=0.01, tokenizer=text_cleaner.stem_tokens)
vectorizer.fit(cReviews)

In [None]:
## learning
networks = []
for i in range(0,7):
    networks.append(MLPClassifier(hidden_layer_sizes=(80, 80), activation="relu", early_stopping = True,
                            validation_fraction = 0.1, max_iter=400, learning_rate = "invscaling", tol = -0.01))
    vRv = vectorizer.transform(review_by_Cluster[i])
    score = recommend_by_Cluster[i]
    networks[i].fit(vRv, score)
print(len(networks))

In [None]:
#Testing
Predicted = []
tp = 0
tn = 0
fp = 0
fn = 0
for x in range(15000,17721):
    vec = vectorizer.transform([cReviews[x]])
    prediction = networks[cCluster[x]].predict(vec)[0]
    
    if prediction == 1:
        if cRecommend[x] == 1:
            tp += 1
        else:
            fp += 1
    elif prediction == 0:
        if cRecommend[x] == 0:
            tn += 1
        else: 
            fn += 1

print("Overall acc:" + str((tp+tn) / (tp+tn+fp+fn)))
print("tp: " + str(tp) + " fn: " + str(fn))
print("fp: " + str(fp) + " tn: " + str(tn))

# Kmeans with Cosine distance

In [None]:
from sklearn.metrics.pairwise import cosine_distances

def cosine_distance (X, Y=None, Y_norm_squared=None, squared=False):
    return cosine_distances(X, Y)

from sklearn.cluster import k_means_
k_means_.euclidean_distances = cosine_distance

In [None]:
km.fit(tfidf_reviews)