# Neural network on Airport reviews data (NN from sklearn). Recommended rating prediction


Importing the libraries. We are going to use pandas to load data and MLPClassifier from sklearn for NN.

In [133]:
#Natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer
import pandas as pd
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
import re as re


# Used for stemming words
stemmer = LancasterStemmer()

Reading the data file, droping unused attributes

In [134]:
#Reads CVS file as pandas
def readDataPandas():
    reader = pd.read_csv('Project_Code/data/airlinequality.csv',encoding = 'utf8')

    return reader

#Removes unused columns
def chooseTheAttributes(dataset):

    new_data = dataset.drop(['link', 'title','airport_name','author',
                             'author_country','overall_rating',
                             'queuing_rating','airport_shopping_rating',
                             'date','experience_airport',
                             'date_visit','type_traveller',
                             'terminal_cleanliness_rating',
                             'terminal_seating_rating',
                             'terminal_signs_rating',
                             'food_beverages_rating',
                             'wifi_connectivity_rating',
                             'airport_staff_rating'], axis=1)

    return new_data

dataset = readDataPandas()
dataset= chooseTheAttributes(dataset)
#print(dataset)

<li>Total length of the dataset: <b>17721</b>
<li>Training dataset length: <b>12721</b>
<li>Test dataset length: <b>5000</b>

In [135]:
new_train_dataset = dataset[0:12721]
testing_results_dataset = dataset[12721:]

print("Train dataset length %s" % len(new_train_dataset))
print("Test dataset length %s" % len(testing_results_dataset))

Train dataset length 12721
Test dataset length 5000


In [136]:
def reviewsToWords(review):



    letters_only = re.sub("[^a-zA-Z]",  # The pattern to search for
                          " ",  # The pattern to replace it with
                          review)  # The text to search

    lower_case = letters_only.lower() #Converts to lower case
    words = lower_case.split()  #Splits into seperate words

    stops = set(stopwords.words("english"))

    wordsOfReview = [w for w in words if not w in stops] #Removes un-useful words (stops)

    returnValue = ( " ".join(wordsOfReview))   #Joins together words with space


    return returnValue

Importing CountVectorizer which takes the data and transforms it to vectorized array.

In [139]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

count_positive=0
count_negative=0
training_dataset=[]
for index, row in new_train_dataset.iterrows():
        training_dataset.append(reviewsToWords(row['content']))
        if(row['recommended'] == 1):
            count_positive+=1
        else:
            count_negative+=1
  
#Fits all the data
X = vectorizer.fit_transform(training_dataset)

print("There are %s positive and %s negative values" % (count_positive,count_negative))
print("The percentage - %s positive, %s negative" % ((count_positive/float(count_positive+count_negative))*100,
                                                     (count_negative/float(count_positive+count_negative))*100))



There are 2814 positive and 9907 negative values
The percentage - 22.1209024448 positive, 77.8790975552 negative


Taking one review by one and transforming it to vectorized array compared to the whole dataset.
<li> Example: </li>
full = ["Mary likes dogs"] <br/>
train = ["I like cats"] <br/>
output = [0 1 0]

In [140]:
# Create our training data
training = []
output = []

# Create an empty array for our output
output_empty = [0] * 2

for index, row in new_train_dataset.iterrows():
    bag=[]
    
    bag.append(vectorizer.transform([row['content']]).toarray())
    
    training.append(bag[0][0])

    if (row['recommended'] == 1):
        output.append([1,0])
    else:
        output.append([0,1])
        

Creating classifier.<br/>
<b>learning_rate </b> = adaptive <br/>
<b>for activation function we use </b> = the hyperbolic tan function <br/>
<b>solver </b> = 'adam' (works best for large datasets) <br />
<b>hidden layer size </b> = 1 layer, 8 neurons


In [141]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(max_iter=100,
                    learning_rate='adaptive',
                    activation='logistic',
                    solver='lbfgs', 
                    alpha=1e-5,
                    hidden_layer_sizes=(8, 1), 
                    )

Fit training and output lists. <i>(Takes a long time)</i>

In [142]:
import numpy as np
X = np.array(training)
y = np.array(output)
clf.fit(X, y)

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(8, 1), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

Overall evaluation, %

In [143]:
#Accuracy counter and total counter
acc_counter = 0
total = len(testing_results_dataset)
one_counter=0
total_ones = 0
zero_counter = 0
total_zeros =0

#Goes throught test data, compares the results with classify
for index, row in testing_results_dataset.iterrows():
    result = vectorizer.transform([reviewsToWords(row['content'])]).toarray()
    result = clf.predict(result)
    if(result[0][0] == row['recommended']):
        acc_counter+=1
        
    #Testing if the algorithm is not just guessing 0 and getting high accuracy        
    if(row['recommended'] == 1):
        total_ones+=1
    if(row['recommended'] == 0):
        total_zeros+=1
    if(result[0][0] == row['recommended'] and row['recommended'] == 1):
         one_counter+=1  
    if(result[0][0] == row['recommended'] and row['recommended'] == 0):
         zero_counter+=1  
        
print("Total accuracy: ")
print(acc_counter/float(total) * 100)  
print("True positive percentage %s" % (one_counter/float(total_ones) * 100))
print("True negative percentage %s" % (zero_counter/float(total_zeros) * 100))
print("False positive percentage %s" % (100 - (one_counter/float(total_ones) * 100)))
print("False negative percentage %s" % (100 - (zero_counter/float(total_zeros) * 100)))

Total accuracy: 
71.68
True positive percentage 66.4556962025
True negative percentage 73.1638418079
False positive percentage 33.5443037975
False negative percentage 26.8361581921
