In [35]:
import matplotlib.pyplot as plt
plt.style.use({'figure.facecolor':'white'})
from nltk.classify import NaiveBayesClassifier
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [36]:
def extract_features(word_list):
    return dict([(word, True) for word in word_list])

In [37]:
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')

features_positive = [(extract_features(movie_reviews.words(fileids=[f])),'Positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])),'Negative') for f in negative_fileids]

print(features_positive[0])
print(features_negative[0])

({'films': True, 'adapted': True, 'from': True, 'comic': True, 'books': True, 'have': True, 'had': True, 'plenty': True, 'of': True, 'success': True, ',': True, 'whether': True, 'they': True, "'": True, 're': True, 'about': True, 'superheroes': True, '(': True, 'batman': True, 'superman': True, 'spawn': True, ')': True, 'or': True, 'geared': True, 'toward': True, 'kids': True, 'casper': True, 'the': True, 'arthouse': True, 'crowd': True, 'ghost': True, 'world': True, 'but': True, 'there': True, 's': True, 'never': True, 'really': True, 'been': True, 'a': True, 'book': True, 'like': True, 'hell': True, 'before': True, '.': True, 'for': True, 'starters': True, 'it': True, 'was': True, 'created': True, 'by': True, 'alan': True, 'moore': True, 'and': True, 'eddie': True, 'campbell': True, 'who': True, 'brought': True, 'medium': True, 'to': True, 'whole': True, 'new': True, 'level': True, 'in': True, 'mid': True, '80s': True, 'with': True, '12': True, '-': True, 'part': True, 'series': True

In [38]:
import sqlite3

sqlite_connection = sqlite3.connect("tweets/database.sqlite")
cursor = sqlite_connection.cursor()
print("Connected to database.")
query = "select text, airline_sentiment from Tweets where airline_sentiment != 'neutral';"
rows = cursor.execute(query).fetchall()
input_text = []
output_sentiment = []
for row in rows:
    input_text.append(row[0])
    output_sentiment.append('Negative' if row[1] == 'negative' else 'Positive')


Connected to database.


In [39]:
features = [(extract_features(text.strip().split(" ")), sentiment) for text, sentiment in zip(input_text, output_sentiment)]
for feature in features[:4]:
    print(feature)

({'@JetBlue': True, 'is': True, 'REALLY': True, 'getting': True, 'on': True, 'my': True, 'nerves': True, '!!': True, '😡😡': True, '#nothappy': True}, 'Negative')
({'@united': True, 'yes.': True, 'We': True, 'waited': True, 'in': True, 'line': True, 'for': True, 'almost': True, 'an': True, 'hour': True, 'to': True, 'do': True, 'so.': True, 'Some': True, 'passengers': True, 'just': True, 'left': True, 'not': True, 'wanting': True, 'wait': True, 'past': True, '1am.': True}, 'Negative')
({'@united': True, 'the': True, 'we': True, 'got': True, 'into': True, 'gate': True, 'at': True, 'IAH': True, 'on': True, 'time': True, 'and': True, 'have': True, 'given': True, 'our': True, 'seats': True, 'closed': True, 'flight.': True, 'If': True, 'you': True, 'know': True, 'people': True, 'is': True, 'arriving,': True, 'to': True, 'wait': True}, 'Negative')
({'@SouthwestAir': True, 'its': True, 'cool': True, 'that': True, 'my': True, 'bags': True, 'take': True, 'a': True, 'bit': True, 'longer,': True, 'd

In [40]:
from sklearn.model_selection import train_test_split
input_features = [elem[0] for elem in features]
output = [elem[1] for elem in features]
x_train, x_test, y_train, y_test = train_test_split(input_features, output, test_size=.2, shuffle=True)
print('Train size:', len(x_train))
print('Test size', len(x_test))
train_data = [(feature, out) for feature, out in zip(x_train, y_train)]
validation_data = [(feature, out) for feature, out in zip(x_test, y_test)]

Train size: 9132
Test size 2284


In [41]:
classifier = NaiveBayesClassifier.train(train_data)
print('Accuracy for the classifier: ', nltk.classify.util.accuracy(classifier, validation_data))


Accuracy for the classifier:  0.8918563922942206


## Compute old posts from multiple train data.

In [42]:
from random import shuffle
size_of_post = 5
indexes = [i for i in range(len(validation_data))]
shuffle(indexes)
posts = []
while indexes:
    posts_text = []
    sentiments = []
    for idx in indexes[:size_of_post]:
        posts_text += validation_data[idx][0]
        sentiments += [validation_data[idx][1]]
        indexes.remove(idx)
    posts.append((extract_features(posts_text), "Negative" if sentiments.count("Negative") > size_of_post // 2 else "Positive"))

for post in posts[:3]:
    print('Text:', post[0])
    print('Sentiment: ', post[1])

Text: {'@AmericanAir': True, 'when': True, 'the': True, 'pilot': True, 'announces': True, 'that': True, 'plane': True, 'has': True, 'been': True, 'unused': True, 'for': True, '2': True, 'days': True, 'and': True, 'maintenance': True, 'is': True, 'dealing': True, 'with': True, 'another': True, 'aircraft..': True, '@united': True, 'No': True, 'but': True, 'u': True, 'cld': True, 'explain': True, 'how': True, 'such': True, 'a': True, 'disorganized': True, 'inefficient': True, 'company': True, 'w': True, 'surly': True, 'obviously': True, 'unhappy': True, 'employees': True, 'stays': True, 'in': True, 'business.': True, '@SouthwestAir': True, 'looks': True, 'like': True, 'Bellagio': True, 'to': True, 'me!': True, 'Good': True, 'luck': True, 'people!': True, 'See': True, 'you': True, 'at': True, 'show': True, 'Vinyl': True, '@USAirways': True, 'big': True, 'thanks': True, 'gate': True, 'agent': True, 'flt5127..keeping': True, 'us': True, 'informed': True, 'of': True, 'delay#greatcustomerservi

In [43]:
print('Accuracy for the classifier with posts: ', nltk.classify.util.accuracy(classifier, posts))

Accuracy for the classifier with posts:  0.9409190371991247
