## Data Exploration, Cleanup and Analysis

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))


In [2]:
#Essentials
import numpy as np
import pandas as pd
import pickle

#SQL related
import sqlite3
import pandas.io.sql as pd_sql

#API related
import requests

#Preprocessing
import re
from geotext import GeoText
from calendar import month_name
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

#Sentiment analysis
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [3]:
#Setting up for working with SQLite database
sqlite_file = '/Users/auste_m/ds/metis/metisgh/github/metis_projects/Customer_Review_Sentiment_Analysis/Datasets/twitter-airline-sentiment/database.sqlite'

conn = sqlite3.connect(sqlite_file)
cursor = conn.cursor()

In [4]:
#Check one of the rows in the table
preview = cursor.execute("SELECT * FROM Tweets LIMIT 20")
columns = [column[0] for column in preview.description]
print('The columns of the table are:' + ' \n' + str(columns) + '\n')
print('Preview of one of the rows in the table:' + '\n' + str(preview.fetchone()))

The columns of the table are: 
['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']

Preview of one of the rows in the table:
(567588278875213824, 'neutral', 1, '', '', 'Delta', '', 'JetBlueNews', '', 0, "@JetBlue's new CEO seeks the right balance to please passengers and Wall ... - Greenfield Daily Reporter http://t.co/LM3opxkxch", '', '2015-02-16 23:36:05 -0800', 'USA', 'Sydney')


In [5]:
#Retrieve relevant information from Tweets table in SQLite database and store them in a pandas dataframe
query = """SELECT airline, retweet_count, text as 'tweet' 
            FROM Tweets"""


tweets_df = pd.read_sql_query(query, conn)

In [6]:
#Sanity check
print(tweets_df.info())
tweets_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14485 entries, 0 to 14484
Data columns (total 3 columns):
airline          14485 non-null object
retweet_count    14485 non-null int64
tweet            14485 non-null object
dtypes: int64(1), object(2)
memory usage: 339.6+ KB
None


Unnamed: 0,airline,retweet_count,tweet
0,Delta,0,@JetBlue's new CEO seeks the right balance to ...
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...
2,United,0,@united yes. We waited in line for almost an h...
3,United,0,@united the we got into the gate at IAH on tim...
4,Southwest,0,@SouthwestAir its cool that my bags take a bit...


### Put aside a test set

In [7]:
tweets_test = tweets_df[10000:12000]
tweets_train = tweets_df[:10000]
tweets_train = tweets_train.append(tweets_df[12000:])

In [8]:
# tweets_train.info()
# tweets_test.info()

#### Let's gather airport information from an external API

In [9]:
#First need to get global airport database through an API request 
airport_db_url = 'https://aviation-edge.com/api/public/airportDatabase?key=42e87b-a2f1be-c446fa-06d7a2-012f14'
get_response = requests.get(airport_db_url)
airport_db = get_response.json()

#Then I need to extract the information that is relevant to me (airport names and codes)
airport_info = []

for airport in airport_db:
    airport_info.append(airport['codeIataAirport'])
    airport_info.append(airport['nameAirport'])

#Test that results make sense
# if 'IAH' in airport_codes:
#     print(airport_db[airport_codes.index('IAH')])
# else:
#     print(False)

In [10]:
len(airport_info)

20102

### Let's set up some helped functions

In [11]:
#helper function to remove stuff from tweets

def remove_airline(string):
    """Takes a string as input.
    Returns the same string with hashtag removed."""
    pattern1 = re.compile('@[A-Za-z]+\w')
    new_string = string
    try:
        all_airlines = pattern1.findall(new_string)
        for airline in all_airlines:
            new_string = re.sub(airline, '', new_string)
    except:
        pass
    return new_string

def remove_hashtag(string):
    """Takes a string as input.
    Returns the same string with hashtag removed."""
    pattern2 = re.compile('#\w+')
    new_string = string
    try:
        all_hashtags = pattern2.findall(new_string)
        for hashtag in all_hashtags:
            new_string = re.sub(hashtag, '', new_string)
    except:
        pass
    return new_string


def remove_code(string):
    """Takes a string as input.
    Returns the same string with any capital letter & digit combination text removed."""
    pattern3 = re.compile('[A-Z]?\d+[A-Z]+')
    pattern4 = re.compile('\d+')
    new_string = string
    try:
        codes = pattern3.findall(new_string)
        codes.extend(pattern4.findall(new_string))
        for elem in codes:
            new_string = re.sub(elem, '', new_string)
    except:
        pass
    return new_string
    
    
def remove_url(string):
    """Takes a string as input.
    Returns the same string with any urls removed removed."""
    pattern5 = re.compile('http://t.co/\w+')
    new_string = string
    try:
        urls = pattern5.findall(new_string)
        for url in urls:
            new_string = re.sub(url, '', new_string)
    except:
        pass
    return new_string


def remove_location(string):
    """Takes a string as input.
    Returns a new string with location information removed."""
    new_string = string
    geo_loc = GeoText(string)
    locations = []
    if geo_loc.cities != []:
        locations.extend(geo_loc.cities)
    if geo_loc.countries != []:
        locations.extend(geo_loc.countries)
    try:
        for loc in locations:
            new_string = re.sub(loc, '', new_string)
    except:
        pass
    return new_string
    
    
def remove_month(string):
    """Takes a string as input.
    Returns a new string with month information removed."""
    new_string = string
    all_months = month_name[1:]
    try:
        for word in string.split():
            if word in all_months:
                new_string = re.sub(word, '', new_string)
            else:
                continue
    except:
        pass
    return new_string


def remove_emoji(string):
    """Takes a string as input.
    Returns a new string with emojis removed."""    
    pattern6 = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    new_string = string
    try:
        emojis = pattern6.findall(new_string)
        for emoji in emojis:
            new_string = re.sub(emoji, '', new_string)
    except:
        pass
    return new_string


def vectorize_emoji(string):
    """Takes a string and picks put all emojis
    Returns a sentiment associated with the emojis."""
    emoji_sentiment = string.emoji2vec()
    return emoji_sentiment

def remove_airport(string):
    """Takes a string as input.
    Returns a new string with airport codes removed.""" 
    new_string = string
    try:
        for word in string.split():
            if word in airport_info:
                new_string = re.sub(word, '', new_string)
            else:
                continue
    except:
        pass
    return new_string


def get_Vader_sentiment(string):
    """Takes a string as input. Uses text processing mashape API to retrieve the sentiment.
    Returns a sentiment label and score (from -1 to 1, negatives signalling negative sentiment).""" 
    SIA = SentimentIntensityAnalyzer()
    total_score = SIA.polarity_scores(string)
    if total_score['compound'] < -0.05:
        sentiment = 'negative'
    elif total_score['compound'] >= 0.2:
        sentiment = 'positive'
    else:
        sentiment = 'neutral'
    polarity = total_score['compound']
    return (sentiment, polarity)


def get_TextBlob_sentiment(string):
    """Takes a string as input. Uses text processing mashape API to retrieve the sentiment.
    Returns a sentiment label and score (from -1 to 1, negatives signalling negative sentiment).""" 
    sentiment_all = TextBlob(string).sentiment
    if sentiment_all.polarity < -0.05:
        sentiment = 'negative'
    elif sentiment_all.polarity >= 0.2:
        sentiment = 'positive'
    else:
        sentiment = 'neutral'
    polarity = sentiment_all.polarity
    return (sentiment, polarity)


# def get_sentiment(string):
#     """Takes a string as input. Uses text processing mashape API to retrieve the sentiment.
#     Returns a sentiment label and score (from -1 to 1, negatives signalling negative sentiment).""" 
#     #API stuff
#     sentiment_API_url = 'https://japerk-text-processing.p.mashape.com/sentiment/'
#     sentiment_API_response = requests.post(sentiment_API_url,
#                                           data={
#                                             "language": "english",
#                                             "text": test_tweet}
#                                               ,
#                                           headers={
#                                             "X-Mashape-Key": "3hN4k8H8Brmsh0Hp4sTefboY6vHpp1qZZ3jjsnvlGiMsNSK59o",
#                                             "Content-Type": "application/x-www-form-urlencoded",
#                                             "Accept": "application/json"}
#                                           )

#     sentiment_all = sentiment_API_response.json()
#     sentiment = sentiment_all['label']
#     neg_prob = sentiment_all['probability']['neg']
#     return (sentiment, neg_prob)


### Sentiment labelling

In [12]:
#Are the tweets positive or negative (tried TextBlob.sentiment, TextBlob.NaiveBayesAnalyzer, nltk.Vader)

test_tweet = tweets_train['tweet'][14063]
label_TB = TextBlob(test_tweet).sentiment
label_NB = TextBlob(test_tweet, analyzer=NaiveBayesAnalyzer()).sentiment

SIA = SentimentIntensityAnalyzer()
total_score = SIA.polarity_scores(test_tweet)
print("Example tweet: '"+ str(test_tweet), "'")
print('Scores using TextBlob default', str(label_TB))
print('Scores using TextBlob Naive Bayes classifier', str(label_NB))
print('Scores using Vader', str(total_score))
# print('Label and negative probability using sentiment API', str(get_sentiment(test_tweet)))

Example tweet: '@SouthwestAir need to learn how to treat people with respect and just a little dignity. #FAIL '
Scores using TextBlob default Sentiment(polarity=-0.34375, subjectivity=0.39999999999999997)
Scores using TextBlob Naive Bayes classifier Sentiment(classification='pos', p_pos=0.7744402760303103, p_neg=0.22555972396969037)
Scores using Vader {'neg': 0.0, 'neu': 0.594, 'pos': 0.406, 'compound': 0.8024}


In [13]:
#Adding a column for sentiment label and negative score each
tweets_train['sentiment_TextBlob'] = [get_TextBlob_sentiment(tweet)[0] for tweet in tweets_train['tweet']]
tweets_train['polarity_TextBlob'] = [get_TextBlob_sentiment(tweet)[1] for tweet in tweets_train['tweet']]
tweets_train['sentiment_Vader'] = [get_Vader_sentiment(tweet)[0] for tweet in tweets_train['tweet']]
tweets_train['polarity_Vader'] = [get_Vader_sentiment(tweet)[1] for tweet in tweets_train['tweet']]
# tweets_train['sentiment'] = [get_sentiment(tweet)[0] for tweet in tweets_train['tweet']]
# tweets_train['neg_sentiment_prob'] = [get_sentiment(tweet)[1] for tweet in tweets_train['tweet']]

#### Checking the discrepancies between TextBlob and Vader approaches

In [14]:
# tweets_train[(tweets_train['sentiment_TextBlob'] == 'positive') & (tweets_train['sentiment_Vader'] == 'negative')]

In [15]:
# tweets_train[(tweets_train['sentiment_Vader'] == 'positive') & (tweets_train['sentiment_TextBlob'] == 'negative')].sort_values(by='polarity_Vader', ascending=False)

In [16]:
#tweets_train['tweet'][14063]

### Let's get sweeping before clustering

#### Examining hashtag containing tweets

In [17]:
# #Identify all the tweets containing hashtags
# pattern2 = re.compile('#[A-Za-z]+\w')
# count_hash_tweets = 0

# for index, tweet in enumerate(tweets_train['tweet']):
#     try:
#         h_tweet = pattern2.search(tweet).group()
#         print(index, h_tweet)
#         count_hash_tweets += 1
#     except:
#         continue
        
# print('\nTotal number of tweets containing hashtags =', str(count_hash_tweets))

#### Same for urls

In [18]:
# # Find url pattern
# https_list = []

# for tweet in tweets_train['tweet']:
#     if url_remove(tweet) == []:
#         pass
#     else:
#         https_list.append(url_remove(tweet))
        
# print(https_list)

In [19]:
# #Testing code_remove function
# test_string = tweets_train['tweet'][1582]
# print(test_string)
# print(url_remove(test_string))

> Wooohooo!!!

#### Now it's location time

In [20]:
# madrid_tweet = tweets_train['tweet'][14451]
# geo = GeoText(madrid_tweet)
# geo.cities

In [21]:
#Clean up tweet column, remove the "@word" from the rest of the tweet
tweets_train['tweet_clean'] = tweets_train['tweet'].apply(remove_airline)

#Clean up tweet column, remove the hashtags from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_hashtag)

#Clean up tweet column, remove code-like elements from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_code)

#Clean up tweet column, remove urls from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_url)

#Clean up tweet column, remove locations from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_location)

#Clean up tweet column, remove month names from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_month)

#Clean up tweet column, remove emojis from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_emoji)

#Clean up tweet column, remove airport codes and names from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_airport)

#### Mini moment of truth

In [22]:
#Let's make sure it works (indexes to test = 1, 3, 1582, 12805, 14451)
print(tweets_train['tweet'][1], '\n')
print(tweets_train['tweet_clean'][1], '\n')

print(tweets_train['tweet'][3], '\n')
print(tweets_train['tweet_clean'][3], '\n')

print(tweets_train['tweet'][1582], '\n')
print(tweets_train['tweet_clean'][1582], '\n')

print(tweets_train['tweet'][12805], '\n')
print(tweets_train['tweet_clean'][12805], '\n')

print(tweets_train['tweet'][14451], '\n')
print(tweets_train['tweet_clean'][14451], '\n')


@JetBlue is REALLY getting on my nerves !! 😡😡 #nothappy 

 is REALLY getting on my nerves !!   

@united the we got into the gate at IAH on time and have given our seats and closed the flight. If you know people is arriving, have to wait 

 the we got into the gate at  on time and have given our seats and closed the flight. If you know people is arriving, have to wait 

@SouthwestAir took delivery of N8661A, a new Boeing 737-8H4 yesterday. http://t.co/5z9STyUQJ3 #DFW #DAL #airlines 

 took delivery of , a new Boeing - yesterday.     

@united lots of reports of system failures delaying flights over the last week. Currently sitting on the tarmac at OGG for over an hour. 

 lots of reports of system failures delaying flights over the last week. Currently sitting on the tarmac at  for over an hour. 

@AmericanAir 767 seconds from touchdown at Madrid airport in April 2013 #AvGeek http://t.co/1yWXRfn0Gr 

  seconds from touchdown at  airport in     



In [23]:
tweets_train.head(10)

Unnamed: 0,airline,retweet_count,tweet,sentiment_TextBlob,polarity_TextBlob,sentiment_Vader,polarity_Vader,tweet_clean
0,Delta,0,@JetBlue's new CEO seeks the right balance to ...,neutral,0.140693,positive,0.3182,'s new seeks the right balance to please pass...
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...,positive,0.3125,negative,-0.2462,is REALLY getting on my nerves !!
2,United,0,@united yes. We waited in line for almost an h...,negative,-0.125,positive,0.4019,yes. We waited in line for almost an hour to ...
3,United,0,@united the we got into the gate at IAH on tim...,negative,-0.1,neutral,0.0,the we got into the gate at on time and have...
4,Southwest,0,@SouthwestAir its cool that my bags take a bit...,neutral,0.175,positive,0.3182,"its cool that my bags take a bit longer, dont..."
5,United,0,@united and don't hope for me having a nicer f...,neutral,0.080357,neutral,0.1265,and don't hope for me having a nicer flight s...
6,United,0,@united I like delays less than you because I'...,negative,-0.166667,positive,0.3612,I like delays less than you because I'm the o...
7,United,0,"@united, link to current status of flights/air...",positive,0.4,negative,-0.4019,", link to current status of flights/airports? ..."
8,Southwest,0,@SouthwestAir you guys there? Are we on hour 2...,neutral,0.0,neutral,0.0,you guys there? Are we on hour of our phone ...
9,United,0,@united I tried 2 DM it would not go thru... n...,negative,-0.25,negative,-0.2411,I tried DM it would not go thru... not sure why


### I'm only really interested in negative tweets

In [24]:
final_sentiment = []
for row in tweets_train.iterrows():
    index = row[0]
    if tweets_train['sentiment_TextBlob'][index] == 'negative' or tweets_train['sentiment_Vader'][index] == 'negative':
        final_sentiment.append('negative')
    elif tweets_train['sentiment_TextBlob'][index] == 'neutral' or tweets_train['sentiment_Vader'][index] == 'neutral':
        final_sentiment.append('neutral')
    else:
        final_sentiment.append('positive')

tweets_train['sentiment'] = final_sentiment
tweets_train.head()

Unnamed: 0,airline,retweet_count,tweet,sentiment_TextBlob,polarity_TextBlob,sentiment_Vader,polarity_Vader,tweet_clean,sentiment
0,Delta,0,@JetBlue's new CEO seeks the right balance to ...,neutral,0.140693,positive,0.3182,'s new seeks the right balance to please pass...,neutral
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...,positive,0.3125,negative,-0.2462,is REALLY getting on my nerves !!,negative
2,United,0,@united yes. We waited in line for almost an h...,negative,-0.125,positive,0.4019,yes. We waited in line for almost an hour to ...,negative
3,United,0,@united the we got into the gate at IAH on tim...,negative,-0.1,neutral,0.0,the we got into the gate at on time and have...,negative
4,Southwest,0,@SouthwestAir its cool that my bags take a bit...,neutral,0.175,positive,0.3182,"its cool that my bags take a bit longer, dont...",neutral


In [25]:
#Let's filter only on negative tweets (since that is our constructive criticism (or hopefully so))
neg_tweets = tweets_train[tweets_train['sentiment'] == 'negative']
neg_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5192 entries, 1 to 14480
Data columns (total 9 columns):
airline               5192 non-null object
retweet_count         5192 non-null int64
tweet                 5192 non-null object
sentiment_TextBlob    5192 non-null object
polarity_TextBlob     5192 non-null float64
sentiment_Vader       5192 non-null object
polarity_Vader        5192 non-null float64
tweet_clean           5192 non-null object
sentiment             5192 non-null object
dtypes: float64(2), int64(1), object(6)
memory usage: 405.6+ KB


In [26]:
neg_tweets.drop(columns=['sentiment_TextBlob', 'polarity_TextBlob', 'sentiment_Vader', 'polarity_Vader'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [27]:
neg_tweets.head()

Unnamed: 0,airline,retweet_count,tweet,tweet_clean,sentiment
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...,is REALLY getting on my nerves !!,negative
2,United,0,@united yes. We waited in line for almost an h...,yes. We waited in line for almost an hour to ...,negative
3,United,0,@united the we got into the gate at IAH on tim...,the we got into the gate at on time and have...,negative
6,United,0,@united I like delays less than you because I'...,I like delays less than you because I'm the o...,negative
7,United,0,"@united, link to current status of flights/air...",", link to current status of flights/airports? ...",negative


## Saving (just in case)

In [28]:
#Pickle the dataset, just in case 
with open('negative_tweets.pkl', 'wb') as picklefile:
    pickle.dump(neg_tweets, picklefile)