## Data Exploration, Cleanup and Analysis

In [1]:
# Widen the notebook container so wide DataFrames and long tweets fit on screen.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))


In [2]:
#Essentials
import numpy as np
import pandas as pd

#SQL related
import sqlite3
import pandas.io.sql as pd_sql

#API related
import requests

#Plotting fun
import matplotlib.pyplot as plt

#Nice to have
import seaborn as sns
import re
from calendar import month_name

#NLP modules
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import treebank_chunk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import gensim
from nltk.util import ngrams
from collections import Counter
from operator import itemgetter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import scale
from sklearn.datasets import fetch_mldata
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN, SpectralClustering, MeanShift
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import SpectralClustering
from mpl_toolkits.mplot3d import Axes3D
from gensim import corpora, models, similarities, matutils
from geotext import GeoText

from sklearn.manifold import TSNE

#Neural nets
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

%matplotlib inline

In [3]:
#Setting up for working with SQLite database
import os

# The default is the original author's local path. Set the TWEETS_SQLITE_PATH
# environment variable to point at the database on any other machine.
sqlite_file = os.environ.get(
    'TWEETS_SQLITE_PATH',
    '/Users/auste_m/ds/metis/metisgh/github/metis_projects/Customer_Review_Sentiment_Analysis/Datasets/twitter-airline-sentiment/database.sqlite',
)

# sqlite3.connect() creates a brand-new empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
if not os.path.isfile(sqlite_file):
    raise FileNotFoundError('SQLite database not found: ' + sqlite_file)

conn = sqlite3.connect(sqlite_file)
cursor = conn.cursor()

In [4]:
#Peek at the Tweets table: list its column names and show the first fetched row
preview = cursor.execute("SELECT * FROM Tweets LIMIT 20")

#Each entry of cursor.description is a tuple whose first item is the column name
columns = [col_description[0] for col_description in preview.description]

columns_report = 'The columns of the table are:' + ' \n' + str(columns) + '\n'
row_report = 'Preview of one of the rows in the table:' + '\n' + str(preview.fetchone())

print(columns_report)
print(row_report)

The columns of the table are: 
['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']

Preview of one of the rows in the table:
(567588278875213824, 'neutral', 1, '', '', 'Delta', '', 'JetBlueNews', '', 0, "@JetBlue's new CEO seeks the right balance to please passengers and Wall ... - Greenfield Daily Reporter http://t.co/LM3opxkxch", '', '2015-02-16 23:36:05 -0800', 'USA', 'Sydney')


In [5]:
#Retrieve relevant information from Tweets table in SQLite database and store them in a pandas dataframe
#Only three columns are pulled: airline, retweet_count and text (aliased to 'tweet')
#NOTE(review): the query has no ORDER BY, so rows arrive in whatever order SQLite returns them;
#confirm that order is stable if other cells rely on row positions
query = """SELECT airline, retweet_count, text as 'tweet' 
            FROM Tweets"""


tweets_df = pd.read_sql_query(query, conn)

In [6]:
#Sanity check
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() also printed a stray "None".
tweets_df.info()
tweets_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14485 entries, 0 to 14484
Data columns (total 3 columns):
airline          14485 non-null object
retweet_count    14485 non-null int64
tweet            14485 non-null object
dtypes: int64(1), object(2)
memory usage: 339.6+ KB
None


Unnamed: 0,airline,retweet_count,tweet
0,Delta,0,@JetBlue's new CEO seeks the right balance to ...
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...
2,United,0,@united yes. We waited in line for almost an h...
3,United,0,@united the we got into the gate at IAH on tim...
4,Southwest,0,@SouthwestAir its cool that my bags take a bit...


### Put aside a test set

In [7]:
# Hold out the rows at positions 10000-11999 as the test set; every other row is training data.
# .copy() makes the test set an independent frame rather than a slice of tweets_df.
tweets_test = tweets_df.iloc[10000:12000].copy()

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The original index labels are kept, so the training index is non-contiguous.
tweets_train = pd.concat([tweets_df.iloc[:10000], tweets_df.iloc[12000:]])

In [8]:
# tweets_train.info()
# tweets_test.info()

#### Let's gather airport information from an external API

In [9]:
#First need to get global airport database through an API request
import os

# The API key is read from the environment instead of being committed with the
# notebook. The key that used to be hardcoded here should be treated as exposed
# and rotated.
aviation_edge_api_key = os.environ.get('AVIATION_EDGE_API_KEY')
if not aviation_edge_api_key:
    raise RuntimeError(
        'Set the AVIATION_EDGE_API_KEY environment variable before running this cell.'
    )

airport_db_url = 'https://aviation-edge.com/api/public/airportDatabase'
get_response = requests.get(
    airport_db_url,
    params={'key': aviation_edge_api_key},
    timeout=30,  # do not hang the kernel forever if the service is unreachable
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
get_response.raise_for_status()
airport_db = get_response.json()

#Then I need to extract the information that is relevant to me (airport names and codes)
#Two entries are stored per airport, in this order: IATA code, then airport name
airport_info = [
    value
    for airport in airport_db
    for value in (airport['codeIataAirport'], airport['nameAirport'])
]

#Test that results make sense
# if 'IAH' in airport_codes:
#     print(airport_db[airport_codes.index('IAH')])
# else:
#     print(False)

In [10]:
# Two entries are stored per airport (IATA code and name), so this is twice the number of airports
len(airport_info)

20102

### Let's have some regex fun!

In [11]:
#helper function to remove stuff from tweets

def remove_airline(string):
    """Takes a string as input.
    Returns the same string with Twitter @-mentions (e.g. airline handles) removed.

    A mention is an '@' followed by an ASCII letter and at least one more word
    character; the whole handle is removed, including digits and underscores.
    Surrounding whitespace is left as it is.
    Anything that is not a str (e.g. None) is returned unchanged."""
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    if not isinstance(string, str):
        return string
    # Substituting with the pattern itself (rather than re-searching for each
    # matched handle) keeps a short handle such as '@united' from eating the
    # front of a longer one such as '@unitedairlines'.
    handle_pattern = re.compile(r'@[A-Za-z]\w+')
    return handle_pattern.sub('', string)

def remove_hashtag(string):
    """Takes a string as input.
    Returns the same string with hashtags removed.

    A hashtag is a '#' followed by one or more word characters.
    Surrounding whitespace is left as it is.
    Anything that is not a str (e.g. None) is returned unchanged."""
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    if not isinstance(string, str):
        return string
    # One substitution with the pattern removes every hashtag in full; removing
    # the matched texts one by one let '#fail' strip the front of '#failure'.
    return re.sub(r'#\w+', '', string)


def remove_code(string):
    """Takes a string as input.
    Returns the same string with any capital letter & digit combination text removed.

    Two passes are made:
      1. digits followed by one or more capital letters, together with at most
         one capital letter directly in front of the digits (e.g. '5PM', 'A123AB');
      2. every remaining run of digits.
    A capital letter in front of digits that are NOT followed by capitals is
    kept ('B12' becomes 'B').
    Anything that is not a str (e.g. None) is returned unchanged."""
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    if not isinstance(string, str):
        return string
    # Substituting with the patterns directly removes every match. Removing the
    # matched texts one at a time was order dependent: in '1 21' the '1' was
    # stripped first, after which '21' no longer existed and a '2' was left behind.
    without_codes = re.sub(r'[A-Z]?\d+[A-Z]+', '', string)
    return re.sub(r'\d+', '', without_codes)
    
    
def remove_url(string):
    """Takes a string as input.
    Returns the same string with any t.co short urls removed.

    Only Twitter's 'http://t.co/...' and 'https://t.co/...' links are
    recognised; other urls are left in place.
    Anything that is not a str (e.g. None) is returned unchanged."""
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    if not isinstance(string, str):
        return string
    # The dot is escaped so only the literal host 't.co' matches, and the
    # pattern is substituted directly so that a short link cannot strip the
    # front of a longer link that starts with the same characters.
    return re.sub(r'https?://t\.co/\w+', '', string)


def remove_location(string):
    """Takes a string as input.
    Returns a new string with location information removed.

    Locations are the cities and countries that GeoText reports for the
    string; every occurrence of each reported name is deleted."""
    geo_loc = GeoText(string)
    # assumes GeoText exposes cities/countries as sequences of str - TODO confirm
    locations = list(geo_loc.cities) + list(geo_loc.countries)

    new_string = string
    # Longest names first, so a name that is contained in a longer one cannot
    # break the longer one apart before it is removed.
    for loc in sorted(locations, key=len, reverse=True):
        # Literal replacement: the names are plain text, not regex patterns, so
        # characters such as '.' or '(' in a name must not be interpreted.
        new_string = new_string.replace(loc, '')
    return new_string
    
    
def remove_month(string):
    """Takes a string as input.
    Returns a new string with month information removed.

    Month names are taken from calendar.month_name and matched case-sensitively
    as whole words, so 'May' is removed but 'Mayor' and 'march' are kept.
    A month directly followed by punctuation ('March,') is removed as well.
    Anything that is not a str (e.g. None) is returned unchanged."""
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    if not isinstance(string, str):
        return string
    # month_name[0] is an empty placeholder, hence the slice
    month_alternatives = '|'.join(re.escape(name) for name in month_name[1:])
    # Word boundaries keep the removal from cutting month names out of the
    # middle of longer words.
    month_pattern = r'\b(?:' + month_alternatives + r')\b'
    return re.sub(month_pattern, '', string)


def remove_emoji(string):
    """Takes a string as input.
    Returns a new string with emojis removed.

    Note: the last character range below is very broad (U+24C2 through
    U+1F251), so it also strips non-emoji characters that fall in that span,
    such as CJK ideographs.
    Anything that is not a str (e.g. None) is returned unchanged."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    if not isinstance(string, str):
        return string
    # Substituting with the pattern removes every emoji run. Removing the matched
    # runs one by one could leave emojis behind when one run was the beginning
    # of another.
    return emoji_pattern.sub('', string)

def remove_airport(string, airport_names=None):
    """Takes a string as input.
    Returns a new string with airport codes removed.

    A whitespace-separated token is removed when it is exactly equal to one of
    the known airport entries; the whitespace around it is left as it is.

    airport_names: optional collection of airport codes/names to remove. When
    omitted, the notebook-level `airport_info` list is used, so that list must
    have been built before this function is called without the argument.
    Anything that is not a str (e.g. None) is returned unchanged."""
    # Explicit pass-through for non-text values instead of a blanket `except: pass`
    # (which also used to hide a missing `airport_info`).
    if not isinstance(string, str):
        return string
    known_airports = airport_info if airport_names is None else airport_names

    def _drop_if_airport(match):
        # Blank out the token only when the whole token is a known airport entry
        token = match.group()
        return '' if token in known_airports else token

    # Working token by token keeps a code such as 'LAX' from being cut out of
    # unrelated words like 'RELAX', and never treats tweet text as a regex.
    return re.sub(r'\S+', _drop_if_airport, string)


#### Examining hashtag containing tweets

In [12]:
# #Identify all the tweets containing hashtags
# pattern2 = re.compile('#[A-Za-z]+\w')
# count_hash_tweets = 0

# for index, tweet in enumerate(tweets_train['tweet']):
#     try:
#         h_tweet = pattern2.search(tweet).group()
#         print(index, h_tweet)
#         count_hash_tweets += 1
#     except:
#         continue
        
# print('\nTotal number of tweets containing hashtags =', str(count_hash_tweets))

#### Same for urls

In [13]:
# # Find url pattern
# https_list = []

# for tweet in tweets_train['tweet']:
#     if url_remove(tweet) == []:
#         pass
#     else:
#         https_list.append(url_remove(tweet))
        
# print(https_list)

In [14]:
# #Testing code_remove function
# test_string = tweets_train['tweet'][1582]
# print(test_string)
# print(url_remove(test_string))

> Wooohooo!!!

#### Now it's location time

In [15]:
# madrid_tweet = tweets_train['tweet'][14451]
# geo = GeoText(madrid_tweet)
# geo.cities

### Let's get sweeping

In [None]:
#Clean up tweet column, remove the "@word" from the rest of the tweet
tweets_train['tweet_clean'] = tweets_train['tweet'].apply(remove_airline)

#Clean up tweet column, remove the hashtags from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_hashtag)

#Clean up tweet column, remove code-like elements from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_code)

#Clean up tweet column, remove urls from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_url)

#Clean up tweet column, remove locations from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_location)

#Clean up tweet column, remove month names from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_month)

#Clean up tweet column, remove emojis from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_emoji)

#Clean up tweet column, remove airport codes and names from all tweets
tweets_train['tweet_clean'] = tweets_train['tweet_clean'].apply(remove_airport)

#### Mini moment of truth

In [None]:
#Let's make sure it works (indexes to test = 1, 3, 1582, 12805, 14451)
print(tweets_train['tweet'][1], '\n')
print(tweets_train['tweet_clean'][1], '\n')

print(tweets_train['tweet'][3], '\n')
print(tweets_train['tweet_clean'][3], '\n')

print(tweets_train['tweet'][1582], '\n')
print(tweets_train['tweet_clean'][1582], '\n')

print(tweets_train['tweet'][12805], '\n')
print(tweets_train['tweet_clean'][12805], '\n')

print(tweets_train['tweet'][14451], '\n')
print(tweets_train['tweet_clean'][14451], '\n')


In [None]:
# Show the first 10 training rows, including the tweet_clean column added above
tweets_train.head(10)

In [None]:
# class RecommendationEngine:
    
#     def __init__(self, vectorizer, n_components, reducer):
#         self.vectorizer = vectorizer
#         self.n_dim = n_components
#         self.reducer = reducer(n_components)
        
#     def fit(self, text):
#         self.vector_data = self.vectorizer.fit_transform(text)
#         self.topic_data = self.reducer.fit_transform(self.vector_data)
#         self.text = text
        
#     def recommend(self, article, num_to_return):