## Data Exploration, Cleanup and Analysis

In [1]:
# Widen the notebook's main container to 95% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))


In [12]:
#Standard library
import os
import re
import sqlite3

#Essentials
import numpy as np
import pandas as pd
import pandas.io.sql as pd_sql

#Plotting fun
import matplotlib.pyplot as plt
import seaborn as sns

#NLP modules
from geotext import GeoText
from nltk.tag import pos_tag
from nltk.tokenize import wordpunct_tokenize

%matplotlib inline

In [3]:
#Setting up for working with SQLite database
# The database location can be overridden through the TWEETS_SQLITE_FILE
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used.
sqlite_file = os.environ.get(
    'TWEETS_SQLITE_FILE',
    '/Users/auste_m/ds/metis/metisgh/github/metis_projects/Customer_Review_Sentiment_Analysis/Datasets/twitter-airline-sentiment/database.sqlite',
)

# sqlite3.connect() creates a brand-new empty database when the target file is
# missing (as long as its directory exists), and the mistake would only show up
# later as a "no such table" error. Fail early with a clear message instead.
if not os.path.isfile(sqlite_file):
    raise FileNotFoundError('SQLite database not found: ' + sqlite_file)

conn = sqlite3.connect(sqlite_file)
cursor = conn.cursor()

In [4]:
#Check one of the rows in the table
# Only the first row is read below, so the query asks for exactly one row
# (it previously requested 20 and used one).
preview = cursor.execute("SELECT * FROM Tweets LIMIT 1")
# Each entry of cursor.description describes one result column; item 0 is its name.
columns = [column[0] for column in preview.description]
print('The columns of the table are:' + ' \n' + str(columns) + '\n')
print('Preview of one of the rows in the table:' + '\n' + str(preview.fetchone()))

The columns of the table are: 
['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']

Preview of one of the rows in the table:
(567588278875213824, 'neutral', 1, '', '', 'Delta', '', 'JetBlueNews', '', 0, "@JetBlue's new CEO seeks the right balance to please passengers and Wall ... - Greenfield Daily Reporter http://t.co/LM3opxkxch", '', '2015-02-16 23:36:05 -0800', 'USA', 'Sydney')


In [5]:
#Retrieve relevant information from Tweets table in SQLite database and store them in a pandas dataframe
# The alias is written as a bare identifier: in SQL, single quotes denote string
# literals, and SQLite only accepts 'tweet' as a column alias for compatibility.
# The resulting column is still named `tweet`.
query = """SELECT airline, retweet_count, text AS tweet
            FROM Tweets"""

tweets_df = pd.read_sql_query(query, conn)

In [6]:
#Sanity check
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" to the output).
tweets_df.info()
tweets_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14485 entries, 0 to 14484
Data columns (total 3 columns):
airline          14485 non-null object
retweet_count    14485 non-null int64
tweet            14485 non-null object
dtypes: int64(1), object(2)
memory usage: 339.6+ KB
None


Unnamed: 0,airline,retweet_count,tweet
0,Delta,0,@JetBlue's new CEO seeks the right balance to ...
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...
2,United,0,@united yes. We waited in line for almost an h...
3,United,0,@united the we got into the gate at IAH on tim...
4,Southwest,0,@SouthwestAir its cool that my bags take a bit...


In [7]:
#Clean up tweet column, separate the "@word" from the rest of the tweet
# Raw string so "\w" is not treated as an (invalid) string escape. The handle
# must start with a letter and then extends over every following word
# character, so handles containing digits or underscores are matched in full.
pattern = re.compile(r'@[A-Za-z]\w*')


def _strip_first_mention(tweet):
    """Return `tweet` with every occurrence of its first @handle removed.

    A tweet that contains no @handle is returned unchanged instead of
    raising AttributeError on the missing match.
    """
    match = pattern.search(tweet)
    if match is None:
        return tweet
    # The handle is plain text, so remove it literally rather than re-using it
    # as a regular expression.
    return tweet.replace(match.group(), '')


tweets_df['cleaner_tweet'] = [_strip_first_mention(tweet) for tweet in tweets_df['tweet']]

In [8]:
#Sanity check
# print(tweets_df['cleaner_tweet'][12500:12505])
# print(tweets_df['tweet'][12500:12505])

In [9]:
#Show the first five rows to compare the tweet and cleaner_tweet columns
tweets_df.head(n=5)

Unnamed: 0,airline,retweet_count,tweet,cleaner_tweet
0,Delta,0,@JetBlue's new CEO seeks the right balance to ...,'s new CEO seeks the right balance to please p...
1,Delta,0,@JetBlue is REALLY getting on my nerves !! 😡😡 ...,is REALLY getting on my nerves !! 😡😡 #nothappy
2,United,0,@united yes. We waited in line for almost an h...,yes. We waited in line for almost an hour to ...
3,United,0,@united the we got into the gate at IAH on tim...,the we got into the gate at IAH on time and h...
4,Southwest,0,@SouthwestAir its cool that my bags take a bit...,"its cool that my bags take a bit longer, dont..."


In [None]:
# class RecommendationEngine:
    
#     def __init__(self, vectorizer, n_components, reducer):
#         self.vectorizer = vectorizer
#         self.n_dim = n_components
#         self.reducer = reducer(n_components)
        
#     def fit(self, text):
#         self.vector_data = self.vectorizer.fit_transform(text)
#         self.topic_data = self.reducer.fit_transform(self.vector_data)
#         self.text = text
        
#     def recommend(self, article, num_to_return):