In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [308]:
#Essentials
import numpy as np
import pandas as pd
import pickle
import re

#Plotting fun
import matplotlib.pyplot as plt
import seaborn as sns

#NLP fun
import nltk
from  collections  import namedtuple


#Text cleaning (stemming, lemmatizing, etc.)
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import inflection
from autocorrect import spell


# from nltk.tokenize import TreebankWordTokenizer
# from nltk.tokenize import wordpunct_tokenize
# from nltk.chunk import ne_chunk
# from nltk.corpus import treebank_chunk
# from operator import itemgetter

# Dimentionality reduction
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation#, PCA 

#Vectorizing
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 


#NLP modelling
# from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.neighbors import NearestNeighbors
# from sklearn.metrics import pairwise_distances
# from sklearn.metrics import silhouette_score
# from sklearn.datasets import fetch_mldata


# from sklearn.cluster import AgglomerativeClustering
# from sklearn.neighbors import kneighbors_graph
# from scipy.cluster.hierarchy import dendrogram, linkage
# from sklearn.cluster import SpectralClustering
# from mpl_toolkits.mplot3d import Axes3D

#from gensim import corpora, models, similarities, matutils, Word2vec, emoji2vec

# #Dim reduction for visualization
# from sklearn.manifold import TSNE


#Neural nets
# import torch
# import torch.nn as nn
# from torch.autograd import Variable
# import torch.nn.functional as F

#Model testing
# from sklearn import metrics

#Nice to have
# from collections import Counter

%matplotlib inline

### Helper functions

In [370]:
def simple_pos(pos):
    if pos == 'NOUN':
        return 'n'
    elif pos == 'VERB':
        return 'v'
    elif pos == 'ADJ':
        return 'a'
    elif pos == 'ADV':
        return 'r'
    elif pos == 'ADJ_SAT':
        return 's'
    else:
        return 'n'


def process_words(string):
    """Takes a string as input.
    Returns the string after spell correcting, lemmatizing and singularizing the words."""
    processed_string = string
    lmtzr = WordNetLemmatizer()
    word_pos_pair = pos_tag(word_tokenize(processed_string), tagset='universal')
    for word, pos in word_pos_pair:
        word_pos = simple_pos(pos)
        spell_correct = ' '+spell(word) if "'" in word else spell(word)
        lame_word = lmtzr.lemmatize(spell_correct, word_pos)
        singular_word = inflection.singularize(lame_word)
        processed_string = processed_string.replace(word, singular_word)
    return processed_string 
    
    
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [372]:
#Testing
print(process_words("flying brilliant flightled"))
print(process_words("bleeming cancelledr flights"))
print(process_words("I like delays less than you because don't the o..."))

fly brilliant flight
bleed cancelled flight
I like delay less than you because do not the o...


In [262]:
#Moving forward read from pickle rather than repeat above steps
with open('negative_tweets.pkl', 'rb') as picklefile:
    negative_tweets = pickle.load(picklefile)

In [5]:
negative_tweets = negative_tweets.drop(columns=['tweet', 'retweet_count', 'sentiment'])
negative_tweets.head()

Unnamed: 0,airline,tweet_clean
1,Delta,is REALLY getting on my nerves !!
2,United,yes. We waited in line for almost an hour to ...
3,United,the we got into the gate at on time and have...
6,United,I like delays less than you because I'm the o...
7,United,", link to current status of flights/airports? ..."


## Now, it's time to vectorize the tweets

In [257]:
#Some prep work (lemmatize)
tweet_col = negative_tweets['tweet_clean']
tweet_col = [lemmatize(tweet) for tweet in tweet_col]


In [260]:
tweet_col[:10]

[' be REALLY get on my nerve !!  ',
 ' yes. We wait in line for almost an hour to do so. Some passenger just leave not want to wait past be.',
 ' the we get into the gate at  on time and have give our seat and close the flight. If you know people be arrive, have to wait',
 " I like delay less than you because I'm the one on the plane. Connect me with a voucher",
 ", link to current status of flights/airports? Fly BWI-EWR-MCO this morning yet can't yet tell what any problem be except see snow.",
 ' I try  DM it would not go thru... not sure why',
 " i have item of sentimental value that I'm heartbroken be miss",
 ' We have be stick in  for several hour and no one be answer here. Really tough to  SW. No response be bad.',
 ' be officially the bad, most delay, and least helpful airline I have ever have the mbefortune of fly on',
 " won't answer their phone    "]

In [117]:
#Some prep work
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"', '!', ':', 'yes', 'no']
stop = set(stop)

#### Count Vectorizer method

In [118]:
params = {
            'strip_accents': 'ascii',
            'lowercase': True,
            'stop_words': stop, 
            'max_df': 0.5,
            'min_df': 10,
            'ngram_range': (1, 2)
         }

In [256]:
#Count vectorizer case
count_vec = CountVectorizer(**params)

negative_tweets_cv = count_vec.fit_transform(tweet_col)
negative_tweets_cv = negative_tweets_cv.toarray()
negative_tweets_cv = pd.DataFrame(negative_tweets_cv, columns=count_vec.get_feature_names())
negative_tweets_cv.head()

TypeError: expected string or bytes-like object

#### TF-IDF method

In [120]:
#TF-IDF 
tfidf_vec = TfidfVectorizer(**params)
negative_tweets_tfidf = tfidf_vec.fit_transform(tweet_col)
negative_tweets_tfidf = negative_tweets_tfidf.toarray()
negative_tweets_tfidf = pd.DataFrame(negative_tweets_tfidf, columns=tfidf_vec.get_feature_names())
negative_tweets_tfidf.head()

Unnamed: 0,aa,able,absolute,absolutely,acceptable,access,account,actually,add,advisory,...,wrong,wtf,yall,yeah,year,years,yesterday,yet,yr,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465505,0.0,0.0


## Dimensionality reduction fun

In [121]:
#Number of topics
n_topics = 10
n_words = 10
n_iter = 20

#### Different dimensionality reduction techniques applied CountVectorized data

In [122]:
#CountVectorizer + LSA
lsa_cv = TruncatedSVD(n_components=n_topics)
lsa_cv_data = lsa_cv.fit_transform(negative_tweets_cv)

In [123]:
print('Count vectorizer and PCA:')
display_topics(lsa_cv, count_vec.get_feature_names(), n_words)

Count vectorizer and PCA:

Topic  0
flight, cancelled, flightled, cancelled flightled, get, late, delayed, flight cancelled, late flight, hours

Topic  1
service, customer, customer service, worst, get, amp, ever, terrible, worst customer, flights

Topic  2
cancelled, cancelled flightled, flightled, flight cancelled, flights, flighted, cancelled flighted, flightled flight, hold, tomorrow

Topic  3
get, hours, plane, amp, time, late, us, still, back, gate

Topic  4
late, flightr, late flightr, hours, late flight, still, call, flightled, cancelled flightled, amp

Topic  5
get, late, late flight, flightr, late flightr, flightled, cancelled flightled, customer, customer service, service

Topic  6
flighted, cancelled flighted, amp, cancelled, call, late, flighted flight, problems, help, booking problems

Topic  7
hours, delayed, flighted, cancelled flighted, hold, cancelled, flights, flight delayed, get, flighted flight

Topic  8
amp, delayed, hours, flight delayed, bag, hold, get, help, de

In [124]:
#CountVectorizer + NMF
nmf_cv = NMF(n_components=n_topics)
nmf_cv_data = nmf_cv.fit_transform(negative_tweets_cv)

In [125]:
print('Count vectorizer and NMF:')
display_topics(nmf_cv, count_vec.get_feature_names(), n_words)

Count vectorizer and NMF:

Topic  0
flight, late flight, cancelled flight, problems, booking problems, booking, flight booking, flight flight, flight cancelled, next

Topic  1
service, customer, customer service, worst, ever, terrible, worst customer, poor, airline, bad

Topic  2
cancelled, cancelled flightled, flightled, flight cancelled, flightled flight, flights, hold, tomorrow, got, flightled flights

Topic  3
get, trying, home, trying get, get home, need, phone, back, way, bag

Topic  4
late, flightr, late flight, late flightr, hours, call, still, hours late, back, hrs

Topic  5
amp, bag, lost, help, day, still, phone, back, one, call

Topic  6
cancelled, flighted, cancelled flighted, flights, flight cancelled, hold, flighted flight, weather, cancelled flight, help

Topic  7
delayed, hours, flight delayed, flights, delayed hours, hold, due, delayed flight, ua, delayed due

Topic  8
plane, us, gate, hour, delay, waiting, sitting, one, agent, stuck

Topic  9
time, worst, airline, ev

In [126]:
#CountVectorizer + LDA
lda_cv = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=n_iter,
                                    random_state=42,
                                    learning_method='online')

lda_cv.fit_transform(negative_tweets_cv);

In [127]:
print('Count vectorizer and LDA:')
display_topics(lda_cv, count_vec.get_feature_names(), n_words)

Count vectorizer and LDA:

Topic  0
flight, going, us, call, back, way, got, miss, connection, need

Topic  1
phone, check, change, amp, online, travel, hrs, flight, told, system

Topic  2
people, hour, gate, flight, know, rude, agent, boarding, problem, please

Topic  3
still, waiting, hours, thanks, baggage, minutes, wait, hold, ua, bag

Topic  4
flight, cancelled, late, delayed, flightled, cancelled flightled, late flight, hours, flighted, cancelled flighted

Topic  5
flight, flights, cancelled, flying, tomorrow, united, cancelled flight, airlines, airport, tried

Topic  6
one, plane, take, us, trip, aa, flights, missed, response, times

Topic  7
service, customer, customer service, bad, weather, getting, ticket, missing, terrible, would

Topic  8
airline, never, flight, really, worst, problems, time, fly, like, ever

Topic  9
get, lost, trying, luggage, bag, stuck, day, one, amp, passengers


#### Different dimensionality reduction techniques applied TF-IDFed data

In [128]:
#LSA + TF-IDF
lsa_tfidf = TruncatedSVD(n_components=n_topics)
lsa_tfidf_data = lsa_tfidf.fit_transform(negative_tweets_tfidf)

In [129]:
print('TF-IDF and PCA:')
display_topics(lsa_tfidf, tfidf_vec.get_feature_names(), n_words)

TF-IDF and PCA:

Topic  0
flight, cancelled, cancelled flightled, flightled, get, service, delayed, flight cancelled, customer, hours

Topic  1
service, customer, customer service, worst, ever, worst customer, terrible, airline, never, bad

Topic  2
cancelled, cancelled flightled, flightled, customer, customer service, service, flight cancelled, flightled flight, flights, flighted

Topic  3
flight, late flight, late, customer service, delayed, customer, service, flight delayed, cancelled flight, booking problems

Topic  4
worst, airline, ever, delayed, worst airline, never, time, flights, airline ever, experience

Topic  5
delayed, plane, flight delayed, cancelled flightled, flightled, gate, delay, hours, hour, delayed flight

Topic  6
flighted, cancelled flighted, problems, booking problems, booking, hold, delayed, hour, flight booking, get

Topic  7
flighted, cancelled flighted, hours, hold, late, flights, cancelled, plane, flightr, late flightr

Topic  8
plane, gate, delay, hour, si

In [130]:
#LDA + TF-IDF
lda_tfidf = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=n_iter,
                                    random_state=42,
                                    learning_method='online')

lda_tfidf_data = lda_tfidf.fit_transform(negative_tweets_tfidf);

In [131]:
print('TF-IDF and LDA:')
display_topics(lda_tfidf, tfidf_vec.get_feature_names(), n_words)

TF-IDF and LDA:

Topic  0
call, hold, help, seriously, someone, ridiculous, hour, back, lost, right

Topic  1
sorry, well, system, always, wrong, broken, way, fee, customers, nothing

Topic  2
customer, service, customer service, terrible, agents, response, care, sent, dm, line

Topic  3
gate, flight, time, still, thanks, flt, crew, rebooked, get, delay

Topic  4
flight, flighted, cancelled flighted, delayed, miss, cancelled, pay, flight delayed, made, seat

Topic  5
worst, airline, ever, flying, never, experience, aa, rude, flights, disappointed

Topic  6
plane, waiting, ua, sitting, bad, tried, people, still, sucks, mins

Topic  7
late, late flight, flight, flightr, late flightr, mean, miles, everyone, point, idea

Topic  8
problems, flight, booking, booking problems, leave, website, due, delay, long, weather

Topic  9
flight, cancelled, cancelled flightled, flightled, get, tomorrow, stuck, flight cancelled, flights, days


In [132]:
#Using Word2vec with google's pre-trianed vectors
# import os

# google_vec_file = '~/Downloads/GoogleNews-vectors-negative300.bin'
# w2v_google = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

In [133]:
# class RecommendationEngine:
    
#     def __init__(self, vectorizer, n_components, reducer):
#         self.vectorizer = vectorizer
#         self.n_dim = n_components
#         self.reducer = reducer(n_components)
        
#     def fit(self, text):
#         self.vector_data = self.vectorizer.fit_transform(text)
#         self.topic_data = self.reducer.fit_transform(self.vector_data)
#         self.text = text
        
#     def recommend(self, article, num_to_return):