# Importing the needed Libraries 

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import wikipedia
import spacy
from textblob import TextBlob

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# NLP Techniques Lab

In this lab, we'll be practicing a set of advanced NLP techniques using tweets on airline satisfaction ([originally from Kaggle](https://www.kaggle.com/crowdflower/twitter-airline-sentiment/data)).

The first section asks you to perform LDA on the dataset to summarize the body of tweets. The second section will focus on using this data to predict the sentiment of a given tweet.

Import the data as follows:

In [6]:
import pandas as pd

df = pd.read_csv('datasets/Tweets.csv')
print(df.shape)
df.head()

(14640, 15)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


Use this data to do the following:

#### 1. Use LDA to identify topics in the tweets

Pick a number of topics between 5-20 and use LDA to summarize the corpus of tweets. Print out the top 25 most frequently occuring words in each topic. Do the topics appear cohesive to you? What predominant trends can you find?

In [15]:
tf = TfidfVectorizer(stop_words='english')
X = df['text'].values
X_fit = tf.fit(X)
X_transform = X_fit.transform(X)
X_transform

<14640x14770 sparse matrix of type '<class 'numpy.float64'>'
	with 133911 stored elements in Compressed Sparse Row format>

In [19]:
feature_names = tf.get_feature_names()
lda = LatentDirichletAllocation(n_topics=10)
lda.fit(X_transform)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [23]:
print (lda.components_.shape)

(10, 14770)


In [27]:
results = pd.DataFrame(lda.components_,
                      columns=feature_names)
results.head()

Unnamed: 0,00,000,000114,000419,000ft,000lbs,0011,0016,00a,00am,...,zrh_airport,zsdgzydnde,zsuztnaijq,ztrdwv0n4l,zukes,zurich,zv2pt6trk9,zv6cfpohl5,zvfmxnuelj,zzps5ywve2
0,0.100036,9.584669,0.1,0.1,0.1,0.100028,0.1,0.1,0.1,0.100006,...,0.1,0.1,0.100014,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,4.988719,0.100024,0.1,0.100016,0.10001,0.100057,0.1,0.1,0.100221,0.100012,...,0.1,0.1,0.100003,0.100002,0.100006,0.100007,0.1,0.100023,0.100025,0.100084
2,0.100012,0.100021,0.1,0.1,0.10011,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.100001,0.1,0.1,0.500505,0.1
3,0.100006,0.100009,0.100009,0.403605,0.10001,0.1,0.30622,0.63934,0.1,0.1,...,0.100001,0.1,0.1,0.1,0.1,0.100002,0.100005,0.1,0.10001,0.1
4,0.10001,0.100057,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.100001,...,0.1,0.1,0.344949,0.1,0.1,0.100003,0.1,0.1,0.100027,0.450041


In [49]:
for topic in range(10):
    print('Topic', topic)
    word_list = results.T[topic].sort_values(ascending=False).index
    print (word_list[0:25])
#     print(' '.join(word_list[0:25]), '\n')

Topic 0
Index(['reservation', 'charlotte', 'busy', 'employees', 'attendants',
       'appreciated', 'provide', 'passenger', 'record', 'flown', 'haha',
       'water', 'friendly', 'attitude', 'services', 'warm', '000', 'lovely',
       'unhelpful', 'smh', 'trust', 'class', 'lie', 'level', 'upgrades'],
      dtype='object')
Topic 1
Index(['americanair', 'usairways', 'flight', 'southwestair', 'united',
       'jetblue', 'thanks', 'cancelled', 'help', 'service', 'just', 'hold',
       'hours', 'customer', 'time', 'plane', 'flights', 'flightled', 'need',
       'amp', 've', 'delayed', 'hour', 'phone', 'got'],
      dtype='object')
Topic 2
Index(['hoping', 'yeah', 'years', 'completely', 'price', 'letting', 'control',
       'mind', 'fare', 'round', 'depart', 'useless', 'bit', 'auto', 'btw',
       'space', 'worth', 'center', 'select', 'south', 'web', 'oh', '2hrs',
       'mention', 'sister'],
      dtype='object')
Topic 3
Index(['info', 'booking', 'able', 'say', 'voucher', 'problems', 'send'

In [50]:
type(word_list)

pandas.core.indexes.base.Index

#### Bonus LDA Question (Tackle if you have time / interest)

Using the `.transform()` method on LDA on the data you fed it will return back a numpy array of shape `(n_rows, n_topics)`. The value in each column will identify the probability that the row in question belongs to that topic. For example, if we were looking at a row of data and an LDA model for three topics, we might see the following:

```python
lda.transform(row_of_data)
>> [[ 0.02, 0.97, 0.01 ]]
```

This would suggest that for that row of data, it is most likely to be in the second topic (compared to the first or third topic).

As a bonus challenge, try the two following questions:

1. For each topic, which tweet most exemplifies (or is most likely to belong to that topic?)
2. Find a recent tweet at an airline that you have used. Can you use the model you have currently to identify what topic does it belongs to?

# For each topic, which tweet most exemplifies (or is most likely to belong to that topic?)

In [97]:
'''Setting up the basic Vectorizer'''

tf = TfidfVectorizer(stop_words='english')
X = df['text'].values
X_fit = tf.fit(X)
X_transform = X_fit.transform(X)

'''Performing the LDA stuff '''
feature_names = tf.get_feature_names()
lda = LatentDirichletAllocation(n_topics=10)
lda.fit(X_transform)

lda_result = lda.transform(X_transform)



In [98]:
print ('Type of LDA Result: ', type(lda_result))
print (lda_result[0:10])

Type of LDA Result:  <class 'numpy.ndarray'>
[[ 0.03808219  0.03808219  0.03808219  0.03808219  0.03808219  0.44828122
   0.03808988  0.24705178  0.03808399  0.03808219]
 [ 0.02827486  0.02827486  0.02827486  0.02827486  0.02827486  0.02827554
   0.45925998  0.31454013  0.02827517  0.02827486]
 [ 0.02923226  0.18219171  0.02923226  0.02923226  0.02923226  0.02923406
   0.02923631  0.58394247  0.02923306  0.02923333]
 [ 0.02382466  0.02382466  0.02382466  0.02382466  0.02382466  0.02382491
   0.70064659  0.10875576  0.02382477  0.02382466]
 [ 0.03111752  0.03111752  0.03111752  0.03111752  0.03111752  0.03111974
   0.03112173  0.56078201  0.03112114  0.19026781]
 [ 0.02229491  0.02229491  0.02229491  0.02229491  0.02229491  0.02229565
   0.02229579  0.70608853  0.1155506   0.02229491]
 [ 0.24699469  0.02492648  0.02492648  0.02492648  0.0249267   0.02492726
   0.02492816  0.55359044  0.02492682  0.02492648]
 [ 0.02463456  0.02463456  0.02463456  0.54148128  0.11488166  0.02463873
   0.0

In [99]:
topics = ['Topic'+ str(i+1) for i in range (10)]

results = pd.DataFrame(lda_result, columns=topics)
results.head()

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10
0,0.038082,0.038082,0.038082,0.038082,0.038082,0.448281,0.03809,0.247052,0.038084,0.038082
1,0.028275,0.028275,0.028275,0.028275,0.028275,0.028276,0.45926,0.31454,0.028275,0.028275
2,0.029232,0.182192,0.029232,0.029232,0.029232,0.029234,0.029236,0.583942,0.029233,0.029233
3,0.023825,0.023825,0.023825,0.023825,0.023825,0.023825,0.700647,0.108756,0.023825,0.023825
4,0.031118,0.031118,0.031118,0.031118,0.031118,0.03112,0.031122,0.560782,0.031121,0.190268


In [100]:
print ('Results Shape: ' , results.shape)
print ('Core DF Shape: ' , df.shape)

Results Shape:  (14640, 10)
Core DF Shape:  (14640, 15)


In [101]:
results.sort_values('Topic1', ascending=False).index[0:2]

Int64Index([76, 388], dtype='int64')

# Find a recent tweet at an airline that you have used. Can you use the model you have currently to identify what topic does it belongs to?

In [102]:
import pprint 

topic_dict = {}
topic_detail_dict = {}

for i in range(1,11): 
    new_i = 'Topic'+ str(i)
    topic_dict[new_i] = results.sort_values(new_i, ascending=False).index[0]
    topic_detail_dict[new_i] = df.iloc[results.sort_values(new_i, ascending=False).index[0]]['text']
    

    
pprint.pprint (topic_dict)
pprint.pprint (topic_detail_dict)

{'Topic1': 76,
 'Topic10': 855,
 'Topic2': 1851,
 'Topic3': 3073,
 'Topic4': 8164,
 'Topic5': 12463,
 'Topic6': 1227,
 'Topic7': 253,
 'Topic8': 826,
 'Topic9': 87}
{'Topic1': '@VirginAmerica Or watch some of the best student films in the '
           'country at 35,000 feet! #CMFat35000feet http://t.co/KEK5pDMGiF',
 'Topic10': '@united http://t.co/hj5kq82Chn, however, is completely under your '
            'control—the price was and still is displayed on '
            'http://t.co/hj5kq82Chn.',
 'Topic2': '@united This is probably the least dependable airline in the '
           'Western Hemisphere. @united does not belong in Star Alliance, but '
           'SkyTeam',
 'Topic3': '@united  oh united, how much I despise thee!',
 'Topic4': '@JetBlue 108 to Portland Maine',
 'Topic5': '@AmericanAir Super Spring Tides and “Tide of The Century” Drawing '
           'tourists to French and U.K coasts:\n'
           'http://t.co/gXdqORtsS0',
 'Topic6': '@United to operated #B767-300ER from #N

#### 2. Use NLP to predict the sentiment of tweets

In this section, please use any of the NLP techniques that we have covered over the last two days to best predict whether a tweet has a negative sentiment or not. Transformation code for your target variable is below.

**Bonus Consideration**: Outside of the text itself, do other factors in the dataset have an effect? Do your results change if you include features like the airline or the timezone of the tweet?

Don't forget to create a training and test set to compare your results. 

In [103]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [104]:
df.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [105]:
df['negative'] = df['airline_sentiment'].apply(lambda x: 1 if x =='negative' else 0)

In [107]:
'''Setting up the Targets and Features '''


X = df['text']
y = df['negative']


X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)


'''Setting up the TFIDVectorizer'''
tf = TfidfVectorizer(stop_words='english')

'''Intantiating Logreg'''
logreg = LogisticRegression()


'''Setting up the Parameters for GridSearch'''
params = {

    'logreg__penalty': ['l1', 'l2'], 
    'logreg__C': [1.0,10,100], 
    'logreg__max_iter': [100,150,200]   
}

'''Setting the Pipeline'''
logreg_tk_pipe = Pipeline([('vect', tf), 
                     ('logreg', logreg)])

'''Fitting the Model on Training'''
gs_logreg = GridSearchCV(logreg_tk_pipe, param_grid=params,n_jobs = -1, verbose=2, scoring='accuracy')
gs_logreg.fit(X_train, y_train)


print ('Best Params: ' , gs_logreg.best_params_) 
print ('Best Score: ', gs_logreg.best_score_)
print ('Train Score: ',gs_logreg.score(X_train, y_train))
print ('Test Score: ', gs_logreg.score(X_test, y_test))
print ('Confusion Matrix on Train \n')
print (confusion_matrix(y_train, gs_logreg.predict(X_train)))
print ('Classification Report on Train\n')
print (classification_report(y_train, gs_logreg.predict(X_train)))
print ('\n')
print ('Confusion Matrix on Test \n')
print (confusion_matrix(y_test, gs_logreg.predict(X_test)))
print ('\n')
print ('Classification Report on Test\n')
print (classification_report(y_test, gs_logreg.predict(X_test)))

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1, total=   0.6s
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1, total=   0.6s
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1, total=   0.6s
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2 .........
[CV] logreg__C=1.0, logreg__max_iter=150, logreg__penalty=l1 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2, total=   0.7s
[CV] logreg__C=1.0, logreg__max_iter=150, logreg__penalty=l1 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2, 

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.3s


[CV]  logreg__C=10, logreg__max_iter=200, logreg__penalty=l2, total=   1.3s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l1 .........
[CV]  logreg__C=10, logreg__max_iter=200, logreg__penalty=l2, total=   1.2s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l1 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l1, total=   0.9s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=10, logreg__max_iter=200, logreg__penalty=l2, total=   1.5s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l1, total=   0.9s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l1, total=   1.1s
[CV] logreg__C=100, logreg__max_iter=150, logreg__penalty=l1 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l2, total=   1.3s
[CV] logreg__C=100, logreg__max_iter=1

[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   17.5s finished


Best Params:  {'logreg__C': 10, 'logreg__max_iter': 100, 'logreg__penalty': 'l2'}
Best Score:  0.820782103825
Train Score:  0.961919398907
Test Score:  0.814890710383
Confusion Matrix on Train 

[[4086  288]
 [ 158 7180]]
Classification Report on Train

             precision    recall  f1-score   support

          0       0.96      0.93      0.95      4374
          1       0.96      0.98      0.97      7338

avg / total       0.96      0.96      0.96     11712



Confusion Matrix on Test 

[[ 792  296]
 [ 246 1594]]


Classification Report on Test

             precision    recall  f1-score   support

          0       0.76      0.73      0.75      1088
          1       0.84      0.87      0.85      1840

avg / total       0.81      0.81      0.81      2928

