In [1]:
import pandas as pd
import re
import tweepy
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
airlines_csv = '../Resources/Airline-Sentiment-2-w-AA.csv'
airlines_df = pd.read_csv(airlines_csv, encoding = 'ISO-8859-1')
airlines_df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/2015 5:24,neutral,1.0,,,Virgin America,,,0,@VirginAmerica What @dhepburn said.,,2/24/2015 11:35,5.7e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/2015 1:53,positive,0.3486,,0.0,Virgin America,,,0,@VirginAmerica plus you've added commercials t...,,2/24/2015 11:15,5.7e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/2015 10:01,neutral,0.6837,,,Virgin America,,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/2015 11:15,5.7e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/2015 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,,0,@VirginAmerica it's really aggressive to blast...,,2/24/2015 11:15,5.7e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/2015 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,,0,@VirginAmerica and it's a really big bad thing...,,2/24/2015 11:14,5.7e+17,,Pacific Time (US & Canada)


In [3]:
features = airlines_df.iloc[:, 14].values
features
labels = airlines_df.iloc[:, 5].values
print(labels)

['neutral' 'positive' 'neutral' ... 'neutral' 'negative' 'neutral']


In [5]:
# cleaned the data by chaning to lowercase

clean_data = []                 
for feature in features:
        item = ' '.join(word.lower() for word in str(feature).split() \
            if not word.startswith('#') and \
            not word.startswith('@') and \
            not word.startswith('http') and \
            not word.startswith('RT'))
        
        if item == "" or item == "RT":
                continue
        clean_data.append(item)
        

In [6]:
# https://www.earthdatascience.org/courses/earth-analytics-python/using-apis-natural-language-processing-twitter/calculate-tweet-word-frequencies-in-python/
# used code from link above to remove all special characters 
def remove_url(txt):
    return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())

cleaned_data_no_urls = [remove_url(tweet) for tweet in clean_data]
cleaned_data_no_urls[:10]

['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']

In [7]:
# Encode the labels to numbers
sentiments = ['positive', 'negative', 'neutral']
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(sentiments)
list(le.classes_)
train_labels = le.transform(labels) 

In [8]:
#create pipeline using Multinomial naive_bayes
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [9]:
#set parameters to test
from sklearn.model_selection import GridSearchCV
parameters = {
#number of combined words for tokenization
'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
#remove words above a specified threshold (used in place of stop words)
'vect__max_df': (0.25, 0.5, 0.75, 1.0),
#include idf
'tfidf__use_idf': (True, False),
#smoothing parameter
'clf__alpha': (1e-2, 1e-3, 1),
#change prior probabilities
'clf__fit_prior': (True, False)}

In [10]:
#create gridsearch model
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False)

In [11]:
#fit model
predictor = gs_clf.fit(cleaned_data_no_urls, train_labels)

KeyboardInterrupt: 

In [None]:
print(f'Best Score: {predictor.best_score_}')                              

for param_name in sorted(parameters.keys()):
    print((param_name, predictor.best_params_[param_name]))

In [None]:
from joblib import dump, load
dump(predictor, 'Models/MultinomialB.joblib') 

In [None]:
gs_clf.predict(['God is love'])[0]]

In [None]:
#create pipeline using Complement naive_bayes
from sklearn.naive_bayes import ComplementNB
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', ComplementNB()),
])

In [None]:
#set parameters to test
parameters = {
#number of combined words for tokenization
'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
#remove words above a specified threshold (used in place of stop words)
'vect__max_df': (0.25, 0.5, 0.75, 1.0),
#include idf
'tfidf__use_idf': (True, False),
#smoothing parameter
'clf__alpha': (1e-2, 1e-3, 1),
#change prior probabilities
'clf__fit_prior': (True, False)}

In [None]:
#create gridsearch model
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False)

In [None]:
#fit model
predictor = gs_clf.fit(cleaned_data_no_urls, train_labels)

In [None]:
print(f'Best Score: {predictor.best_score_}')                              

for param_name in sorted(parameters.keys()):
    print((param_name, predictor.best_params_[param_name]))

In [None]:
from joblib import dump, load
dump(predictor, 'Models/Complement.joblib') 

In [None]:
#create pipeline using Complement naive_bayes
from sklearn.naive_bayes import BernoulliNB
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BernoulliNB()),
])

In [None]:
#set parameters to test
parameters = {
#number of combined words for tokenization
'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
#remove words above a specified threshold (used in place of stop words)
'vect__max_df': (0.25, 0.5, 0.75, 1.0),
#include idf
'tfidf__use_idf': (True, False),
#smoothing parameter
'clf__alpha': (1e-2, 1e-3, 1),
#change prior probabilities
'clf__fit_prior': (True, False),
#change values to boolean
'clf__binarize': (0,1,2)}

In [None]:
#create gridsearch model
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False)

In [None]:
#fit model
predictor = gs_clf.fit(cleaned_data_no_urls, train_labels)

In [None]:
print(f'Best Score: {predictor.best_score_}')                              

for param_name in sorted(parameters.keys()):
    print((param_name, predictor.best_params_[param_name]))

In [None]:
from joblib import dump, load
dump(predictor, 'Models/Bernoulli.joblib') 