#IMPORT ALL LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# LOAD THE DATASET

In [4]:
path = '/content/drive/MyDrive/#3/Tweets.csv'
dataset = pd.read_csv(path)

In [5]:
print(dataset.shape)

(14640, 15)


In [6]:
print(dataset.columns)

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')


In [7]:
print(dataset.head)

<bound method NDFrame.head of                  tweet_id  ...               user_timezone
0      570306133677760513  ...  Eastern Time (US & Canada)
1      570301130888122368  ...  Pacific Time (US & Canada)
2      570301083672813571  ...  Central Time (US & Canada)
3      570301031407624196  ...  Pacific Time (US & Canada)
4      570300817074462722  ...  Pacific Time (US & Canada)
...                   ...  ...                         ...
14635  569587686496825344  ...                         NaN
14636  569587371693355008  ...                         NaN
14637  569587242672398336  ...                         NaN
14638  569587188687634433  ...  Eastern Time (US & Canada)
14639  569587140490866689  ...                         NaN

[14640 rows x 15 columns]>


In [8]:
tweets = dataset.drop(['tweet_id', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],axis=1)

In [9]:
print(tweets.shape)

(14640, 2)


In [10]:
print(tweets.columns)

Index(['airline_sentiment', 'text'], dtype='object')


In [11]:
print(tweets[:5])

  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


In [12]:
np.unique(tweets['airline_sentiment'])

array(['negative', 'neutral', 'positive'], dtype=object)

#LABELS PRE-PROCESSING

Handling categorical data

In [13]:
labels = tweets['airline_sentiment']
for i in labels:
  if i=='negative':
    i=0
  elif i=='neutral':
    i=1
  else:
    i=2

#TEXT PRE-PROCESSING

Converting to lower case

In [14]:
for i in range(len(tweets['text'])):
  tweets['text'][i] = tweets['text'][i].lower()

Removel of Mention - @

In [15]:
def remove_mentions(word):       
    result = re.sub(r"@\S+", "", word)       
    return result

for i in range(len(tweets['text'])):
    tweets['text'][i] = remove_mentions(tweets['text'][i])

Html tag removal

In [16]:
def remove_hyperlink(word):       
    return re.sub(r"http\S+", "", word)

for i in range(len(tweets['text'])):
  tweets['text'][i] = remove_hyperlink(tweets['text'][i])

Removal of Special Characters

In [17]:
def remove_special_characters(word):       
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))    
    return result

for i in range(len(tweets['text'])):
  tweets['text'][i] = remove_special_characters(tweets['text'][i])

Removal of Puntuation

In [18]:
for i in range(len(tweets['text'])):
  tweets['text'][i] = "".join([char.lower() for char in tweets['text'][i] if char not in string.punctuation])

Remove any extra white spaces

In [19]:
for i in range(len(tweets['text'])):
  tweets['text'][i] = re.sub('\s+', ' ', tweets['text'][i]).strip()

Removal of Stopword

In [20]:
ENGLISH_STOP_WORDS = set(stopwords.words('english'))
def remove_stop_words(words):
    mylist = words.split(" ")       
    result = " ".join([i for i in mylist if i not in ENGLISH_STOP_WORDS])     
    return result

for i in range(len(tweets['text'])):
  tweets['text'][i] = remove_stop_words(tweets['text'][i])

Remove the numbers

In [21]:
for i in range(len(tweets['text'])):
  tweets['text'][i] = re.sub(r'\d+', '', tweets['text'][i])

Tokenization

In [22]:
tweet_text_list = []
for tweet in tweets['text']:
    # print(word_tokenize(tweet))
    tweet_text_list.append(word_tokenize(tweet))


Lemmatize or stemming

In [23]:
## using stemming here
stemmer = PorterStemmer()
def stem_words(text):       
    return "".join([stemmer.stem(word) for word in text])

for i in range(len(tweets['text'])):
  tweets['text'][i] = stem_words(tweets['text'][i])

Print first 5 rows of data after pre-processing

In [24]:
print(tweets['text'][:5])

0                                                 said
1        plus youve added commercials experience tacky
2         didnt today must mean need take another trip
3    really aggressive blast obnoxious entertainmen...
4                                 really big bad thing
Name: text, dtype: object


# VECTORIZATION

Using CountVectorizer

In [25]:
cvector = CountVectorizer( min_df=2,max_features=100000)
cvector.fit(tweets['text'])
tweets_processed = cvector.transform(tweets['text']).toarray()

In [26]:
X1_train, X1_test, y1_train, y1_test = train_test_split(tweets_processed, labels, test_size=0.2, random_state=30)

Using TfidfVectorizer

In [27]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

tf_vector = get_feature_vector(np.array(tweets['text']).ravel())
X2 = tf_vector.transform(np.array(tweets['text']).ravel())
y2 = np.array(labels).ravel()

In [28]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=30)

# FITTING THE MODELS

Naive Bayes Models

1. For count vectorization
2. For tf-idf

In [29]:
NB_model1 = MultinomialNB()
NB_model1.fit(X1_train, y1_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
NB_model2 = MultinomialNB()
NB_model2.fit(X2_train, y2_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Logistic Regression Models

1. For count vectorization
2. For tf-idf

In [31]:
LR_model1 = LogisticRegression(max_iter=1000, verbose=1, n_jobs=-1)
LR_model1.fit(X1_train, y1_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   47.6s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)

In [32]:
LR_model2 = LogisticRegression(max_iter=1000, verbose=3, n_jobs=-1)
LR_model2.fit(X2_train, y2_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.8s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=3,
                   warm_start=False)

# EVALUATING THE MODELS

In [33]:
def int_to_string(sentiment):
    if sentiment == 0:
        return "negative"
    elif sentiment == 1:
        return "neutral"
    else:
        return "positive"

Naive Bayes Models

In [34]:
y1_predict_nb = NB_model1.predict(X1_test)
print(y1_predict_nb)
print(accuracy_score(y1_test, y1_predict_nb))

['negative' 'negative' 'positive' ... 'negative' 'negative' 'negative']
0.7721994535519126


In [35]:
y2_predict_nb = NB_model2.predict(X2_test)
print(y2_predict_nb)
print(accuracy_score(y2_test, y2_predict_nb))

['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']
0.6885245901639344


Logistic Regression Models

In [36]:
y1_predict_lr = LR_model1.predict(X1_test)
print(y1_predict_lr)
print(accuracy_score(y1_test, y1_predict_lr))

['negative' 'negative' 'positive' ... 'negative' 'neutral' 'neutral']
0.7885928961748634


In [37]:
y2_predict_lr = LR_model2.predict(X2_test)
print(y2_predict_lr)
print(accuracy_score(y2_test, y2_predict_lr))

['negative' 'negative' 'positive' ... 'negative' 'negative' 'neutral']
0.7824453551912568


In [39]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(LR_model1, open(filename, 'wb'))