In [240]:
# class sentiment:
#     def __init__(self):
#         import pandas as pd
#         self.pd=pd
#         import numpy as np
#         self.np=np
#         import seaborn as sns
#         self.sns=sns
#         import matplotlib.pyplot as plt
#         self.plt=plt
#         import pipes
#         from pipeline import Pipeline
#         self.Pipeline=Pipeline

        

<font size=5>**Steps to Follow**</font>
1. Loading and Exploring Data
2. Text Cleaning
3. Data Preparation
    1. Label Encoding
    2. Split Data
    3. Feature Engineering using TF-IDF
4. Model Building
    1. Naive Bayes
    2. Logistic Regression
    3. Model Building Summary
5. Final Sentiment Analysis Pipeline

In [241]:
import pandas as pd #for handling data
import numpy as np #for numerical computing
import seaborn as sns #ploting
import matplotlib.pyplot as plt #ploting

import re #for text data cleaning/Library for pattern matching

# for NLP related tasks
import spacy 
nlp=spacy.load("en_core_web_sm")


In [242]:
df=pd.read_csv("tweets 2.csv")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [243]:
df.text.sample(5)

316      @virginamerica the manage itinerary section of...
23       @VirginAmerica will you be making BOS&gt;LAS n...
9629     @USAirways on a happy note our 719 crew is won...
8599                             @JetBlue I do follow you!
14239    @AmericanAir … Been trying to book a whole new...
Name: text, dtype: object

In [244]:
df["airline_sentiment"].value_counts()

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [245]:
# class distribution in percentage
df["airline_sentiment"].value_counts(normalize=True)*100

airline_sentiment
negative    62.691257
neutral     21.168033
positive    16.140710
Name: proportion, dtype: float64

## Text Cleaning

In [246]:
def text_claner(text):
     #remove user mentions
     text=re.sub(r"@[A-Za-z0-9]+","",text)
      # remove hashtags
     text=re.sub(r'#[A-Za-z0-9]+','',text)
     #remove links
     text=re.sub(r'http\S+','',text)
       #convering text to lower case
     text=text.lower()  
     #fetch only words
     text=re.sub(r'[^a-z]+'," ",text)
     # removing extra spaces
     text=re.sub(r'[\s]+',' ',text)
     # creating doc object
     doc=nlp(text)
    # remove stopwords and lemmatize the text
     tokens=[token.lemma_ for token in doc if(token.is_stop==False)]

    #  tokens=[tokens.lemma_ for token in doc if(token.is_stop==False)]
      #join tokens by space
     return " ".join(tokens) 


In [247]:
# perform text cleaning
df["cleaned_text"]=df["text"].apply(text_claner)
df.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,cleaned_text
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),say
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),plus ve add commercial experience tacky


In [248]:
# save cleaned text and labels to a variable
text=df["cleaned_text"].values

labels=df["airline_sentiment"].values


In [249]:
text[:5]

array(['  say', '  plus ve add commercial experience tacky',
       '  didn t today mean need trip',
       '  s aggressive blast obnoxious entertainment guest face amp little recourse',
       '  s big bad thing'], dtype=object)

In [250]:
print(labels[:5])

['neutral' 'positive' 'neutral' 'negative' 'negative']


# Data Preparation

## Label Encoding

In [251]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
labels=le.fit_transform(labels)
labels[:10]

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 2])

In [252]:
import numpy as np
# Meaning of each label
le.inverse_transform(np.array([0,1,2])).tolist()

['negative', 'neutral', 'positive']

In [253]:
import sklearn
print(sklearn.__version__)

1.5.2


In [254]:
X=text
y=labels

## Split Data

In [255]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(X,y,stratify=y,test_size=0.2,random_state=0)

In [256]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(11712,) (11712,)
(2928,) (2928,)


## Feature Engineering using TF-IDF

In [257]:

from sklearn.feature_extraction.text import TfidfVectorizer
# initialize TFIDF
tfidf=TfidfVectorizer(max_features=1000)

In [258]:
# Fitting Vectorizer on Train set
word_vec=tfidf.fit(X_train)

In [259]:
print(word_vec)

TfidfVectorizer(max_features=1000)


In [260]:
print(word_vec.vocabulary_)

{'today': 892, 'bit': 95, 'bag': 79, 'issue': 469, 'clear': 155, 'thank': 879, 'check': 143, 'hour': 439, 'gate': 392, 'hold': 430, 'minute': 580, 'people': 651, 'flight': 350, 'go': 398, 'experience': 311, 'terrible': 876, 'awful': 76, 'customer': 215, 'service': 789, 'lose': 528, 'win': 973, 'free': 373, 'kid': 484, 'leave': 501, 'onboard': 623, 'ua': 919, 'ord': 629, 'sfo': 791, 'row': 758, 'help': 423, 'offer': 616, 'sleep': 809, 'cancel': 123, 'flightle': 355, 'fucking': 383, 'hotel': 438, 'like': 511, 'suppose': 858, 'provide': 702, 'love': 532, 'know': 487, 'policy': 677, 'luggage': 538, 'lounge': 531, 'attendant': 62, 'terminal': 875, 'give': 396, 'avail': 70, 'hi': 426, 'booking': 102, 'problem': 695, 'number': 613, 'sure': 859, 'call': 121, 'rebooke': 720, 'find': 342, 'taxi': 872, 'stand': 831, 'las': 495, 'vegas': 943, 'say': 773, 'time': 887, 'rep': 734, 'tell': 874, 'denver': 237, 'nope': 607, 'currently': 213, 'sit': 804, 'friend': 377, 'have': 417, 'flightlation': 354, 

In [261]:
# create TF-IDF vectors for Train Set
X_train=word_vec.transform(X_train)

In [262]:
list(X_train)[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7 stored elements and shape (1, 1000)>

In [263]:
# create TF-IDF vectors for test Set
X_test=word_vec.transform(X_test)

# Model Building

## Logistic Regression

In [264]:
from sklearn.linear_model import LogisticRegression
#training model
lr_model=LogisticRegression().fit(X_train,y_train)
lr_model.score(X_test,y_test)

0.7715163934426229

In [265]:
import pickle
with open("lr_model.pkl","wb") as f:
    pickle.dump(lr_model,f)

In [266]:
with open("lr_model.pkl","rb") as f:
    lr_model_downloaded=pickle.load(f)


In [267]:
from sklearn.metrics import f1_score
y_pred=lr_model_downloaded.predict(X_test)
print(y_pred)
print("f1_score",f1_score(y_test,y_pred,average="weighted"))

[0 0 2 ... 0 0 0]
f1_score 0.760655039079428


In [268]:
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(),X_test,y_test)

array([0.73720137, 0.74914676, 0.7116041 , 0.72136752, 0.74700855])

# Final Sentiment Analysis Pipeline

In [269]:
def sentiment_analyser(tweet):
    #cleaned tweet
    cleaned_tweet=text_claner(tweet)
    # print(cleaned_tweet)
     # Feature Engineering , tfidf
    tweet_vector=word_vec.transform([cleaned_tweet])
    # print(tweet_vector)
    #predicting sentiment
    label=lr_model_downloaded.predict(tweet_vector)
    # print(label)
    #showing predicted text
    return le.inverse_transform(np.array(label))[0]

<font size=4>**Sample Tweet:**</font>
<p>@USAirways flt 419. 2+ hrs Late Flight, baggage + 1 more hr. Now I see they delivered my suitcase wet inside &amp; out. #NotHappy</p>

In [270]:
sentiment_analyser("@USAirways flt 419. 2+ hrs Late Flight, baggage + 1 more hr. Now I see they delivered my suitcase wet inside &amp; out.")

'negative'