<a href="https://colab.research.google.com/github/Annamjohn/Airline-Sentiment-Analysis/blob/main/Sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#import the dependencies
import re #pattern matching, search through data

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

In [3]:
import nltk
# Fetch the NLTK stop-word corpus so stopwords.words('english') can be read
nltk.download('stopwords')
# Show the English stop words; stemming() drops these from every tweet
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Data Processing

In [4]:
# Read the labelled airline tweets from CSV into a pandas DataFrame,
# decoding the file as ISO-8859-1.
airline_data = pd.read_csv(
    '/content/Airline-Sentiment.csv',
    encoding='ISO-8859-1',
)

In [5]:
# (number of rows, number of columns) of the loaded dataset
airline_data.shape

(8422, 20)

In [6]:
# Preview the first rows to see which columns are available
airline_data.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


In [7]:
airline_data.isnull().sum() # number of missing (null) values in each column



_unit_id                           0
_golden                            0
_unit_state                        0
_trusted_judgments                 0
_last_judgment_at                 19
airline_sentiment                  0
airline_sentiment:confidence       0
negativereason                  3687
negativereason:confidence       2835
airline                            0
airline_sentiment_gold          8403
name                               0
negativereason_gold             8407
retweet_count                      0
text                               0
tweet_coord                     7775
tweet_created                      0
tweet_id                           1
tweet_location                  2607
user_timezone                   2656
dtype: int64

In [8]:
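# No imputation is needed: the 'text' column has no missing values (see the null counts above).
# If it had any, they could be filled before stemming with:
# airline_data['text'] = airline_data['text'].fillna('')
# airline_data.isnull().sum()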

In [9]:
# Class balance: number of rows for each sentiment label
airline_data['airline_sentiment'].value_counts()

airline_sentiment
negative    4735
neutral     2079
positive    1608
Name: count, dtype: int64

**Stemming**

In [10]:
# Single Porter stemmer instance, reused by stemming() for every word
port_stem=PorterStemmer()

In [11]:
_STOP_WORDS = None  # lazily-built cache of the English stop words (see _english_stop_words)


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def stemming(content):
    """Clean one tweet and reduce its words to Porter stems.

    Steps: replace every non-alphabetic character with a space, lower-case,
    split on whitespace, drop English stop words, stem each remaining word
    with ``port_stem`` and join the stems back together with single spaces.

    Args:
        content: Raw tweet text. ``None`` or NaN is treated as an empty tweet.

    Returns:
        str: Space-separated stems; ``''`` when no words are left.
    """
    # A missing value has no words. NaN is the only float that is unequal to itself.
    if content is None or (isinstance(content, float) and content != content):
        return ''

    # Look the stop words up once per call from the cache instead of re-reading
    # the corpus (and scanning a list) for every single word.
    stop_words = _english_stop_words()

    # '[^a-zA-Z]' matches anything that is not a letter, so digits, punctuation
    # and symbols all become word separators.
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()

    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

In [12]:
# Run stemming() on every tweet and store the result in a new column
airline_data['stemmed_content'] = airline_data['text'].apply(stemming)

In [13]:
# Keep only the model input (stemmed text) and the target label.
# Gotcha: 'text' is dropped here, so the cell that builds 'stemmed_content'
# from airline_data['text'] cannot be re-run after this one without reloading.
airline_data = airline_data.loc[:, ['stemmed_content', 'airline_sentiment']]
airline_data.head()

Unnamed: 0,stemmed_content,airline_sentiment
0,virginamerica dhepburn said,neutral
1,virginamerica plu ad commerci experi tacki,positive
2,virginamerica today must mean need take anoth ...,neutral
3,virginamerica realli aggress blast obnoxi ente...,negative
4,virginamerica realli big bad thing,negative


In [14]:
# Inspect the cleaned, stemmed tweets (the model's input text)
print(airline_data['stemmed_content'])

0                             virginamerica dhepburn said
1              virginamerica plu ad commerci experi tacki
2       virginamerica today must mean need take anoth ...
3       virginamerica realli aggress blast obnoxi ente...
4                      virginamerica realli big bad thing
                              ...                        
8417    jetblu ye minut spare fyi employe amaz keep go...
8418    jetblu rqstd upgrad mint lax told use point tu...
8419                             jetblu go get wifi plane
8420    jetblu oh yeah great flight mexico wonder crew...
8421                      jetblu get help hotel book issu
Name: stemmed_content, Length: 8422, dtype: object


In [15]:
# Inspect the sentiment labels (the prediction target)
print(airline_data['airline_sentiment'])

0        neutral
1       positive
2        neutral
3       negative
4       negative
          ...   
8417    positive
8418    negative
8419     neutral
8420    positive
8421     neutral
Name: airline_sentiment, Length: 8422, dtype: object


In [16]:
# Separate the features (stemmed tweets, X) from the labels (sentiment, Y),
# each as a plain array of the column's values.
X, Y = (
    airline_data[column].values
    for column in ('stemmed_content', 'airline_sentiment')
)

In [17]:
print(X) # the array is abbreviated: only the first 3 and last 3 tweets are shown

['virginamerica dhepburn said'
 'virginamerica plu ad commerci experi tacki'
 'virginamerica today must mean need take anoth trip' ...
 'jetblu go get wifi plane'
 'jetblu oh yeah great flight mexico wonder crew thank'
 'jetblu get help hotel book issu']


In [18]:
# Labels aligned position-by-position with X
print(Y)

['neutral' 'positive' 'neutral' ... 'neutral' 'positive' 'neutral']


In [19]:
# Split into train and test sets: 10% is held out for testing, the split is
# stratified on Y, and random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=42,
)

In [20]:
# Sizes of the full, training and test sets
print(X.shape, X_train.shape, X_test.shape)

(8422,) (7579,) (843,)


In [21]:
# Training tweets (still raw stemmed text at this point)
print(X_train)

['southwestair thank'
 'virginamerica done thank quick respons appar faster sit hold'
 'line may extra long secur slow seem employe quick blame custom arriv hour ahead jetblu'
 ...
 'unit beat storm town wait issu polici late flight nw issu storm warn alreadi'
 'southwestair repeat polici basic repeat fuck custom shop new airlin asap'
 'southwestair joke return flight delay row new record guy']


In [22]:
# Test tweets (still raw stemmed text at this point)
print(X_test)

['virginamerica wtf happen pdx late flight march one way sfo'
 'unit bummer thank quick respons'
 'unit unhappi new mileag rule main reason keep fli unit take busi elsewher'
 'unit ticket counter master smf help'
 'jetblu chang made son pass print conf mine print express disappt w jetblu'
 'jetblu dang pandora look like may get push back like'
 'southwestair lost luggag birthday wish find luggag'
 'unit one time thing either shock pattern repeat neglect disrespect'
 'unit followback dm plz' 'unit wifi flight ua sf ord realli wow'
 'unit dmed detail'
 'unit compens delay caus fact enough flight attend http co apr qan'
 'virginamerica free drink flight back free promo'
 'southwestair today go day feel thank support throughout come home nashvil octavia'
 'unit miss incom time sat jetbridg driver miss min connect time flight'
 'southwestair link site display delay cancel flightat flight dallaslovefield'
 'jetblu thank person custom servic cannedtweet autorespons'
 'jetblu opal dragon book 

In [23]:
#Convert textual data to numerical data
# TF-IDF weights each term by how often it occurs in a tweet, scaled down for
# terms that occur in many tweets. It is computed from the text alone; the
# sentiment labels are never passed to the vectorizer.
vectorizer=TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training tweets only, then encode them.
X_train=vectorizer.fit_transform(X_train)
# Encode the test tweets with the vocabulary/IDF learned from the training set.
# Fitting on the test split as well would leak test-set information into the features.
# NOTE: X_train/X_test are rebound from text to TF-IDF matrices here, so re-run
# the train/test split cell before re-running this one.
X_test=vectorizer.transform(X_test)

In [24]:
# Sparse TF-IDF training matrix: each line is (tweet row, vocabulary index) and its weight
print(X_train)

  (0, 6571)	0.8123629457954327
  (0, 6186)	0.5831521622171755
  (1, 3001)	0.2739772530562413
  (1, 6037)	0.3059161686933023
  (1, 2238)	0.46358509155175537
  (1, 307)	0.407358960746806
  (1, 5591)	0.31467620589432976
  (1, 5372)	0.38572511184708597
  (1, 1815)	0.3406619914464617
  (1, 7178)	0.2264225418601582
  (1, 6571)	0.1880865922542553
  (2, 3459)	0.11454228816682106
  (2, 112)	0.33596692602115363
  (2, 3061)	0.1669219105388091
  (2, 349)	0.23414964722711681
  (2, 1477)	0.17046974197565853
  (2, 685)	0.3077071141283139
  (2, 2011)	0.25707858920737714
  (2, 5892)	0.2556349051463472
  (2, 6087)	0.3126042030041337
  (2, 5888)	0.30544875146668016
  (2, 3937)	0.23638507244666532
  (2, 2176)	0.2795694338507827
  (2, 4121)	0.26599541238290725
  (2, 3865)	0.24496622625233752
  :	:
  (7576, 5132)	0.23614747765871866
  (7576, 3379)	0.38161210667373113
  (7576, 7241)	0.15746027118654687
  (7576, 190)	0.20752410998154083
  (7576, 2360)	0.08951374641020052
  (7576, 6981)	0.06788614091983172
  (

In [25]:
# Sparse TF-IDF test matrix: each line is (tweet row, vocabulary index) and its weight
print(X_test)

  (0, 7489)	0.39714770541776206
  (0, 7282)	0.2773968128501087
  (0, 7178)	0.2221375770433396
  (0, 5935)	0.31443345335564127
  (0, 4980)	0.43319290120586273
  (0, 4773)	0.2514665636110723
  (0, 4086)	0.40786662283177183
  (0, 3750)	0.2742854372096448
  (0, 2861)	0.32491289476254454
  (0, 2360)	0.13807104707431433
  (1, 6981)	0.14479002380502665
  (1, 6571)	0.2551554451052444
  (1, 5591)	0.4268850129968832
  (1, 5372)	0.5232688913230429
  (1, 868)	0.6766759940266011
  (2, 6981)	0.16698699765535444
  (2, 6974)	0.34193998651286983
  (2, 6484)	0.22068046970377275
  (2, 5737)	0.34541403990686026
  (2, 5460)	0.28133724728606524
  (2, 4513)	0.2333338338588893
  (2, 4235)	0.3229324844313664
  (2, 4051)	0.3744335259415862
  (2, 3598)	0.2602326039751337
  (2, 2357)	0.18455113456468009
  :	:
  (840, 6186)	0.08036107824603037
  (840, 5888)	0.24744115304930542
  (840, 5696)	0.2285152314538015
  (840, 5612)	0.2903832596035264
  (840, 3109)	0.2532377168607934
  (840, 2952)	0.30484498649124264
  (840

In [30]:
# Train ML model - one-vs-rest Logistic Regression
# One binary logistic-regression classifier is fitted per sentiment class.
# OneVsRestClassifier replaces LogisticRegression(multi_class='ovr'): the
# multi_class argument is deprecated in scikit-learn >= 1.5 and this wrapper is
# the documented replacement. max_iter is raised from the default of 100 to 1000.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

In [31]:
# Fit the classifier on the TF-IDF training features and their sentiment labels
model.fit(X_train,Y_train)

In [32]:
# Model evaluation on the training data: predict the tweets the model was
# fitted on and measure the fraction of labels it reproduces.
X_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_pred)

In [33]:
# Fraction of training tweets classified correctly
print('Accuracy of training data:' ,training_data_accuracy)

Accuracy of training data: 0.8307164533579628


In [34]:
# Model evaluation on the test data: predict the held-out tweets and measure
# the fraction of labels predicted correctly.
X_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_pred)

In [None]:
# Fraction of held-out test tweets classified correctly
print('Accuracy of testing data:' ,test_data_accuracy)


Accuracy of testing data: 0.7984972677595629


# Save the Trained Model

In [36]:
import pickle

In [37]:
# Persist the fitted classifier to disk.
# NOTE: only `model` is saved. The fitted `vectorizer` is not, yet it is needed
# to turn new raw text into the features this model was trained on.
filename = 'trained_model.sav'
# 'wb' = write in binary mode; the with-block closes the file even if dump() fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [38]:
#Using saved model for future predictions
# Load from the same path the model was written to (`filename`) instead of a
# hard-coded '/content/...' path, which only matches when the working directory
# is /content. The with-block closes the file handle after loading.
# Only unpickle files you created yourself: pickle can run arbitrary code on load.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [46]:
# Sanity-check the reloaded model on one held-out tweet (row 8 of the test set)
X_new=X_test[8]
print(Y_test[8])  # true label for that tweet

prediction= loaded_model.predict(X_new)
print(prediction)  # label predicted by the reloaded model



neutral
['neutral']
