## **AIRLINE SENTIMENT ANALYSIS**

##### MODEL BUILDING USING LOGISTIC REGRESSION

In [79]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [81]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avantika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### IMPORT THE DATASET

In [84]:
# Importing the data as pd dataframe
tweets = pd.read_csv('../../Airline-Sentiment_data.csv' , encoding= 'ISO-8859-1' )
tweets.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


In [86]:
tweets.shape

(14640, 20)

#### DATA CLEANING

In [89]:
tweets.isnull().sum()

_unit_id                            0
_golden                             0
_unit_state                         0
_trusted_judgments                  0
_last_judgment_at                  56
airline_sentiment                   0
airline_sentiment:confidence        0
negativereason                   5462
negativereason:confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_id                            0
tweet_location                   4733
user_timezone                    4820
dtype: int64

In [91]:
tweets.drop(['_golden','_unit_state','_trusted_judgments','negativereason','_last_judgment_at','airline_sentiment:confidence','negativereason:confidence','airline_sentiment_gold','negativereason_gold','retweet_count','tweet_coord','tweet_created','tweet_location','user_timezone'] , axis =1 , inplace = True)
tweets.rename(columns={'airline_sentiment':'Target'} , inplace = True)
tweets.head()

Unnamed: 0,_unit_id,Target,airline,name,text,tweet_id
0,681448150,neutral,Virgin America,cairdin,@VirginAmerica What @dhepburn said.,5.70306e+17
1,681448153,positive,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...,5.70301e+17
2,681448156,neutral,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,5.70301e+17
3,681448158,negative,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...,5.70301e+17
4,681448159,negative,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...,5.70301e+17


In [93]:
tweets.isnull().sum()

_unit_id    0
Target      0
airline     0
name        0
text        0
tweet_id    0
dtype: int64

In [95]:
tweets['Target'].value_counts()

Target
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [97]:
tweets.replace({'Target':{'neutral':1 , 'positive':1 , 'negative': 0}} , inplace=True)
tweets.Target.value_counts()

  tweets.replace({'Target':{'neutral':1 , 'positive':1 , 'negative': 0}} , inplace=True)


Target
0    9178
1    5462
Name: count, dtype: int64

#### APPLYING STEMMING TO THE DATASET

In [100]:
port_stem = PorterStemmer()

In [102]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [104]:
tweets['Stemmed_tweet'] = tweets['text'].apply(stemming)

In [106]:
print(tweets['Stemmed_tweet'])

0                              virginamerica dhepburn said
1               virginamerica plu ad commerci experi tacki
2        virginamerica today must mean need take anoth ...
3        virginamerica realli aggress blast obnoxi ente...
4                       virginamerica realli big bad thing
                               ...                        
14635          americanair thank got differ flight chicago
14636    americanair leav minut late flight warn commun...
14637    americanair pleas bring american airlin blackb...
14638    americanair money chang flight answer phone su...
14639    americanair ppl need know mani seat next fligh...
Name: Stemmed_tweet, Length: 14640, dtype: object


In [108]:
print(tweets['Target'])

0        1
1        1
2        1
3        0
4        0
        ..
14635    1
14636    0
14637    1
14638    0
14639    1
Name: Target, Length: 14640, dtype: int64


In [110]:
x = tweets['Stemmed_tweet'].values
y = tweets['Target'].values

In [112]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.3 , stratify = y , random_state = 42)
print(x.shape,x_train.shape,x_test.shape)

(14640,) (10248,) (4392,)


In [114]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

#### MAKING PREDICTION USING LOGISTIC REGRESSION

In [117]:
model = LogisticRegression()
model.fit(x_train,y_train)

In [119]:
y_pred = model.predict(x_test)

#### ACCURACY OF THE MODEL


In [122]:
x_test_accuracy = accuracy_score(y_pred,y_test)
print('Accuracy of the model is {}'.format(x_test_accuracy))

Accuracy of the model is 0.8271857923497268


In [124]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test , y_pred)

array([[2480,  273],
       [ 486, 1153]], dtype=int64)

In [126]:
from sklearn.metrics import classification_report
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      2753
           1       0.81      0.70      0.75      1639

    accuracy                           0.83      4392
   macro avg       0.82      0.80      0.81      4392
weighted avg       0.83      0.83      0.82      4392



In [128]:
import pickle
with open('logistic_model','wb') as f:
  pickle.dump(model,f)