In [2]:
import pandas as pd
import numpy as np
import csv

# 1. Web Scraping

In [3]:
# blabla

# 2. Word Analysis

In [80]:
review = pd.read_csv("Airlines.csv")
review.head(5)

Unnamed: 0,Airline,Review
0,Delta_Airline,No guarantee of seating if booked on Expedia. ...
1,Delta_Airline,3 days to get to London. Delta are still way b...
2,Delta_Airline,Our trip was from Phoenix to Reno with transit...
3,Delta_Airline,"OUR FLIGHT FROM ST. LOUIS, MO TO ATLANTA, GA. ..."
4,Delta_Airline,My flight to JFK scheduled 2013 was canceled. ...


We scraped reviews from pages that contain discussions for certain airlines. Intuitively, we consider that every review mentions the respective airline name once. Therefore, we attach the respective airline name to each review to make sure we count 1 mentioning of airline, even though the specific review did not mention the airline name.

In [5]:
# attach the airline name to reviews scraped from the airline forum page
data = []
review1 = pd.DataFrame(data)

appended = []
for i in range(len(review)):
    appended.append(review.Airline[i]+" "+review.Review[i])
    
review_concat = pd.Series(appended)
review1.insert(loc = 0,column='review_concat', value=review_concat)
review1.head(5)  # successfully attached

Unnamed: 0,review_concat
0,Delta_Airline No guarantee of seating if booke...
1,Delta_Airline 3 days to get to London. Delta a...
2,Delta_Airline Our trip was from Phoenix to Ren...
3,"Delta_Airline OUR FLIGHT FROM ST. LOUIS, MO TO..."
4,Delta_Airline My flight to JFK scheduled 2013 ...


## 2.1 Tokenization and replacement

In [6]:
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [7]:
# users' mentioning of delta can be considered as mentioning of delta airline.
# import a replacement reference created by us to catch those "short names"
replace = pd.read_csv("airline_replace.csv")
replace

Unnamed: 0,detect,replacement
0,delta,delta_airline
1,united,united_airline
2,spirit,spirit_airline
3,frontier,frontier_airline
4,emirates,emirates_airline
5,southwest,southwest_airline


In [8]:
# show all airline full names
airline_names = review.Airline.unique()
names_lower = []
for i in range(len(airline_names)):
    names_lower.append(airline_names[i].lower())
    
names_lower

['delta_airline',
 'american_airline',
 'united_airline',
 'spirit_airline',
 'air_canada',
 'ryanair',
 'frontier_airline',
 'emirates_airline',
 'southwest_airline',
 'british_airway']

### 2.1.1 replace short names with full names (with underscore but not space)

In [9]:
# tokenize each word/symbols
# replace tokens according to the replacement tabel, except for 'American_airline', 'Air_canada', and 'british_airway',
# for the mentionings of the three airlines may have other meanings
tokenized_replace1 = []
for i in review1["review_concat"]:
    text = word_tokenize(i)
    text1 = []
    for i in range(len(text)):
        text1.append(text[i].lower())
    for j in range(len(text1)):
        if any(text1[j]==replace['detect']):
            airline_name = replace.loc[replace.detect==text1[j],'replacement'].iloc[0]  # replace
            text1[j] = airline_name
    tokenized_replace1.append(text1)

### 2.1.2 replace blurry airline names (Air Canada, American Airlines, and British Airways) with full names  (with underscore but not space)

Since we cannot simply equate the mentioning of "Canada" to the mentioning of "Air Canada", we need to first detect instances of "air" in tokens and check if the next token is "canada". If so, we confirm that the user mentioned "Air Canada" once and replace the mentioning to "air_canada" for later calculation. <br>
Mentionings of "American Airlines" and "British Airways" are processed with the same method due to the same reason. 

In [10]:
tokenized_replace2 = []
for i in range(len(tokenized_replace1)):
    text1 = tokenized_replace1[i]
    for j in range(len(text1)):
        if text1[j].lower()=="air":
            if j < len(text1)-1 and text1[j+1]=="canada":
                text1[j] = "air_canada"
        if text1[j].lower()=="british":
            if j < len(text1)-1 and text1[j+1]=="airways":
                text1[j] = "british_airway"
            elif j < len(text1)-1 and text1[j+1]=="airway":
                text1[j] = "british_airway"
        if text1[j].lower()=="american":
            if j < len(text1)-1 and text1[j+1]=="airlines":
                text1[j] = "american_airline"
            elif j < len(text1)-1 and text1[j+1]=="airline":
                text1[j] = "american_airline" 
    tokenized_replace2.append(text1)

### 2.1.3 Remove duplicate mentions in one review

In [11]:
def remove_duplicate(mylist):  # define a function to help drop replicated brand names
    mylist = list(dict.fromkeys(mylist))
    return mylist

In [12]:
# create a lowercased airline name list for matching
data1 = []
airline_names_df = pd.DataFrame(data1)    
an_series = pd.Series(names_lower)
airline_names_df.insert(loc = 0,column='airline_names', value=an_series)
airline_names_df 

Unnamed: 0,airline_names
0,delta_airline
1,american_airline
2,united_airline
3,spirit_airline
4,air_canada
5,ryanair
6,frontier_airline
7,emirates_airline
8,southwest_airline
9,british_airway


In [13]:
# make sure to have only one airline mentionings in one post
airline_mentioned = []
for sent in tokenized_replace2:
    airline_post=[]
    for m in range(len(sent)):
        if any(sent[m]==airline_names_df["airline_names"]):
            airline_post.append(sent[m])
    if airline_post != []: # only append non-empty lists
        airline_post = remove_duplicate(airline_post)
        airline_mentioned.append(airline_post)

## 2.2 Count airline name mentionings

In [14]:
# count airline mentionings into a dictionary
airline_count_dict = {}
for i in names_lower:
    count = 0
    for j in airline_mentioned:
        if i in j:
            count +=1
    airline_count_dict[i] = count
    
airline_count_df = pd.DataFrame(list(airline_count_dict.items()),columns = ['Airline','Count'])
airline_count_df

Unnamed: 0,Airline,Count
0,delta_airline,2234
1,american_airline,2155
2,united_airline,2245
3,spirit_airline,2053
4,air_canada,2023
5,ryanair,2086
6,frontier_airline,2022
7,emirates_airline,2085
8,southwest_airline,2173
9,british_airway,2035


## 2.3 Word frequency

In [15]:
from __future__ import division
import nltk, re
nltk.download('averaged_perceptron_tagger')
from nltk import FreqDist
from nltk import word_tokenize 
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import brown 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\timwy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
# tag tokenized original reviews with part-of-speech tags

total=[]
for i in review["Review"]:
    text = word_tokenize(i)
    total.append(text)
    
tagged = nltk.pos_tag_sents(total)

In [17]:
# count the most frequently mentioned nouns, adjectives, and adverbs
# record the words of selected part-of-speech type

def containsNumber(value):  # remove words that contain numbers 
    for character in value:
        if character.isdigit():
            return True
    return False

words_recorder = []

for i in range(len(tagged)): 
    tag_select = tagged[i]
    for j in range(len(tag_select)):
        label = tag_select[j][1]
        word = tag_select[j][0]
        if label in ['NN', 'JJ', 'JJR', 'JJS', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS']:
            if containsNumber(word) == False:
                words_recorder.append(word.lower())
        
words_recorder

['guarantee',
 'expedia',
 'daughter',
 'cousins',
 'grandmother',
 'away',
 'not',
 'even',
 'were',
 'less',
 'nice',
 'delta',
 'only',
 'lot',
 'years',
 'airline',
 'days',
 'london',
 'delta',
 'still',
 'way',
 'bar',
 'service',
 'ceo',
 'so',
 'many',
 'aspects',
 'business',
 'however',
 'customer',
 'care',
 'worst',
 'industry',
 'second',
 'day',
 'jfk',
 'atc',
 'delta',
 "n't",
 'comp',
 'help',
 'line',
 'once',
 'again',
 'times',
 'remember',
 'atlanta',
 'few',
 'months',
 'ago',
 'once',
 'phone',
 'poorly',
 'trained',
 'telephonists',
 'idea',
 'stress',
 'indeed',
 'upset',
 'one',
 'trip',
 'phoenix',
 'transit',
 'salt',
 'lake',
 'city',
 'originally',
 'fly',
 'slc',
 'next',
 'flight',
 'delta',
 'flight',
 'slc',
 'arrived',
 'due',
 'technical',
 'issues',
 'not',
 'flight',
 'schedule',
 'next',
 'flight',
 'hours',
 'our',
 'flight',
 'from',
 'st.',
 'louis',
 'mo',
 'to',
 'atlanta',
 'ga.',
 'to',
 'celebrate',
 'our',
 'anniversary',
 'husband',
 'fl

In [18]:
# the 500 most frequently mentioned words
from collections import Counter
c = Counter(words_recorder)
c_500 = c.most_common(500)

common_words = []
for i in c_500:
    word = i[0]
    common_words.append(word)
    
common_words_df = pd.DataFrame(common_words,columns = ['Word'])
common_words_df.head(10)

Unnamed: 0,Word
0,flight
1,not
2,n't
3,time
4,service
5,very
6,airline
7,airlines
8,so
9,plane


In [19]:
# import os    
# common_words_df.to_csv('freq.csv')  

In [37]:
attri = pd.read_csv("freq.csv")
attri

Unnamed: 0,Category,Attribute
0,comfort,seat
1,comfort,seats
2,comfort,leg
3,comfort,comfortable
4,comfort,room
...,...,...
171,food_beverage,sweets
172,food_beverage,fruits
173,food_beverage,cake
174,food_beverage,cakes


In [40]:
# tokenize each word/symbols, and at the same time replace any detected car model name with brand name
tokenized_attr = []
for i in review["Review"]:
    text = word_tokenize(i)
    for j in range(len(text)):
        if any(text[j]==attri['Attribute']):
            category = attri.loc[attri.Attribute==text[j],'Category'].iloc[0]
            text[j] = category
    tokenized_attr.append(text)

In [41]:
# append lowercased tokens to a list
attr_mentioned = []
for sent in tokenized_attr:
  attr_post=[]
  for m in range(len(sent)):
    if any(sent[m].lower()==attri['Category']):
      attr_post.append(sent[m].lower())
  if attr_post != []:# only append non-empty lists
    attr_post = remove_duplicate(attr_post)
    attr_mentioned.append(attr_post)
          
attr_mentioned

[['comfort'],
 ['check-in_boarding', 'customer_service', 'monetary_values'],
 ['check-in_boarding'],
 ['check-in_boarding', 'monetary_values'],
 ['comfort', 'check-in_boarding', 'monetary_values'],
 ['cleanliness', 'customer_service', 'check-in_boarding', 'comfort'],
 ['customer_service'],
 ['check-in_boarding', 'customer_service'],
 ['check-in_boarding', 'monetary_values', 'customer_service'],
 ['monetary_values', 'check-in_boarding'],
 ['check-in_boarding',
  'food_beverage',
  'in-flight_entertainment',
  'comfort',
  'monetary_values'],
 ['monetary_values', 'comfort'],
 ['check-in_boarding', 'customer_service'],
 ['customer_service', 'monetary_values', 'check-in_boarding'],
 ['customer_service', 'monetary_values', 'comfort', 'check-in_boarding'],
 ['customer_service', 'check-in_boarding', 'monetary_values'],
 ['customer_service'],
 ['monetary_values', 'customer_service'],
 ['monetary_values', 'check-in_boarding', 'customer_service'],
 ['check-in_boarding', 'customer_service', 'mone

In [42]:
# count attribute mentionings
attrs =  attri['Category'].unique()
attr_count_dict = {}
for i in attrs:
    count = 0
    for j in attr_mentioned:
        if i in j:
            count +=1
    attr_count_dict[i] = count

In [43]:
# sort the counts, display the popularity of attributes
attr_count_df = pd.DataFrame(list(attr_count_dict.items()),columns = ['Attribute Catgory','Count'])
attr_count_df.sort_values(by=["Count"], ascending = False, ignore_index = True) 

Unnamed: 0,Attribute Catgory,Count
0,check-in_boarding,16956
1,customer_service,14411
2,monetary_values,11603
3,comfort,7587
4,food_beverage,4664
5,in-flight_entertainment,1883
6,cleanliness,1042


Get mentionings of brands and attributes without dropping empty lists
We hope to one-on-one match brand mentionings and attribute mentionings of each post, then drop the posts mentioned neither brand nor attribute. So, we first keep the empty lists to make sure the rows(indexes) of brand and attribute mentionings match with each other.

In [45]:
air_mentioned_with_empty = []
for sent in tokenized_replace2:
    air_post = []
    for m in range(len(sent)):
        if any(sent[m].lower()==airline_names_df["airline_names"]):
            air_post.append(sent[m].lower())
        air_post = remove_duplicate(air_post)
    air_mentioned_with_empty.append(air_post)  # append both empty and non-empty lists

air_mentioned_with_empty

[['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline', 'american_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],
 ['delta_airline'],


In [46]:
attr_mentioned_with_empty = []
for sent in tokenized_attr:
    attr_post = []
    for m in range(len(sent)):
        if any(sent[m].lower()==attri['Category']):
            attr_post.append(sent[m].lower())
        attr_post = remove_duplicate(attr_post)
    attr_mentioned_with_empty.append(attr_post)  # append both empty and non-empty lists

attr_mentioned_with_empty

[['comfort'],
 ['check-in_boarding', 'customer_service', 'monetary_values'],
 ['check-in_boarding'],
 ['check-in_boarding', 'monetary_values'],
 ['comfort', 'check-in_boarding', 'monetary_values'],
 ['cleanliness', 'customer_service', 'check-in_boarding', 'comfort'],
 ['customer_service'],
 ['check-in_boarding', 'customer_service'],
 ['check-in_boarding', 'monetary_values', 'customer_service'],
 ['monetary_values', 'check-in_boarding'],
 ['check-in_boarding',
  'food_beverage',
  'in-flight_entertainment',
  'comfort',
  'monetary_values'],
 ['monetary_values', 'comfort'],
 ['check-in_boarding', 'customer_service'],
 ['customer_service', 'monetary_values', 'check-in_boarding'],
 ['customer_service', 'monetary_values', 'comfort', 'check-in_boarding'],
 ['customer_service', 'check-in_boarding', 'monetary_values'],
 ['customer_service'],
 ['monetary_values', 'customer_service'],
 ['monetary_values', 'check-in_boarding', 'customer_service'],
 ['check-in_boarding', 'customer_service', 'mone

In [47]:
len(air_mentioned_with_empty) == len(attr_mentioned_with_empty)

True

In [94]:
#putting brand names and attributes together
data = []
df_brand_attri = pd.DataFrame(data)

In [95]:
brand_mentioned = pd.Series(air_mentioned_with_empty)
attri_mentioned = pd.Series(attr_mentioned_with_empty)
df_brand_attri.insert(loc = 0,column='Airline_Mentioned', value=brand_mentioned)
df_brand_attri.insert(loc = 1, column='Attribute_Mentioned', value=attri_mentioned)
df_brand_attri_null_copy = df_brand_attri.copy()

In [96]:
import numpy as np
df_brand_attri.Airline_Mentioned = df_brand_attri.Airline_Mentioned.apply(lambda Airline_Mentioned: np.nan if len(Airline_Mentioned)==0 
                                                                      else Airline_Mentioned)
df_brand_attri.Attribute_Mentioned = df_brand_attri.Attribute_Mentioned.apply(lambda Attribute_Mentioned: np.nan if len(Attribute_Mentioned)==0 
                                                                      else Attribute_Mentioned)
df_brand_attri

Unnamed: 0,Airline_Mentioned,Attribute_Mentioned
0,[delta_airline],[comfort]
1,[delta_airline],"[check-in_boarding, customer_service, monetary..."
2,[delta_airline],[check-in_boarding]
3,[delta_airline],"[check-in_boarding, monetary_values]"
4,[delta_airline],"[comfort, check-in_boarding, monetary_values]"
...,...,...
19995,[british_airway],"[check-in_boarding, customer_service, comfort,..."
19996,[british_airway],[customer_service]
19997,[british_airway],"[comfort, cleanliness, customer_service, food_..."
19998,[british_airway],[customer_service]


In [97]:
# exclude posts that mentioned neither attributs nor brand names
df_brand_attri_final = df_brand_attri[df_brand_attri['Airline_Mentioned'].notna()]
df_brand_attri_final = df_brand_attri[df_brand_attri['Attribute_Mentioned'].notna()]
df_brand_attri_final

Unnamed: 0,Airline_Mentioned,Attribute_Mentioned
0,[delta_airline],[comfort]
1,[delta_airline],"[check-in_boarding, customer_service, monetary..."
2,[delta_airline],[check-in_boarding]
3,[delta_airline],"[check-in_boarding, monetary_values]"
4,[delta_airline],"[comfort, check-in_boarding, monetary_values]"
...,...,...
19995,[british_airway],"[check-in_boarding, customer_service, comfort,..."
19996,[british_airway],[customer_service]
19997,[british_airway],"[comfort, cleanliness, customer_service, food_..."
19998,[british_airway],[customer_service]


# 3. Sentiment Analysis with VADER

In [99]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [100]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\timwy\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [101]:
review_sent = review

In [102]:
SIA = SentimentIntensityAnalyzer()

review_sent['scores'] = review_sent['Review'].apply(lambda Review: SIA.polarity_scores(Review))

review_sent.head(5)

Unnamed: 0,Airline,Review,scores,compound,sent_type,Airline_Mentioned,Attribute_Mentioned,delta_airline,american_airline,united_airline,spirit_airline,air_canada,ryanair,frontier_airline,emirates_airline,southwest_airline,british_airway
0,Delta_Airline,No guarantee of seating if booked on Expedia. ...,"{'neg': 0.036, 'neu': 0.889, 'pos': 0.074, 'co...",0.323,2,[delta_airline],[comfort],1,0,0,0,0,0,0,0,0,0
1,Delta_Airline,3 days to get to London. Delta are still way b...,"{'neg': 0.125, 'neu': 0.79, 'pos': 0.086, 'com...",-0.6641,0,[delta_airline],"[check-in_boarding, customer_service, monetary...",1,0,0,0,0,0,0,0,0,0
2,Delta_Airline,Our trip was from Phoenix to Reno with transit...,"{'neg': 0.058, 'neu': 0.942, 'pos': 0.0, 'comp...",-0.4767,0,[delta_airline],[check-in_boarding],1,0,0,0,0,0,0,0,0,0
3,Delta_Airline,"OUR FLIGHT FROM ST. LOUIS, MO TO ATLANTA, GA. ...","{'neg': 0.0, 'neu': 0.953, 'pos': 0.047, 'comp...",0.7804,2,[delta_airline],"[check-in_boarding, monetary_values]",1,0,0,0,0,0,0,0,0,0
4,Delta_Airline,My flight to JFK scheduled 2013 was canceled. ...,"{'neg': 0.062, 'neu': 0.894, 'pos': 0.044, 'co...",-0.5186,0,[delta_airline],"[comfort, check-in_boarding, monetary_values]",1,0,0,0,0,0,0,0,0,0


In [103]:
review_sent['compound']  = review_sent['scores'].apply(lambda score_dict: score_dict['compound'])
review_sent['sent_type'] = review_sent['compound'].apply(lambda c: 2 if c>0 else (1 if c==0 else 0))

# sent_type == 0: negative (compound < 0)
# sent_type == 1: neutral (compound = 0)
# sent_type == 2: positive (compound > 0)

review_sent.head(5)

Unnamed: 0,Airline,Review,scores,compound,sent_type,Airline_Mentioned,Attribute_Mentioned,delta_airline,american_airline,united_airline,spirit_airline,air_canada,ryanair,frontier_airline,emirates_airline,southwest_airline,british_airway
0,Delta_Airline,No guarantee of seating if booked on Expedia. ...,"{'neg': 0.036, 'neu': 0.889, 'pos': 0.074, 'co...",0.323,2,[delta_airline],[comfort],1,0,0,0,0,0,0,0,0,0
1,Delta_Airline,3 days to get to London. Delta are still way b...,"{'neg': 0.125, 'neu': 0.79, 'pos': 0.086, 'com...",-0.6641,0,[delta_airline],"[check-in_boarding, customer_service, monetary...",1,0,0,0,0,0,0,0,0,0
2,Delta_Airline,Our trip was from Phoenix to Reno with transit...,"{'neg': 0.058, 'neu': 0.942, 'pos': 0.0, 'comp...",-0.4767,0,[delta_airline],[check-in_boarding],1,0,0,0,0,0,0,0,0,0
3,Delta_Airline,"OUR FLIGHT FROM ST. LOUIS, MO TO ATLANTA, GA. ...","{'neg': 0.0, 'neu': 0.953, 'pos': 0.047, 'comp...",0.7804,2,[delta_airline],"[check-in_boarding, monetary_values]",1,0,0,0,0,0,0,0,0,0
4,Delta_Airline,My flight to JFK scheduled 2013 was canceled. ...,"{'neg': 0.062, 'neu': 0.894, 'pos': 0.044, 'co...",-0.5186,0,[delta_airline],"[comfort, check-in_boarding, monetary_values]",1,0,0,0,0,0,0,0,0,0


In [104]:
review_sent["Airline_Mentioned"] = df_brand_attri_null_copy["Airline_Mentioned"]
review_sent["Attribute_Mentioned"] = df_brand_attri_null_copy["Attribute_Mentioned"]

In [105]:
review_sent

Unnamed: 0,Airline,Review,scores,compound,sent_type,Airline_Mentioned,Attribute_Mentioned,delta_airline,american_airline,united_airline,spirit_airline,air_canada,ryanair,frontier_airline,emirates_airline,southwest_airline,british_airway
0,Delta_Airline,No guarantee of seating if booked on Expedia. ...,"{'neg': 0.036, 'neu': 0.889, 'pos': 0.074, 'co...",0.3230,2,[delta_airline],[comfort],1,0,0,0,0,0,0,0,0,0
1,Delta_Airline,3 days to get to London. Delta are still way b...,"{'neg': 0.125, 'neu': 0.79, 'pos': 0.086, 'com...",-0.6641,0,[delta_airline],"[check-in_boarding, customer_service, monetary...",1,0,0,0,0,0,0,0,0,0
2,Delta_Airline,Our trip was from Phoenix to Reno with transit...,"{'neg': 0.058, 'neu': 0.942, 'pos': 0.0, 'comp...",-0.4767,0,[delta_airline],[check-in_boarding],1,0,0,0,0,0,0,0,0,0
3,Delta_Airline,"OUR FLIGHT FROM ST. LOUIS, MO TO ATLANTA, GA. ...","{'neg': 0.0, 'neu': 0.953, 'pos': 0.047, 'comp...",0.7804,2,[delta_airline],"[check-in_boarding, monetary_values]",1,0,0,0,0,0,0,0,0,0
4,Delta_Airline,My flight to JFK scheduled 2013 was canceled. ...,"{'neg': 0.062, 'neu': 0.894, 'pos': 0.044, 'co...",-0.5186,0,[delta_airline],"[comfort, check-in_boarding, monetary_values]",1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,British_Airway,LGW-MCO-LGW 11-25th August Economy Class. Got ...,"{'neg': 0.103, 'neu': 0.837, 'pos': 0.06, 'com...",-0.5305,0,[british_airway],"[check-in_boarding, customer_service, comfort,...",0,0,0,0,0,0,0,0,0,1
19996,British_Airway,✅ Verified Review | Flew London Heathrow to C...,"{'neg': 0.108, 'neu': 0.873, 'pos': 0.019, 'co...",-0.7876,0,[british_airway],[customer_service],0,0,0,0,0,0,0,0,0,1
19997,British_Airway,LHR-SFO-LHR. Missed out on the A380 and both l...,"{'neg': 0.127, 'neu': 0.696, 'pos': 0.177, 'co...",0.8539,2,[british_airway],"[comfort, cleanliness, customer_service, food_...",0,0,0,0,0,0,0,0,0,1
19998,British_Airway,✅ Trip Verified | London to Calgary. Thank yo...,"{'neg': 0.0, 'neu': 0.732, 'pos': 0.268, 'comp...",0.8718,2,[british_airway],[customer_service],0,0,0,0,0,0,0,0,0,1


In [106]:
for j in airline_names_df["airline_names"]:
    temp_brand = []
    for i in review_sent["Airline_Mentioned"]:
        if j in i:
            temp_brand.append(1)
        else:
            temp_brand.append(0)
    temp_brand = pd.Series(temp_brand)
    review_sent[j] = temp_brand

top_attri = ["comfort","in-flight_entertainment","customer_service",
             "monetary_values","cleanliness","check-in_boarding","food_beverage"]
for j in top_attri:
    temp_attri = []
    for i in review_sent["Attribute_Mentioned"]:
        if j in i:
            temp_attri.append(1)
        else:
            temp_attri.append(0)
    temp_attri = pd.Series(temp_attri)
    review_sent[j] = temp_attri

In [110]:
review_sent=review_sent.drop(columns="Review")

In [None]:
review_sent(10)