## Exploratory Data Analysis

In [None]:
#!pip install plotly
#!pip install WordCloud

: 

In [None]:
# Import Pandas library for data manipulation
import pandas as pd

# Import NumPy library for numerical operations
import numpy as np

# Import OS library for interacting with the operating system
import os

# Import Matplotlib library for data visualization
import matplotlib.pyplot as plt

# Import Seaborn library for statistical data visualization
import seaborn as sns

# Import Plotly Express library for interactive plotting
import plotly.express as px

# Import datetime module for working with dates and times
import datetime as dt

# Import WordCloud and STOPWORDS from the wordcloud library for creating word clouds
from wordcloud import WordCloud, STOPWORDS


: 

In [None]:
# Resolve the data file relative to the directory the notebook runs from
cwd = os.getcwd()

# Load the reviews in one chain: the first CSV column is read as the index
# and then immediately discarded in favour of a fresh 0..n-1 index.
df = pd.read_csv(cwd + "/cleaned-BA-reviews.csv", index_col=0).reset_index(drop=True)


: 

In [None]:
# Preview the first five rows of the loaded reviews
df.head(n=5)

: 

#### What is the average overall rating given for British Airways?

In [None]:
# Mean of the `stars` column across all reviews
df["stars"].mean()

: 

#### What are the total counts for each rating?


In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt

# Bar chart of how many reviews carry each star rating. The Axes returned by
# pandas is labelled directly instead of going through the pyplot state machine.
rating_axes = df["stars"].value_counts().plot(kind="bar")
rating_axes.set_xlabel("Ratings")
rating_axes.set_ylabel("Total Number of reviews with that rating")
rating_axes.set_title("Counts for each rating")


: 

In [None]:
# Number of reviews per star rating (index = rating, value = count).
rating_counts = df['stars'].value_counts()

# pandas >= 2.0 names the value_counts() result "count" and names its index
# after the source column, while older versions name the result "stars" and
# leave the index unnamed. Pin the older layout explicitly so the 'stars'
# column lookup below (and any later reset_index()) works on every version.
df_ratings = rating_counts.rename_axis(None).to_frame(name='stars')

# Share of all reviews that each rating accounts for, in percent (2 decimals).
pct_values = (df_ratings['stars'].values / df_ratings['stars'].values.sum() * 100).tolist()
pct_values = [round(x, 2) for x in pct_values]
df_ratings['pct_values'] = pct_values



: 

In [None]:
# Import necessary libraries
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Concatenate every review into one space-separated string
reviews = " ".join(df["corpus"])

# Canvas for the word cloud
plt.figure(figsize=(20, 10))

# NLTK's English stopword list as a set. Note that this rebinds the name
# `stopwords` from the imported corpus reader to the resulting set.
stopwords = set(stopwords.words("english"))

# Build the word cloud from the combined text, ignoring the stopwords
wordcloud = WordCloud(
    height=600,
    width=600,
    max_font_size=100,
    max_words=500,
    stopwords=stopwords,
).generate(reviews)

# Render the cloud without axes
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


: 

Many words do not indicate whether a review is positive or negative. For example, words like "passenger", "flight", etc. add no conclusive value, so we can include them in the stopwords list.


In [None]:
import nltk
from nltk.corpus import stopwords

# Combine all reviews into a single string
reviews = " ".join(df.corpus)
plt.figure(figsize=(20, 10))

# Start from NLTK's English stopwords (this rebinds `stopwords` from the
# imported corpus reader to a set) and add domain words that appear in most
# reviews without signalling positive or negative sentiment.
stopwords = set(stopwords.words('english'))
stopwords.update([
    "ba", "flight", "british", "airway", "airline", "plane", "told", "also",
    # Each entry needs its own comma: adjacent string literals are silently
    # concatenated, which previously turned "passenger" and "london" into the
    # single useless stopword "passengerlondon".
    "passenger", "london",
    "heathrow", "aircraft", "could", "even", "would",
])

# Create and generate a word cloud image:
wordcloud = WordCloud(height=500, width=500, max_font_size=100, max_words=300, stopwords=stopwords).generate(reviews)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

: 

## Word Frequency

In [None]:
#!pip install scikit-learn

: 

In [None]:
# Import necessary libraries
from nltk import ngrams
from nltk.probability import FreqDist
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenise the combined review text on single spaces
words = reviews.split(" ")

# scikit-learn's English stop words plus domain terms that carry no sentiment.
# This rebinds `stopwords`, replacing whatever an earlier cell assigned to it.
stopwords = text.ENGLISH_STOP_WORDS.union([
    'flight', 'ba', "passenger", "u", "london", "airway",
    "british", "airline", "heathrow", "plane", "lhr", "review",
])

# Keep only the tokens that are not stop words
new_words = [token for token in words if token not in stopwords]

# The 20 most frequent remaining tokens as (word, count) pairs
nlp_words = FreqDist(new_words).most_common(20)

# Series indexed by word, holding each word's count
all_fdist = pd.Series({word: count for word, count in nlp_words})


: 

In [None]:
# One figure with a single Axes to draw the word counts on
fig, ax = plt.subplots(figsize=(15, 8))

# One bar per word in `all_fdist`, drawn onto the Axes created above
all_plot = sns.barplot(x=all_fdist.index, y=all_fdist.to_numpy(), ax=ax)

# Annotate every bar of the first bar container with its value
bars = all_plot.containers[0]
all_plot.bar_label(bars)

# Tilt the word labels so they do not overlap
plt.xticks(rotation=30)


: 

This gives us a glimpse of what customers are really talking about. We see that "seat" is the most talked-about aspect of the airline, followed by "service" and "food", all of which are very important to customers in terms of service. However, we still do not know how customers feel about each of these services. To give these terms some significance, we will use n-gram plots to see whether the experience was good or bad.

## Word Frequency with N-gram

In [None]:
# Imports
import nltk.collocations as collocations
from nltk import FreqDist, bigrams

# Rebuild the combined review text so this cell does not rely on the
# `reviews` string left behind by an earlier cell
reviews = " ".join(df["corpus"])

# Tokenise on single spaces
words = reviews.split(" ")

# Drop stop words. `stopwords` is whatever the most recently executed cell
# bound it to (it is reassigned in several cells of this notebook).
new_words = [token for token in words if token not in stopwords]

# Function to get frequency distribution of n-grams and plot
def get_freq_dist(new_words, number_of_ngrams, top_n=40):
    """Plot the most frequent n-grams of a token sequence as horizontal bars.

    Parameters
    ----------
    new_words : iterable of str
        Tokens, in order, from which consecutive n-grams are built.
    number_of_ngrams : int
        Size ``n`` of each n-gram (e.g. 2 for bigrams, 4 for 4-grams).
    top_n : int, optional
        How many of the most frequent n-grams to plot. Defaults to 40.

    Returns
    -------
    The Axes object returned by ``Series.plot(kind="barh")``; bars are
    labelled with the n-gram tokens joined by ``'_'``.
    """
    from nltk import ngrams

    # (ngram_tuple, count) pairs for the top_n most frequent n-grams
    top_ngrams = FreqDist(ngrams(new_words, number_of_ngrams)).most_common(top_n)

    # Order by ascending count so the most frequent n-gram is plotted last,
    # and join each token tuple with '_' to get a readable bar label.
    ngram_joined = {
        '_'.join(tokens): count
        for tokens, count in sorted(top_ngrams, key=lambda item: item[1])
    }

    # Convert to Pandas Series for easy plotting
    ngram_freqdist = pd.Series(ngram_joined)

    # Plotting
    plt.figure(figsize=(10, 10))
    ax = ngram_freqdist.plot(kind="barh")

    return ax

# Plot the most frequent 4-grams among the filtered tokens
get_freq_dist(new_words, number_of_ngrams=4)


: 

We can see that there are very common positive terms regarding the cabin crew, for example cabin_crew_friendly_helpful, cabin_crew_friendly_attentive, cabin_crew_friendly_efficient, etc. So customers are certainly giving good reviews of the British Airways cabin crew.

In [None]:
# Split the reviews into three star-rating bands: 1-3, 4-6 and 7-10.
# Rows whose `stars` value is not one of the listed integers land in no band.
ratings_1_3 = df.loc[df["stars"].isin([1, 2, 3])]
ratings_4_6 = df.loc[df["stars"].isin([4, 5, 6])]
ratings_7_10 = df.loc[df["stars"].isin([7, 8, 9, 10])]

: 