In [None]:
import requests
from bs4 import BeautifulSoup

# Scrape the review text from the first `max_pages` listing pages of the
# British Airways reviews on airlinequality.com and collect it in `reviews`.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways/"
max_pages = 100
request_timeout = 30  # seconds; without a timeout a stalled request blocks forever
reviews = []

for i in range(1, max_pages+1):
    # Build every page URL from base_url so the address lives in one place.
    page_url = f"{base_url}page/{i}/"
    try:
        response = requests.get(page_url, timeout=request_timeout)
    except requests.RequestException as error:
        # Best effort: report the page that failed and carry on with the next one.
        print(f"Skipping page {i}: {error}")
        continue
    if not response:
        # A Response is falsy for 4xx/5xx status codes; such pages are skipped.
        continue
    parsed_content = BeautifulSoup(response.content, "html.parser")
    articles = parsed_content.find_all("article", {"class": "comp_media-review-rated"})
    for article in articles:
        para = article.find("div", {"class": "text_content"})
        if para is None:
            # find() returns None when the article has no text_content div.
            continue
        # Keep only what follows the first "|" when one is present;
        # otherwise keep the whole text.
        _, separator, remainder = para.text.partition("|")
        reviews.append(remainder if separator else para.text)

In [None]:
import pandas as pd
# Start from an empty frame, attach the scraped review strings as the
# "reviews" column, then preview the first rows.
dataset = pd.DataFrame().assign(reviews=reviews)
dataset.head()

Machine Learning Model for Sentiment Analysis

In [None]:
# The Sentiment140 training file ships without a header row, so it is read
# with header=None; otherwise the first tweet is consumed as the column names.
# The names follow the dataset's published field order.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding="ISO-8859-1",
    header=None,
    names=["target", "ids", "date", "flag", "user", "text"],
)
df.head()

In [None]:
!pip install nltk

In [None]:
import nltk
# Fetch the NLTK stop-word corpus that the text-cleaning cells read from.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# Display the English stop-word list.
stopwords.words("english")

In [None]:
# Name the first and last columns before touching the labels: DataFrame.replace
# silently ignores a column key that does not exist, so recoding "target"
# ahead of the rename left every 4 in place on a fresh run.
df = df.rename(columns={df.columns[0]: "target", df.columns[-1]: "text"})
# Recode the label 4 as 1.
df["target"] = df["target"].replace({4: 1})
df.head()

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
port_stem = PorterStemmer()
# Built once here: rebuilding the stop-word set (and re-parsing the pattern)
# for every row is repeated work that never changes the result.
_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')

def stemming(row):
    """Return the cleaned, stemmed form of ``row['text']`` as one string.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and the remaining words are Porter-stemmed and joined with
    single spaces.

    Parameters
    ----------
    row : a DataFrame row; must provide ``row['text']`` (str) and ``row.name``.

    Returns
    -------
    str
        The space-joined stemmed words.

    Side effects
    ------------
    Prints a progress line whenever ``row.name`` is a multiple of 100000.
    """
    words = _NON_LETTERS.sub(' ', row['text']).lower().split()
    content = ' '.join(
        port_stem.stem(word) for word in words if word not in _STOP_WORDS
    )
    if row.name%100000 == 0:
        print(f"cleaned {row.name}")
    return content

In [None]:
# Run stemming() on every row (axis=1) and store the result in a new column.
df['stemmed_text'] = df.apply(stemming, axis=1)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Rename the first column to "target".
# NOTE(review): an earlier cell already performs this same rename, so on a
# top-to-bottom run this looks like a no-op — confirm before removing.
df.rename(columns={df.columns[0]:"target"},inplace=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the stemmed text into a TF-IDF feature matrix X and take the labels as y.
# NOTE(review): the vectorizer is fitted on every row of df, and the
# train/test split only happens in a later cell, so the fitted vocabulary and
# IDF weights also see the rows that end up in the test set.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['stemmed_text'])
y = df['target']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify=y keeps the label proportions
# the same in both parts, and random_state=2 makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, stratify = y,random_state=2)

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

In [None]:
# Dimensions of the test feature matrix.
X_test.shape

In [None]:
# Show which container type the vectorizer output has after splitting.
type(X_train)

In [None]:
# Display the label column.
y

In [None]:
# Display the repr of the training feature matrix.
X_train

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split;
# max_iter=1000 caps the solver at 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the rows the model was fitted on.
X_train_pred = model.predict(X_train)
trainning_accuracy = accuracy_score(y_train,X_train_pred)
print(f"Training accuracy: {trainning_accuracy:.4f}")

# Accuracy on the held-out rows; labelled so the two numbers cannot be confused.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f"Test accuracy: {accuracy:.4f}")

In [None]:
# Model accuracy: 77.7%

In [None]:
import pickle
# Persist the fitted classifier and vectorizer. The with-blocks close each
# file as soon as the dump finishes, so the data is flushed to disk and no
# handle is left open.
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

BRITISH AIRWAYS REVIEW SENTIMENT ANALYSIS

In [None]:
# Preview the first rows of the scraped reviews frame.
dataset.head()

In [None]:
# List the column names of the reviews frame.
dataset.columns

In [None]:
# Reload the pickled classifier and vectorizer. The with-blocks close the
# files after reading. pickle.load can execute arbitrary code, so only load
# files from a source you trust.
with open("sentiment_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorize_model = pickle.load(vectorizer_file)
def get_prediction(row):
    """Predict the sentiment label for the review in ``row['reviews']``.

    The TF-IDF vectorizer was fitted on the output of ``stemming``, so the
    review is passed through the same cleaning before it is vectorized.
    Feeding it raw text would leave most inflected words outside the fitted
    vocabulary.

    Parameters
    ----------
    row : a DataFrame row; must provide ``row['reviews']`` (str) and ``row.name``.

    Returns
    -------
    The single label predicted by ``model`` for this review.

    Side effects
    ------------
    Prints a progress line whenever ``row.name`` is a multiple of 100, in
    addition to whatever progress output ``stemming`` produces.
    """
    # stemming() reads row['text'], so present the review under that label.
    cleaned = stemming(row.rename({"reviews": "text"}))
    encoding = vectorize_model.transform([cleaned])
    prediction = model.predict(encoding)
    if row.name % 100 == 0:
        print(f"Completed {row.name} rows")
    return prediction[0]
# Classify every review row by row and store the predicted label.
dataset['sentiment'] = dataset.apply(get_prediction, axis=1)

In [None]:
# Preview the reviews together with their predicted sentiment.
dataset.head()

In [None]:
from matplotlib import pyplot as plt
# Bar chart of how many reviews received each predicted sentiment label.
# Axis labels let the figure stand alone; the trailing ";" hides the Axes repr.
ax = dataset['sentiment'].value_counts().plot(kind = 'bar',title = "Sentiment analysis of BA")
ax.set(xlabel="Predicted sentiment label", ylabel="Number of reviews");

In [None]:
# Number of reviews per predicted sentiment label.
dataset['sentiment'].value_counts()

In [None]:
from sentence_transformers import SentenceTransformer,util

# Reference complaint sentences, one per complaint category. Reviews are
# compared against the embeddings of these sentences, and the position of the
# best match is used to index the `labels` list, so the order here matters.
complaints = [
    "The flight was delayed by several hours.",
    "My flight got canceled at the last minute.",
    "My luggage was lost during transit.",
    "My baggage was damaged upon arrival.",
    "The customer service was rude and unhelpful.",
    "The seats were too cramped and uncomfortable.",
    "The in-flight food was of poor quality.",
    "There was no variety in the food options.",
    "The aircraft was not clean.",
    "The restroom was dirty and poorly maintained.",
    "I was charged unexpected fees for my baggage.",
    "I had to pay extra for seat selection.",
    "Changing my flight was very difficult and expensive.",
    "The in-flight entertainment system was not working.",
    "There were very few entertainment options available.",
    "The boarding process was chaotic and disorganized.",
    "It took too long to disembark from the plane.",
    "I did not feel safe during the flight.",
    "The security procedures were inadequate.",
    "I had issues with the frequent flyer program.",
    "My frequent flyer points were not credited properly.",
    "There were no rewards available for my frequent flyer points.",
    "The check-in process was slow and inefficient.",
    "The flight attendants were inattentive and unfriendly.",
    "The Wi-Fi on the plane was very slow or didn't work at all.",
    "My seat did not recline properly.",
    "The armrests were broken.",
    "I had no legroom during the flight.",
    "The flight was overbooked, and I was bumped off.",
    "The cabin temperature was too hot or too cold.",
    "The overhead bins were full, and I had to check my carry-on.",
    "The plane was very noisy during the flight.",
    "I was not informed about gate changes.",
    "I had difficulty understanding the announcements.",
    "The pilot provided very few updates about the flight status.",
    "The inflight safety demonstration was unclear.",
    "The baby crying throughout the flight disturbed me."
]

# Load the sentence-embedding model and encode every reference complaint once,
# so the per-review comparison can reuse the result.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding = embed_model.encode(complaints)

In [None]:
# Short category names. labels[i] is the name reported for complaints[i];
# the two lists are matched purely by position.
labels = [
    "Delay",
    "Cancellation",
    "LostLuggage",
    "DamagedLuggage",
    "CustomerService",
    "CrampedSeats",
    "FoodQuality",
    "FoodVariety",
    "UncleanAircraft",
    "DirtyRestroom",
    "ExtraFees",
    "SeatSelection",
    "FlightChange",
    "EntertainmentIssue",
    "FewEntertainment",
    "BoardingProcess",
    "Disembarking",
    "SafetyConcern",
    "Security",
    "FrequentFlyer",
    "PointsCredit",
    "NoRewards",
    "CheckIn",
    "Attendants",
    "WiFi",
    "SeatRecline",
    "BrokenArmrest",
    "NoLegroom",
    "Overbooked",
    "CabinTemperature",
    "FullBins",
    "Noise",
    "GateChange",
    "Announcements",
    "PilotUpdates",
    "SafetyDemo",
    "CryingBaby"
]

# Fail fast if one list is edited without the other; a length mismatch would
# otherwise surface later as an IndexError or as silently wrong category names.
assert len(labels) == len(complaints), "labels and complaints must have the same length"

In [None]:
def compute_similarity_score(row, complaint_embeddings, model):
    """Return the complaint category that best matches a negative review.

    Parameters
    ----------
    row : a DataFrame row providing ``row['sentiment']`` and ``row['reviews']``.
    complaint_embeddings : embeddings of the reference complaint sentences,
        in the same order as the module-level ``labels`` list.
    model : object whose ``encode`` method turns a list of strings into embeddings.

    Returns
    -------
    str
        ``"No Complaint"`` when ``row['sentiment']`` is not 0; otherwise the
        entry of ``labels`` whose reference sentence has the highest cosine
        similarity to the review.
    """
    # Decide first: only reviews with sentiment 0 get a category, so encoding
    # the others would be work whose result is thrown away.
    if row['sentiment'] != 0:
        return "No Complaint"
    review_embedding = model.encode([row['reviews']])[0]
    similarities = util.cos_sim(review_embedding, complaint_embeddings)
    # argmax on the similarity tensor itself; the notebook never imports numpy
    # as `np`, so going through np.argmax raised NameError on a fresh kernel.
    best_index = int(similarities.argmax())
    return labels[best_index]
# Assign a complaint category to every review row by row, passing the
# precomputed complaint embeddings and the embedding model through apply().
dataset['complaint_type'] = dataset.apply(compute_similarity_score,axis=1,complaint_embeddings=embedding,model = embed_model)

In [None]:
# Preview the reviews with their sentiment and complaint category.
dataset.head()

In [None]:
# Bar chart of how many reviews fall into each complaint category.
# Axis labels let the figure stand alone; the trailing ";" hides the Axes repr.
complaint_ax = dataset['complaint_type'].value_counts().plot(kind="bar",title="Complaint analysis")
complaint_ax.set(xlabel="Complaint type", ylabel="Number of reviews");

In [None]:
# Number of reviews per complaint category.
dataset['complaint_type'].value_counts()