# Preprocess Airline Review

In [113]:
from sqlalchemy import create_engine
import pandas as pd
import re
import json
import os 

In [114]:
# RAW_PATH: directory the *_general_info.txt inputs are read from.
# CLEAN_PATH: directory the cleaned CSV tables are written to.
RAW_PATH, CLEAN_PATH = "../data/raw", "../data/clean"

# Create the output directory up front; exist_ok keeps re-runs from failing.
os.makedirs(CLEAN_PATH, exist_ok=True)

## 1. Read general information

In [115]:
# File names (resolved against RAW_PATH when opened) of the general-info
# text file for VietJetAir, Vietnam Airlines and Bamboo Airways.
vj_path, vna_path, bam_path = (
    "vj_general_info.txt",
    "vna_general_info.txt",
    "bamboo_general_info.txt",
)

### VietJetAir information

In [116]:
# Read the VietJetAir info file line by line, stitch the lines back into a
# single string, and print it for a visual check.
with open(RAW_PATH + "/" + vj_path) as file:
    lines = list(file)
all_vj_text = "".join(lines)
print(all_vj_text)

Name: VietJetAir
Phone: 011 84 1900 1886
Address: Headquarters: 530 Nhat Tao, District 11, Ho Chi Minh City Vietnam
Website: http://www.vietjetair.com/
Average Rating: 2.0
Total Review: 6,909 reviews
Popular Mention: ['budget airline', 'chiang rai', 'low cost airline', 'bad reviews', 'low budget']
Attributes: {'Legroom': '2.5 of 5 bubbles', 'Seat comfort': '2.5 of 5 bubbles', 'In-flight entertainment (WiFi, TV, movies)': '1.5 of 5 bubbles', 'Onboard Experience': '2.5 of 5 bubbles', 'Customer service': '2.5 of 5 bubbles', 'Value for money': '2.5 of 5 bubbles', 'Cleanliness': '3.0 of 5 bubbles', 'Check-in and boarding': '2.5 of 5 bubbles'}
Total Rating: {'Excellent': '632', 'Good': '948', 'Average': '731', 'Poor': '573', 'Terrible': '4,035'}



### Vietnam Airlines information

In [117]:
# Read the Vietnam Airlines info file line by line, stitch the lines back
# into a single string, and print it for a visual check.
with open(RAW_PATH + "/" + vna_path) as file:
    lines = list(file)
all_vna_text = "".join(lines)
print(all_vna_text)

Name: Vietnam Airlines
Phone: 011 84 8 3832 0320
Address: Headquarters: 200 Nguyen Son Str., Long Bien District, Hanoi Vietnam
Website: http://www.vietnamairlines.com/
Average Rating: 3.5
Total Review: 8,566 reviews
Popular Mention: ['da nang', 'ho chi minh', 'internal flights', 'phu quoc', 'lotus lounge', 'premium economy', 'domestic terminal', 'short flight', 'noodles', 'fish', 'flat bed', 'full service', 'through immigration', 'plenty of leg room', 'fruit', 'saigon', 'sgn', 'dreamliner', 'trang', 'hoi', 'english', 'visas', 'slippers', 'city', 'beds', 'sector']
Attributes: {'Legroom': '3.5 of 5 bubbles', 'Seat comfort': '3.5 of 5 bubbles', 'In-flight entertainment (WiFi, TV, movies)': '3.0 of 5 bubbles', 'Onboard Experience': '3.5 of 5 bubbles', 'Customer service': '3.5 of 5 bubbles', 'Value for money': '4.0 of 5 bubbles', 'Cleanliness': '4.0 of 5 bubbles', 'Check-in and boarding': '4.0 of 5 bubbles'}
Total Rating: {'Excellent': '2,556', 'Good': '3,004', 'Average': '1,465', 'Poor': '

### Bamboo Airways information

In [118]:
# Read the Bamboo Airways info file line by line, stitch the lines back
# into a single string, and print it for a visual check.
with open(RAW_PATH + "/" + bam_path) as file:
    lines = list(file)
all_bam_text = "".join(lines)
print(all_bam_text)

Name: Bamboo Airways
Phone: 011 84 1900 1166
Address: Headquarters: 265 Cau Giay Street, Bamboo Airways Tower, Dich Vong Ward, Cau Giay District, Quy Nhon Vietnam
Website: https://www.bambooairways.com/en/
Average Rating: 2.0
Total Review: 564 reviews
Popular Mention: ['phu quoc', 'ho chi minh', 'seating layout', 'wouldn t hesitate', 'food on offer', 'sandwich', 'service offered', 'bad reviews', 'affordable price', 'food beverage', 'from london gatwick', 'budget airline', 'light meal', 'great choice', 'great value for money', 'business class passengers', 'full refund', 'fish', 'hot meal', 'full service', 'great experience', 'great customer service', 'be honest', 'return journey', 'doesn t', 'decoration', 'sgn', 'saigon', 'escort', 'econ']
Attributes: {'Legroom': '3.0 of 5 bubbles', 'Seat comfort': '3.0 of 5 bubbles', 'In-flight entertainment (WiFi, TV, movies)': '2.0 of 5 bubbles', 'Onboard Experience': '3.0 of 5 bubbles', 'Customer service': '2.5 of 5 bubbles', 'Value for money': '3.0

## Extract text and convert it into a DataFrame

In [119]:
# Preview: capture what sits between the brackets of the "Popular Mention"
# line in the Bamboo Airways text, then drop every single-quote character.
popular_mention = re.search(r"Popular Mention: \[(.+)\]", all_bam_text)
popular_mention.group(1).replace("'", "")

'phu quoc, ho chi minh, seating layout, wouldn t hesitate, food on offer, sandwich, service offered, bad reviews, affordable price, food beverage, from london gatwick, budget airline, light meal, great choice, great value for money, business class passengers, full refund, fish, hot meal, full service, great experience, great customer service, be honest, return journey, doesn t, decoration, sgn, saigon, escort, econ'

In [120]:
def _parse_literal(text):
    """Parse the textual form of a dict/list literal taken from an info file.

    The text is first read as a Python literal, which copes with keys or
    values that contain an apostrophe (Python then writes them with double
    quotes). If that fails, fall back to swapping single quotes for double
    quotes and decoding the result as JSON, so text that only JSON accepts
    still parses.

    Raises:
        json.JSONDecodeError: if neither parser accepts the text.
    """
    import ast  # local import keeps this block self-contained

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return json.loads(text.replace("'", "\""))


def _parse_mentions(inner_text):
    """Split the text found between the brackets of "Popular Mention".

    Prefers parsing the bracketed text as a Python list of strings so that
    mentions containing an apostrophe or a comma survive intact. Anything
    that does not parse to a list of strings is handled by removing single
    quotes and splitting on ", ".
    """
    import ast  # local import keeps this block self-contained

    try:
        parsed = ast.literal_eval(f"[{inner_text}]")
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return parsed
    return inner_text.replace("'", "").split(", ")


def extract_info(original_path, file_name, airline_dict):
    """Parse one airline general-info text file into ``airline_dict``.

    The file is expected to hold ``Label: value`` lines (Name, Phone,
    Address/Headquarters, Website, Average Rating, Total Review,
    Popular Mention, Attributes, Total Rating). Each field is located with
    a regular expression; a field that is not found is recorded as ``None``
    (scalars), ``[]`` (popular mentions) or ``{}`` (attributes, rating).

    Args:
        original_path: Directory containing the file.
        file_name: Name of the file inside ``original_path``; it is printed
            before the file is read.
        airline_dict: Dict mapping a field name to a list of values. One
            value per field is appended in place; a missing key is created
            on first use. The key ``"averating_rating"`` keeps its existing
            spelling because the rest of the notebook selects that column.

    Returns:
        None. ``airline_dict`` is modified in place.

    Raises:
        json.JSONDecodeError: if the Attributes or Total Rating text cannot
            be parsed.
    """
    print(file_name)
    with open(f"{original_path}/{file_name}", "r") as file:
        all_text = file.read()

    def first_group(pattern):
        # First capture group of the first match in the file, or None.
        match = re.search(pattern, all_text)
        return match.group(1) if match else None

    avg_rating = first_group(r"Average Rating: ([\d.]+)\n")
    total_review = first_group(r"Total Review: ([\d,]+) reviews\n")
    mention_text = first_group(r"Popular Mention: \[(.+)\]")
    attributes_text = first_group(r"Attributes: (.+)")
    rating_text = first_group(r"Total Rating: (.+)")

    # Parse every field before touching airline_dict, so a parse error
    # cannot leave the per-field lists with different lengths.
    record = {
        "name": first_group(r"Name: (.+)\n"),
        "phone": first_group(r"Phone: ([\d\s]+)\n"),
        "address": first_group(r"Headquarters: (.+)\n"),
        "website": first_group(r"Website: (.+)\n"),
        "averating_rating": float(avg_rating) if avg_rating else None,
        # The review total carries thousands separators, e.g. "6,909".
        "total_review": int(total_review.replace(",", "")) if total_review else None,
        "popular_mention": _parse_mentions(mention_text) if mention_text else [],
        "attributes": _parse_literal(attributes_text) if attributes_text else {},
        "rating": _parse_literal(rating_text) if rating_text else {},
    }

    for field, value in record.items():
        airline_dict.setdefault(field, []).append(value)

    return


# Column names of the combined table; each one starts with its own empty
# list that extract_info appends to, one value per processed file.
info_headers = [
    'name',
    'phone',
    'address',
    'website',
    'averating_rating',
    'total_review',
    'popular_mention',
    'attributes',
    'rating',
]
airline_dict = {header: [] for header in info_headers}

# Parse the three airline files in a fixed order, so list position i
# always refers to the same airline across every column.
for file in (vj_path, vna_path, bam_path):
    extract_info(RAW_PATH, file, airline_dict)

vj_general_info.txt
vna_general_info.txt
bamboo_general_info.txt


In [121]:
# Turn the column-oriented dict (one list per field) into a DataFrame and
# display it.
airline_df = pd.DataFrame(data=airline_dict)
airline_df

Unnamed: 0,name,phone,address,website,averating_rating,total_review,popular_mention,attributes,rating
0,VietJetAir,011 84 1900 1886,"530 Nhat Tao, District 11, Ho Chi Minh City Vi...",http://www.vietjetair.com/,2.0,6909,"[budget airline, chiang rai, low cost airline,...","{'Legroom': '2.5 of 5 bubbles', 'Seat comfort'...","{'Excellent': '632', 'Good': '948', 'Average':..."
1,Vietnam Airlines,011 84 8 3832 0320,"200 Nguyen Son Str., Long Bien District, Hanoi...",http://www.vietnamairlines.com/,3.5,8566,"[da nang, ho chi minh, internal flights, phu q...","{'Legroom': '3.5 of 5 bubbles', 'Seat comfort'...","{'Excellent': '2,556', 'Good': '3,004', 'Avera..."
2,Bamboo Airways,011 84 1900 1166,"265 Cau Giay Street, Bamboo Airways Tower, Dic...",https://www.bambooairways.com/en/,2.0,564,"[phu quoc, ho chi minh, seating layout, wouldn...","{'Legroom': '3.0 of 5 bubbles', 'Seat comfort'...","{'Excellent': '111', 'Good': '56', 'Average': ..."


### Extract Airline Information Table

In [122]:
# Keep only the scalar, one-value-per-airline columns; the list/dict columns
# (popular_mention, attributes, rating) are split into their own tables.
airline_info_df = airline_df.loc[
    :,
    ['name', 'phone', 'address', 'website', 'averating_rating', 'total_review'],
]
airline_info_df

Unnamed: 0,name,phone,address,website,averating_rating,total_review
0,VietJetAir,011 84 1900 1886,"530 Nhat Tao, District 11, Ho Chi Minh City Vi...",http://www.vietjetair.com/,2.0,6909
1,Vietnam Airlines,011 84 8 3832 0320,"200 Nguyen Son Str., Long Bien District, Hanoi...",http://www.vietnamairlines.com/,3.5,8566
2,Bamboo Airways,011 84 1900 1166,"265 Cau Giay Street, Bamboo Airways Tower, Dic...",https://www.bambooairways.com/en/,2.0,564


### Extract Popular Mention Table

In [123]:
# Build the popular-mention table: one row per (mention, airline_id).
mention_df = airline_df[['name', 'popular_mention']].copy()

# Map each airline name to the index label of its first row in
# airline_info_df. The lookup is built once instead of rescanning the whole
# name column for every row.
airline_id_by_name = {}
for airline_id, airline_name in airline_info_df['name'].items():
    airline_id_by_name.setdefault(airline_name, airline_id)
mention_df['airline_id'] = mention_df['name'].map(airline_id_by_name)
mention_df = mention_df.drop(columns='name')

# One output row per list element; the index is renumbered from 0.
mention_df = mention_df.explode('popular_mention', ignore_index=True)
mention_df

Unnamed: 0,popular_mention,airline_id
0,budget airline,0
1,chiang rai,0
2,low cost airline,0
3,bad reviews,0
4,low budget,0
...,...,...
56,decoration,2
57,sgn,2
58,saigon,2
59,escort,2


### Extract Rating Table

In [124]:
# Build the rating table: one row per (airline_id, rate_name, count).
rating_df = airline_df[['name', 'rating']].copy()

# Map each airline name to the index label of its first row in
# airline_info_df. The lookup is built once instead of rescanning the whole
# name column for every row.
airline_id_by_name = {}
for airline_id, airline_name in airline_info_df['name'].items():
    airline_id_by_name.setdefault(airline_name, airline_id)
rating_df['airline_id'] = rating_df['name'].map(airline_id_by_name)
rating_df = rating_df.drop(columns='name')

# Flatten every {rate_name: count} dict into plain records and build a single
# DataFrame from them, rather than one small DataFrame per airline.
rating_records = []
for airline_id, rating in zip(rating_df['airline_id'], rating_df['rating']):
    for rate_name, count in rating.items():
        rating_records.append(
            {'airline_id': airline_id, 'rate_name': rate_name, 'count': count}
        )
final_rating_df = pd.DataFrame(
    rating_records, columns=['airline_id', 'rate_name', 'count']
)

# Counts arrive as text with thousands separators (e.g. '4,035'); store them
# as numbers, like total_review. Text that is not numeric becomes NaN
# instead of raising.
final_rating_df['count'] = pd.to_numeric(
    final_rating_df['count'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)


final_rating_df

Unnamed: 0,airline_id,rate_name,count
0,0,Excellent,632
1,0,Good,948
2,0,Average,731
3,0,Poor,573
4,0,Terrible,4035
5,1,Excellent,2556
6,1,Good,3004
7,1,Average,1465
8,1,Poor,606
9,1,Terrible,939


### Extract Attribute Table

In [125]:
# Build the attribute table: one row per (airline_id, attribute_name, rating).
attribute_df = airline_df[['name', 'attributes']].copy()

# Map each airline name to the index label of its first row in
# airline_info_df. The lookup is built once instead of rescanning the whole
# name column for every row.
airline_id_by_name = {}
for airline_id, airline_name in airline_info_df['name'].items():
    airline_id_by_name.setdefault(airline_name, airline_id)
attribute_df['airline_id'] = attribute_df['name'].map(airline_id_by_name)
attribute_df = attribute_df.drop(columns='name')

# Flatten every {attribute_name: rating_text} dict into plain records and
# build a single DataFrame from them, rather than one per airline.
attribute_records = []
for airline_id, attributes in zip(attribute_df['airline_id'], attribute_df['attributes']):
    for attribute_name, rate in attributes.items():
        attribute_records.append(
            {'airline_id': airline_id, 'attribute_name': attribute_name, 'rating': rate}
        )
final_attribute_df = pd.DataFrame(
    attribute_records, columns=['airline_id', 'attribute_name', 'rating']
)

# Rating text looks like '2.5 of 5 bubbles'; keep the leading number. The
# pattern also accepts values without a decimal part ('5 of 5 bubbles'), and
# text that does not match becomes NaN instead of raising.
final_attribute_df['rating'] = (
    final_attribute_df['rating']
    .astype(str)
    .str.extract(r"(\d+(?:\.\d+)?) of", expand=False)
    .astype(float)
)
final_attribute_df

Unnamed: 0,airline_id,attribute_name,rating
0,0,Legroom,2.5
1,0,Seat comfort,2.5
2,0,"In-flight entertainment (WiFi, TV, movies)",1.5
3,0,Onboard Experience,2.5
4,0,Customer service,2.5
5,0,Value for money,2.5
6,0,Cleanliness,3.0
7,0,Check-in and boarding,2.5
8,1,Legroom,3.5
9,1,Seat comfort,3.5


## Save all cleaned DataFrames

In [126]:
# Write each cleaned table to its CSV file under CLEAN_PATH, in the same
# order as before; the DataFrame index is written too (to_csv default).
csv_targets = (
    (final_attribute_df, f"{CLEAN_PATH}/attribute.csv"),
    (final_rating_df, f"{CLEAN_PATH}/rating.csv"),
    (mention_df, f"{CLEAN_PATH}/mention.csv"),
    (airline_info_df, f"{CLEAN_PATH}/info.csv"),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path)