# Preprocess Airline General Information

In [None]:
from sqlalchemy import create_engine
import pandas as pd
import re
import json
import os 
from datetime import datetime

In [2]:
RAW_PATH = "../data/raw"
CLEAN_PATH = "../data/clean"
os.makedirs(CLEAN_PATH, exist_ok=True)

## 1. Read general information

In [3]:
vj_path = "vj_general_info.txt"
vna_path = "vna_general_info.txt"
bam_path = "bamboo_general_info.txt"

### VietJetAir information

In [4]:
with open(f"{RAW_PATH}/{vj_path}") as file:
    lines = file.readlines()
all_vj_text = ''.join(lines)
print(all_vj_text)

Name: VietJetAir
Phone: 011 84 1900 1886
Address: Headquarters: 530 Nhat Tao, District 11, Ho Chi Minh City Vietnam
Website: http://www.vietjetair.com/
Average Rating: 2.0
Total Review: 6,909 reviews
Popular Mention: ['budget airline', 'chiang rai', 'low cost airline', 'bad reviews', 'low budget']
Attributes: {'Legroom': '2.5 of 5 bubbles', 'Seat comfort': '2.5 of 5 bubbles', 'In-flight entertainment (WiFi, TV, movies)': '1.5 of 5 bubbles', 'Onboard Experience': '2.5 of 5 bubbles', 'Customer service': '2.5 of 5 bubbles', 'Value for money': '2.5 of 5 bubbles', 'Cleanliness': '3.0 of 5 bubbles', 'Check-in and boarding': '2.5 of 5 bubbles'}
Total Rating: {'Excellent': '632', 'Good': '948', 'Average': '731', 'Poor': '573', 'Terrible': '4,035'}



### Vietnam Airlines information

In [5]:
with open(f"{RAW_PATH}/{vna_path}") as file:
    lines = file.readlines()
all_vna_text = ''.join(lines)
print(all_vna_text)

Name: Vietnam Airlines
Phone: 011 84 8 3832 0320
Address: Headquarters: 200 Nguyen Son Str., Long Bien District, Hanoi Vietnam
Website: http://www.vietnamairlines.com/
Average Rating: 3.5
Total Review: 8,566 reviews
Popular Mention: ['da nang', 'ho chi minh', 'internal flights', 'phu quoc', 'lotus lounge', 'premium economy', 'domestic terminal', 'short flight', 'noodles', 'fish', 'flat bed', 'full service', 'through immigration', 'plenty of leg room', 'fruit', 'saigon', 'sgn', 'dreamliner', 'trang', 'hoi', 'english', 'visas', 'slippers', 'city', 'beds', 'sector']
Attributes: {'Legroom': '3.5 of 5 bubbles', 'Seat comfort': '3.5 of 5 bubbles', 'In-flight entertainment (WiFi, TV, movies)': '3.0 of 5 bubbles', 'Onboard Experience': '3.5 of 5 bubbles', 'Customer service': '3.5 of 5 bubbles', 'Value for money': '4.0 of 5 bubbles', 'Cleanliness': '4.0 of 5 bubbles', 'Check-in and boarding': '4.0 of 5 bubbles'}
Total Rating: {'Excellent': '2,556', 'Good': '3,004', 'Average': '1,465', 'Poor': '

### Bamboo Airways information

In [6]:
with open(f"{RAW_PATH}/{bam_path}") as file:
    lines = file.readlines()
all_bam_text = ''.join(lines)
print(all_bam_text)

Name: Bamboo Airways
Phone: 011 84 1900 1166
Address: Headquarters: 265 Cau Giay Street, Bamboo Airways Tower, Dich Vong Ward, Cau Giay District, Quy Nhon Vietnam
Website: https://www.bambooairways.com/en/
Average Rating: 2.0
Total Review: 564 reviews
Popular Mention: ['phu quoc', 'ho chi minh', 'seating layout', 'wouldn t hesitate', 'food on offer', 'sandwich', 'service offered', 'bad reviews', 'affordable price', 'food beverage', 'from london gatwick', 'budget airline', 'light meal', 'great choice', 'great value for money', 'business class passengers', 'full refund', 'fish', 'hot meal', 'full service', 'great experience', 'great customer service', 'be honest', 'return journey', 'doesn t', 'decoration', 'sgn', 'saigon', 'escort', 'econ']
Attributes: {'Legroom': '3.0 of 5 bubbles', 'Seat comfort': '3.0 of 5 bubbles', 'In-flight entertainment (WiFi, TV, movies)': '2.0 of 5 bubbles', 'Onboard Experience': '3.0 of 5 bubbles', 'Customer service': '2.5 of 5 bubbles', 'Value for money': '3.0

## 2. Extract text and convert into DataFrame

In [7]:
def extract_info(original_path, file_name, airline_dict):
    print(file_name)
    with open(f"{original_path}/{file_name}", "r") as file:
        all_text = file.read()

    name = re.search(r"Name: (.+)\n", all_text)
    airline_dict["name"].append(name.group(1) if name else None)


    phone = re.search(r"Phone: ([\d\s]+)\n", all_text)
    airline_dict["phone"].append(phone.group(1) if phone else None)


    address = re.search(r"Headquarters: (.+)\n", all_text)
    airline_dict["address"].append(address.group(1) if address else None)

    website  = re.search(r"Website: (.+)\n", all_text)
    airline_dict["website"].append(website.group(1) if website else None)

    avg_rating = re.search(r"Average Rating: ([\d.]+)\n", all_text)
    airline_dict["averating_rating"].append(float(avg_rating.group(1)) if avg_rating else None)


    total_review = re.search(r"Total Review: ([\d,]+) reviews\n", all_text)
    airline_dict["total_review"].append(int(total_review.group(1).replace(',', '')) if total_review else None)


    popular_mention = re.search(r"Popular Mention: \[(.+)\]", all_text)
    if popular_mention:
        popular_mention = re.sub("'","",popular_mention.group(1))
        airline_dict["popular_mention"].append(popular_mention.split(", "))
    else :
        airline_dict["popular_mention"].append([])
        
    attributes = re.search(r"Attributes: (.+)", all_text)
    if attributes:
        attributes_text = attributes.group(1)
        attributes_text = attributes_text.replace("'", "\"")  
        airline_dict["attributes"].append(json.loads(attributes_text))
    else:
        airline_dict["attributes"].append({})

    detail_rate = re.search(r"Total Rating: (.+)", all_text)
    if detail_rate:
        detail_rate_text = detail_rate.group(1)
        detail_rate_text = detail_rate_text.replace("'", "\"") 
        airline_dict["rating"].append(json.loads(detail_rate_text))
    else:
        airline_dict["rating"].append({})

    return


info_headers = ['name', 'phone', 'address','website','averating_rating', 'total_review','popular_mention','attributes', 'rating']
airline_dict = {}
for header in info_headers:
    airline_dict[header] = []

for file in [vj_path, vna_path, bam_path]: 
    extract_info(RAW_PATH,file,airline_dict)

vj_general_info.txt
vna_general_info.txt
bamboo_general_info.txt


In [8]:
airline_df = pd.DataFrame(airline_dict)
airline_df

Unnamed: 0,name,phone,address,website,averating_rating,total_review,popular_mention,attributes,rating
0,VietJetAir,011 84 1900 1886,"530 Nhat Tao, District 11, Ho Chi Minh City Vi...",http://www.vietjetair.com/,2.0,6909,"[budget airline, chiang rai, low cost airline,...","{'Legroom': '2.5 of 5 bubbles', 'Seat comfort'...","{'Excellent': '632', 'Good': '948', 'Average':..."
1,Vietnam Airlines,011 84 8 3832 0320,"200 Nguyen Son Str., Long Bien District, Hanoi...",http://www.vietnamairlines.com/,3.5,8566,"[da nang, ho chi minh, internal flights, phu q...","{'Legroom': '3.5 of 5 bubbles', 'Seat comfort'...","{'Excellent': '2,556', 'Good': '3,004', 'Avera..."
2,Bamboo Airways,011 84 1900 1166,"265 Cau Giay Street, Bamboo Airways Tower, Dic...",https://www.bambooairways.com/en/,2.0,564,"[phu quoc, ho chi minh, seating layout, wouldn...","{'Legroom': '3.0 of 5 bubbles', 'Seat comfort'...","{'Excellent': '111', 'Good': '56', 'Average': ..."


## 3. Extract Airline Information Table

In [9]:
airline_info_df = airline_df[['name','phone', 'address','website','averating_rating','total_review']]
airline_info_df

Unnamed: 0,name,phone,address,website,averating_rating,total_review
0,VietJetAir,011 84 1900 1886,"530 Nhat Tao, District 11, Ho Chi Minh City Vi...",http://www.vietjetair.com/,2.0,6909
1,Vietnam Airlines,011 84 8 3832 0320,"200 Nguyen Son Str., Long Bien District, Hanoi...",http://www.vietnamairlines.com/,3.5,8566
2,Bamboo Airways,011 84 1900 1166,"265 Cau Giay Street, Bamboo Airways Tower, Dic...",https://www.bambooairways.com/en/,2.0,564


## 4. Extract Popular Mention Table

In [10]:
mention_df = airline_df[['name', 'popular_mention']].copy()
mention_df['airline_id'] = mention_df['name'].apply(lambda x: airline_info_df.loc[x == mention_df['name']].index[0])
mention_df.drop('name', axis=1, inplace=True)

mention_df = mention_df.explode('popular_mention', ignore_index=True)
mention_df

Unnamed: 0,popular_mention,airline_id
0,budget airline,0
1,chiang rai,0
2,low cost airline,0
3,bad reviews,0
4,low budget,0
...,...,...
56,decoration,2
57,sgn,2
58,saigon,2
59,escort,2


## 5. Extract Rating Table

In [11]:
rating_df = airline_df[['name','rating']].copy()
rating_df['airline_id'] = rating_df['name'].apply(lambda x : airline_info_df.loc[x == rating_df['name']].index[0])
rating_df.drop('name',axis=1,inplace=True)
all_sub_dfs = []
for _,row in rating_df.iterrows():
    sub_df = pd.DataFrame([{'airline_id': row['airline_id'], 'rate_name': rate_name, 'count': count} for rate_name, count in row['rating'].items()])
    all_sub_dfs.append(sub_df)

final_rating_df = pd.concat(all_sub_dfs, ignore_index=True)


final_rating_df

Unnamed: 0,airline_id,rate_name,count
0,0,Excellent,632
1,0,Good,948
2,0,Average,731
3,0,Poor,573
4,0,Terrible,4035
5,1,Excellent,2556
6,1,Good,3004
7,1,Average,1465
8,1,Poor,606
9,1,Terrible,939


## 6. Extract Attribute Table

In [12]:
attribute_df = airline_df[['name','attributes']].copy()
attribute_df['airline_id'] = attribute_df['name'].apply(lambda x : airline_info_df.loc[x == attribute_df['name']].index[0])
attribute_df.drop('name',axis=1,inplace=True)
all_sub_attr_dfs = []
for _,row in attribute_df.iterrows():
    sub_df = pd.DataFrame([{'airline_id': row['airline_id'], 'attribute_name': attribute_name, 'rating': rate} for attribute_name, rate in row['attributes'].items()])
    all_sub_attr_dfs.append(sub_df)

final_attribute_df = pd.concat(all_sub_attr_dfs, ignore_index=True)

final_attribute_df['rating'] = final_attribute_df['rating'].apply(lambda x: float(re.search(r"(\d\.\d) of",x).group(1)))
final_attribute_df

Unnamed: 0,airline_id,attribute_name,rating
0,0,Legroom,2.5
1,0,Seat comfort,2.5
2,0,"In-flight entertainment (WiFi, TV, movies)",1.5
3,0,Onboard Experience,2.5
4,0,Customer service,2.5
5,0,Value for money,2.5
6,0,Cleanliness,3.0
7,0,Check-in and boarding,2.5
8,1,Legroom,3.5
9,1,Seat comfort,3.5


# Preprocess Review Data

In [13]:
vj_review_path = "vna_all_reviews_data.csv"
vna_review_path = "vj_all_reviews_data.csv"
bam_review_path = "bamboo_all_reviews_data.csv"

## Read all sub review DataFrames and Combine all DataFrames

### Add Airline column

In [14]:
vj_review_df = pd.read_csv(f"{RAW_PATH}/{vj_review_path}")
vj_review_df['Airline'] = ['VietJetAir' for _ in range(vj_review_df.shape[0])]
vj_review_df.head(2)

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline
0,5.0 of 5 bubbles,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir
1,1.0 of 5 bubbles,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,,VietJetAir


In [15]:
vna_review_df = pd.read_csv(f"{RAW_PATH}/{vna_review_path}")
vna_review_df['Airline'] = ['Vietnam Airlines' for _ in range(vna_review_df.shape[0])]
vna_review_df.head(2)

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline
0,1.0 of 5 bubbles,Never fly with this airline again,They weight your carry on at very last minute ...,Date of travel: March 2025,,Vietnam Airlines
1,1.0 of 5 bubbles,Worse flight experience,"Worse experience, they made my last memory of ...",Date of travel: March 2025,"[{'Service Rating': '2.0 of 5 bubbles', 'Servi...",Vietnam Airlines


In [16]:
bam_review_df = pd.read_csv(f"{RAW_PATH}/{bam_review_path}")
bam_review_df['Airline'] = ['Bamboo Airways' for _ in range(bam_review_df.shape[0])]
bam_review_df.head(2)

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline
0,1.0 of 5 bubbles,Don't do it,"Never again, 1st and last. \r\nFlight was book...",Date of travel: March 2025,"[{'Service Rating': '2.0 of 5 bubbles', 'Servi...",Bamboo Airways
1,1.0 of 5 bubbles,"Used to be good, now terrible.",When this airline first started it was a very ...,Date of travel: March 2025,,Bamboo Airways


In [17]:
combined_df = pd.concat([vj_review_df, vna_review_df, bam_review_df], ignore_index=True)
combined_df['airline_id'] = combined_df['Airline'].apply(lambda x : airline_info_df.loc[x == airline_info_df['name']].index[0])
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline,airline_id
0,5.0 of 5 bubbles,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
1,1.0 of 5 bubbles,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,,VietJetAir,0
2,5.0 of 5 bubbles,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,Date of travel: March 2025,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi...",VietJetAir,0
3,4.0 of 5 bubbles,A Good Airline,"The plane was clean, the seating was comfortab...",Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
4,1.0 of 5 bubbles,Not worth it.,Wort experience ever. The food was terrible on...,Date of travel: March 2025,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi...",VietJetAir,0


### Extract Rating Column

In [18]:
combined_df['Rating'] = combined_df['Rating'].apply(lambda x: float(re.search(r"(\d\.\d) of",x).group(1)))
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline,airline_id
0,5.0,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
1,1.0,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,,VietJetAir,0
2,5.0,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,Date of travel: March 2025,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi...",VietJetAir,0
3,4.0,A Good Airline,"The plane was clean, the seating was comfortab...",Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
4,1.0,Not worth it.,Wort experience ever. The food was terrible on...,Date of travel: March 2025,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi...",VietJetAir,0


### Extract date information

In [19]:
combined_df.dropna(subset=['Information'],inplace=True,ignore_index=True)
combined_df['Information'] = combined_df['Information'].apply(lambda x: datetime.strptime(re.search(r"Date of travel: (.+)",x).group(1), "%B %Y"))
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline,airline_id
0,5.0,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,2025-03-01 00:00:00,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
1,1.0,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",2025-03-01 00:00:00,,VietJetAir,0
2,5.0,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,2025-03-01 00:00:00,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi...",VietJetAir,0
3,4.0,A Good Airline,"The plane was clean, the seating was comfortab...",2025-03-01 00:00:00,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
4,1.0,Not worth it.,Wort experience ever. The food was terrible on...,2025-03-01 00:00:00,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi...",VietJetAir,0


### Preprocess textual columns

In [20]:
def preprocess(s):
    if s is None: 
        return ''
    s = s.lower()
    s = re.sub(r'\s+',' ',s)
    s = re.sub(r'[^\w\s]', '', s)
    return s

In [21]:
combined_df['Title'] = combined_df['Title'].apply(lambda x: preprocess(x))
combined_df['Full Review'] = combined_df['Full Review'].apply(lambda x: preprocess(x))
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings,Airline,airline_id
0,5.0,took care of us in difficult situation,i was a little worried when we had an equipmen...,2025-03-01 00:00:00,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
1,1.0,vietnam airlines experience poor,checkin service staff at saigon where rude unh...,2025-03-01 00:00:00,,VietJetAir,0
2,5.0,passport show no month and date of my dob,hello friends i have my vietnam passport but d...,2025-03-01 00:00:00,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi...",VietJetAir,0
3,4.0,a good airline,the plane was clean the seating was comfortabl...,2025-03-01 00:00:00,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi...",VietJetAir,0
4,1.0,not worth it,wort experience ever the food was terrible on ...,2025-03-01 00:00:00,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi...",VietJetAir,0


### Extract Service Rating DataFrame

In [22]:
service_df = combined_df[['airline_id','Service Ratings','Information']].copy()
service_df.dropna(subset=['Service Ratings'],inplace=True,ignore_index=True)

service_df['Service Ratings'] = service_df['Service Ratings'].apply(lambda x: json.loads(x.replace("'", "\"") ))
all_sub_service_dfs = []
for _,row in service_df.iterrows():
    sub_df = pd.DataFrame([{'airline_id': row['airline_id'], 'service_name': service['Service Info'], 'rating': service['Service Rating']} for service in row['Service Ratings']])
    all_sub_service_dfs.append(sub_df)

final_service_df = pd.concat(all_sub_service_dfs, ignore_index=True)

final_service_df['rating'] = final_service_df['rating'].apply(lambda x: float(re.search(r"(\d\.\d) of",x).group(1)))
final_service_df.head()

Unnamed: 0,airline_id,service_name,rating
0,0,Legroom,5.0
1,0,Seat comfort,5.0
2,0,In-flight Entertainment,4.0
3,0,Customer service,5.0
4,0,Value for money,5.0


### Drop unnecessary columns

In [23]:
combined_df.drop(['Service Ratings', 'Airline'], axis=1, inplace=True)

# Save all clean DataFrame

In [24]:
os.makedirs(os.path.join(CLEAN_PATH, "review_airline"), exist_ok=True)

In [25]:
final_attribute_df.to_csv(f"{CLEAN_PATH}/review_airline/attribute.csv",index=False)
final_rating_df.to_csv(f"{CLEAN_PATH}/review_airline/rating.csv",index=False)
mention_df.to_csv(f"{CLEAN_PATH}/review_airline/mention.csv",index=False)
airline_info_df.to_csv(f"{CLEAN_PATH}/info.csv",index=False)
combined_df.to_csv(f"{CLEAN_PATH}/all_airlines_review_cleaned.csv",index=False)
final_service_df.to_csv(f"{CLEAN_PATH}/review_airline/review_service.csv",index=False)

# Insert into Database (SQL Server)

In [26]:
from sqlalchemy import create_engine
from dotenv import load_dotenv

load_dotenv()

server = os.getenv("DB_SERVER")
database = os.getenv("DB_NAME")
username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
driver = 17
mode = 'replace' # 'fail' 'replace' 'append'

In [27]:
def insert_into_sql_server(df, driver, server, database, username, password, mode, table_name) : 
    conn_str = f"mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+{driver}+for+SQL+Server"
    engine = create_engine(conn_str)
    df.to_sql(name=table_name, con=engine, schema='dbo', if_exists=mode, index=False)

In [28]:
insert_into_sql_server(mention_df, driver, server, database, username, password, mode, "MENTION")
insert_into_sql_server(final_rating_df, driver, server, database, username, password, mode, "RATING")
insert_into_sql_server(combined_df, driver, server, database, username, password, mode, "AIRLINE_REVIEW")
insert_into_sql_server(airline_info_df, driver, server, database, username, password, mode, "INFO")
insert_into_sql_server(final_attribute_df, driver, server, database, username, password, mode, "ATTRIBUTE")
insert_into_sql_server(final_service_df, driver, server, database, username, password, mode, "REVIEW_SERVICE")