# Preprocess Airline General Information

In [None]:
from sqlalchemy import create_engine
import pandas as pd
import re
import json
import os 
from datetime import datetime

In [None]:
RAW_PATH = "../data/raw"
CLEAN_PATH = "../data/clean"
os.makedirs(CLEAN_PATH, exist_ok=True)

## 1. Read general information

In [None]:
vj_path = "vj_general_info.txt"
vna_path = "vna_general_info.txt"
bam_path = "bamboo_general_info.txt"

### VietJetAir information

In [None]:
with open(f"{RAW_PATH}/{vj_path}") as file:
    lines = file.readlines()
all_vj_text = ''.join(lines)
print(all_vj_text)

### Vietnam Airlines information

In [None]:
with open(f"{RAW_PATH}/{vna_path}") as file:
    lines = file.readlines()
all_vna_text = ''.join(lines)
print(all_vna_text)

### Bamboo Airways information

In [None]:
with open(f"{RAW_PATH}/{bam_path}") as file:
    lines = file.readlines()
all_bam_text = ''.join(lines)
print(all_bam_text)

## 2. Extract text and convert into DataFrame

In [None]:
def extract_info(original_path, file_name, airline_dict):
    print(file_name)
    with open(f"{original_path}/{file_name}", "r") as file:
        all_text = file.read()

    name = re.search(r"Name: (.+)\n", all_text)
    airline_dict["name"].append(name.group(1) if name else None)


    phone = re.search(r"Phone: ([\d\s]+)\n", all_text)
    airline_dict["phone"].append(phone.group(1) if phone else None)


    address = re.search(r"Headquarters: (.+)\n", all_text)
    airline_dict["address"].append(address.group(1) if address else None)

    website  = re.search(r"Website: (.+)\n", all_text)
    airline_dict["website"].append(website.group(1) if website else None)

    avg_rating = re.search(r"Average Rating: ([\d.]+)\n", all_text)
    airline_dict["averating_rating"].append(float(avg_rating.group(1)) if avg_rating else None)


    total_review = re.search(r"Total Review: ([\d,]+) reviews\n", all_text)
    airline_dict["total_review"].append(int(total_review.group(1).replace(',', '')) if total_review else None)


    popular_mention = re.search(r"Popular Mention: \[(.+)\]", all_text)
    if popular_mention:
        popular_mention = re.sub("'","",popular_mention.group(1))
        airline_dict["popular_mention"].append(popular_mention.split(", "))
    else :
        airline_dict["popular_mention"].append([])
        
    attributes = re.search(r"Attributes: (.+)", all_text)
    if attributes:
        attributes_text = attributes.group(1)
        attributes_text = attributes_text.replace("'", "\"")  
        airline_dict["attributes"].append(json.loads(attributes_text))
    else:
        airline_dict["attributes"].append({})

    detail_rate = re.search(r"Total Rating: (.+)", all_text)
    if detail_rate:
        detail_rate_text = detail_rate.group(1)
        detail_rate_text = detail_rate_text.replace("'", "\"") 
        airline_dict["rating"].append(json.loads(detail_rate_text))
    else:
        airline_dict["rating"].append({})

    return


info_headers = ['name', 'phone', 'address','website','averating_rating', 'total_review','popular_mention','attributes', 'rating']
airline_dict = {}
for header in info_headers:
    airline_dict[header] = []

for file in [vj_path, vna_path, bam_path]: 
    extract_info(RAW_PATH,file,airline_dict)

In [None]:
airline_df = pd.DataFrame(airline_dict)
airline_df

## 3. Extract Airline Information Table

In [None]:
airline_info_df = airline_df[['name','phone', 'address','website','averating_rating','total_review']]
airline_info_df

## 4. Extract Popular Mention Table

In [None]:
mention_df = airline_df[['name', 'popular_mention']].copy()
mention_df['airline_id'] = mention_df['name'].apply(lambda x: airline_info_df.loc[x == mention_df['name']].index[0])
mention_df.drop('name', axis=1, inplace=True)

mention_df = mention_df.explode('popular_mention', ignore_index=True)
mention_df

## 5. Extract Rating Table

In [None]:
rating_df = airline_df[['name','rating']].copy()
rating_df['airline_id'] = rating_df['name'].apply(lambda x : airline_info_df.loc[x == rating_df['name']].index[0])
rating_df.drop('name',axis=1,inplace=True)
all_sub_dfs = []
for _,row in rating_df.iterrows():
    sub_df = pd.DataFrame([{'airline_id': row['airline_id'], 'rate_name': rate_name, 'count': count} for rate_name, count in row['rating'].items()])
    all_sub_dfs.append(sub_df)

final_rating_df = pd.concat(all_sub_dfs, ignore_index=True)


final_rating_df

## 6. Extract Attribute Table

In [None]:
attribute_df = airline_df[['name','attributes']].copy()
attribute_df['airline_id'] = attribute_df['name'].apply(lambda x : airline_info_df.loc[x == attribute_df['name']].index[0])
attribute_df.drop('name',axis=1,inplace=True)
all_sub_attr_dfs = []
for _,row in attribute_df.iterrows():
    sub_df = pd.DataFrame([{'airline_id': row['airline_id'], 'attribute_name': attribute_name, 'rating': rate} for attribute_name, rate in row['attributes'].items()])
    all_sub_attr_dfs.append(sub_df)

final_attribute_df = pd.concat(all_sub_attr_dfs, ignore_index=True)

final_attribute_df['rating'] = final_attribute_df['rating'].apply(lambda x: float(re.search(r"(\d\.\d) of",x).group(1)))
final_attribute_df

# Save all clean DataFrame

In [None]:
final_attribute_df.to_csv(f"{CLEAN_PATH}/attribute.csv")
final_rating_df.to_csv(f"{CLEAN_PATH}/rating.csv")
mention_df.to_csv(f"{CLEAN_PATH}/mention.csv")
airline_info_df.to_csv(f"{CLEAN_PATH}/info.csv")

# Insert into Database

In [None]:
server = "server"
database = "db"
username = "user"
password = "password"
driver = 17
mode = 'append'

In [None]:
def insert_into_sql_server(df, driver, server, database, username, password, mode, table_name) : 
    conn_str = f"mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+{driver}+for+SQL+Server"
    engine = create_engine(conn_str)
    df.to_sql(name=table_name, con=engine, schema='dbo', if_exists=mode, index=False)

In [None]:
# insert_into_sql_server(final_attribute_df, driver, server, database, username, password, mode, 'attribute')
# insert_into_sql_server(final_rating_df, driver, server, database, username, password, mode, 'rating')
# insert_into_sql_server(mention_df, driver, server, database, username, password, mode, 'mention')
# insert_into_sql_server(airline_info_df, driver, server, database, username, password, mode, 'info')

# Preprocess Review Data

In [177]:
vj_review_path = "vna_all_reviews_data.csv"
vna_review_path = "vj_all_reviews_data.csv"
bam_review_path = "bamboo_all_reviews_data.csv"

## Read all sub review DataFrames and Combine all DataFrames

In [207]:
vj_review_df = pd.read_csv(f"{RAW_PATH}/{vj_review_path}")
vj_review_df.head(2)

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,5.0 of 5 bubbles,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
1,1.0 of 5 bubbles,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,


In [208]:
vna_review_df = pd.read_csv(f"{RAW_PATH}/{vna_review_path}")
vna_review_df.head(2)

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,1.0 of 5 bubbles,Never fly with this airline again,They weight your carry on at very last minute ...,Date of travel: March 2025,
1,1.0 of 5 bubbles,Worse flight experience,"Worse experience, they made my last memory of ...",Date of travel: March 2025,"[{'Service Rating': '2.0 of 5 bubbles', 'Servi..."


In [209]:
bam_review_df = pd.read_csv(f"{RAW_PATH}/{bam_review_path}")
bam_review_df.head(2)

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,1.0 of 5 bubbles,Don't do it,"Never again, 1st and last. \r\nFlight was book...",Date of travel: March 2025,"[{'Service Rating': '2.0 of 5 bubbles', 'Servi..."
1,1.0 of 5 bubbles,"Used to be good, now terrible.",When this airline first started it was a very ...,Date of travel: March 2025,


In [210]:
combined_df = pd.concat([vj_review_df, vna_review_df, bam_review_df], ignore_index=True)
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,5.0 of 5 bubbles,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
1,1.0 of 5 bubbles,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,
2,5.0 of 5 bubbles,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,Date of travel: March 2025,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi..."
3,4.0 of 5 bubbles,A Good Airline,"The plane was clean, the seating was comfortab...",Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
4,1.0 of 5 bubbles,Not worth it.,Wort experience ever. The food was terrible on...,Date of travel: March 2025,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi..."


In [211]:
combined_df

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,5.0 of 5 bubbles,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
1,1.0 of 5 bubbles,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,
2,5.0 of 5 bubbles,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,Date of travel: March 2025,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi..."
3,4.0 of 5 bubbles,A Good Airline,"The plane was clean, the seating was comfortab...",Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
4,1.0 of 5 bubbles,Not worth it.,Wort experience ever. The food was terrible on...,Date of travel: March 2025,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi..."
...,...,...,...,...,...
9588,4.0 of 5 bubbles,Good service with the service support of Turki...,"Service providing as advertised, friendly and ...",Date of travel: June 2019,
9589,4.0 of 5 bubbles,"Smile staffs, good flight",Our family had a great flight with the airline...,Date of travel: May 2019,"[{'Service Rating': '4.0 of 5 bubbles', 'Servi..."
9590,5.0 of 5 bubbles,"On time, professional and comfortabe.",Bamboo Airways is the newest airline in Vietna...,Date of travel: June 2019,"[{'Service Rating': '4.0 of 5 bubbles', 'Servi..."
9591,4.0 of 5 bubbles,02 Flights with good service,The staffs are very helpful and friendly. But ...,Date of travel: May 2019,"[{'Service Rating': '4.0 of 5 bubbles', 'Servi..."


In [212]:
combined_df['Rating'] = combined_df['Rating'].apply(lambda x: float(re.search(r"(\d\.\d) of",x).group(1)))
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,5.0,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
1,1.0,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",Date of travel: March 2025,
2,5.0,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,Date of travel: March 2025,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi..."
3,4.0,A Good Airline,"The plane was clean, the seating was comfortab...",Date of travel: March 2025,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
4,1.0,Not worth it.,Wort experience ever. The food was terrible on...,Date of travel: March 2025,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi..."


In [213]:
combined_df.dropna(subset=['Information'],inplace=True,ignore_index=True)
combined_df['Information'] = combined_df['Information'].apply(lambda x: datetime.strptime(re.search(r"Date of travel: (.+)",x).group(1), "%B %Y"))
combined_df.head()

Unnamed: 0,Rating,Title,Full Review,Information,Service Ratings
0,5.0,Took Care of Us in Difficult Situation,I was a little worried when we had an equipmen...,2025-03-01 00:00:00,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
1,1.0,Vietnam Airlines Experience Poor,"Checkin service staff at Saigon where rude, un...",2025-03-01 00:00:00,
2,5.0,passport show no month and date of my DOB,hello friends \r\ni have my vietnam passport b...,2025-03-01 00:00:00,"[{'Service Rating': '1.0 of 5 bubbles', 'Servi..."
3,4.0,A Good Airline,"The plane was clean, the seating was comfortab...",2025-03-01 00:00:00,"[{'Service Rating': '5.0 of 5 bubbles', 'Servi..."
4,1.0,Not worth it.,Wort experience ever. The food was terrible on...,2025-03-01 00:00:00,"[{'Service Rating': '3.0 of 5 bubbles', 'Servi..."


In [None]:
def preprocess(s):
    return s

In [None]:
combined_df['Title'] = combined_df['Title'].apply(lambda x: re.sub())
combined_df.head()