## IMPORTS

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## SCRAPING TABLES

In [2]:
URL = 'https://en.wikipedia.org/wiki/List_of_airline_codes'

# Download the article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops right here on an HTTP error
# instead of letting an error page be parsed as if it were the article.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
# Collect every <table> carrying the "wikitable" class; only the first one is used.
tables = soup.find_all("table", attrs={"class": "wikitable"})
if not tables:
    raise ValueError("No table with class 'wikitable' was found in the downloaded page")

# Flatten the table to one line of HTML and remove, in this order, the newlines,
# the closing row/cell tags and every "n/a" placeholder, so that the remaining
# opening tags can serve as plain-text delimiters.
table = str(tables[0])
for token in ("\n", "</tr>", "</td>", "n/a"):
    table = table.replace(token, '')

# One list per <tr> chunk, one string per <td> chunk. The first two chunks are
# skipped -- presumably the text before the first row and the header row
# (TODO confirm against the live page).
data = [row.split('<td>') for row in table.split("<tr>")][2:]

In [4]:
# Positions (within each split row) of the cells to keep, mapped to the column
# names used from here on.
column_names = {2: "ICAO", 3: "Name", 5: "Country", 6: "Comments"}

# Select those four positions in order and label them in a single expression.
df = pd.DataFrame(data)[list(column_names)].rename(columns=column_names)

In [5]:
# Preview the scraped frame; pandas abbreviates the display to the first and last rows.
df

Unnamed: 0,ICAO,Name,Country,Comments
0,BOI,"<a href=""/wiki/2GO_(cargo_airline)"" title=""2GO...",Philippines,
1,EVY,"<a href=""/wiki/No._34_Squadron_RAAF"" title=""No...",,
2,GNL,"<a class=""new"" href=""/w/index.php?title=135_Ai...",United States,
3,,Amadeus IT Group S.A.,Global,GDS and airline hosting system (CRS/PSS)
4,,Sabre travel network Asia-Pacific (ex-Abacus),APAC,Regional distribution
...,...,...,...,...
6291,CYN,"<a href=""/wiki/Zhongyuan_Airlines"" title=""Zhon...",China,ICAO Code and callsign no longer allocated
6292,WZP,"<a href=""/wiki/Zip_(airline)"" title=""Zip (airl...",Canada,ICAO Code and callsign no longer allocated
6293,TZT,"<a class=""mw-redirect"" href=""/wiki/ZIPAIR_Toky...",Japan,"Subsidiary of <a href=""/wiki/Japan_Airlines"" t..."
6294,OOM,"<a href=""/wiki/Zoom_Airlines"" title=""Zoom Airl...",Canada,"defunct, ICAO Code and callsign no longer allo..."


## Drop all airlines that no longer exist

In [6]:
# Keep one row per airline name, discard rows with a missing value in any
# column, then keep only the three columns needed downstream.
df1 = df.drop_duplicates(subset=["Name"]).dropna()
df1 = df1[["ICAO", "Country", "Comments"]]

# Case-sensitive substrings that mark a comment as describing an inactive airline.
# NOTE(review): "no" / "No" match anywhere in the comment, including inside
# longer words -- confirm that this breadth is intended.
inactive_markers = [
    "Ceased", "ceased",
    "defunct", "Defunct",
    "no", "No",
    "merged", "Merged",
    "DEFUNCT",
]
is_inactive = df1["Comments"].str.contains("|".join(inactive_markers))

df1 = (
    df1[~is_inactive]
    .dropna(subset=["ICAO", "Country"])
    .sort_values("ICAO")
    .reset_index(drop=True)
)

# NOTE(review): removes the first 98 rows of the ICAO-sorted frame by label --
# presumably the rows without a usable ICAO code. The count is tied to one
# particular scrape; confirm before re-running against fresh data.
df1 = df1.drop(list(range(98)))

# A given (ICAO, Country) pair is kept only once.
df1 = df1.drop_duplicates(["ICAO", "Country"])
df1

Unnamed: 0,ICAO,Country,Comments
98,AAC,United Kingdom,
99,AAD,United Kingdom,t/a Ambassador
100,AAF,France,"Former name: <a class=""mw-redirect"" href=""/wik..."
101,AAG,United Kingdom,"Former name: <a class=""mw-redirect"" href=""/wik..."
102,AAH,United States,
...,...,...,...
5573,ZAW,Bangladesh,
5574,ZBA,Kenya,
5575,ZMA,Zambia,
5576,ZZM,Ivory Coast,


## Fix hyperlink issues

In [7]:
# Renumber the rows 0..n-1 so that labels and positions coincide from here on.
df1 = df1.reset_index(drop=True)

# List the rows whose Country cell still contains an opening anchor tag.
country_has_link = df1["Country"].str.contains("<a")
df1.loc[country_has_link]

Unnamed: 0,ICAO,Country,Comments
1576,FEG,"<a href=""/wiki/Egypt"" title=""Egypt"">Egypt</a>",
2886,MAY,"<a href=""/wiki/Malta"" title=""Malta"">Malta</a>",2019.0
3767,"PVV<sup class=""reference"" id=""cite_ref-24""><a ...","<a href=""/wiki/Moldova"" title=""Moldova"">Moldov...",
4904,TWB,"<a class=""mw-redirect"" href=""/wiki/Republic_of...",


In [8]:
# Plain country names for the rows whose Country cell still holds raw anchor
# markup, keyed by the ICAO code of the affected row.
country_fixes = {
    "PVV": "Moldova",
    "FEG": "Egypt",
    "MAY": "Malta",
    "TWB": "Republic of Korea",
}

# Rows are located by content (leftover "<a" markup plus ICAO prefix) rather
# than by a hard-coded position, and written with one .loc call so the value
# lands in df1 itself instead of in a temporary Series. A prefix match is used
# because the PVV code is still followed by footnote markup at this point.
country_has_markup = df1["Country"].str.contains("<a")
for icao_code, country in country_fixes.items():
    df1.loc[country_has_markup & df1["ICAO"].str.startswith(icao_code), "Country"] = country

In [9]:
# List the rows whose ICAO cell still contains markup (a "<" character).
icao_has_markup = df1["ICAO"].str.contains("<")
df1.loc[icao_has_markup]

Unnamed: 0,ICAO,Country,Comments
3767,"PVV<sup class=""reference"" id=""cite_ref-24""><a ...",Moldova,


In [10]:
# Strip the footnote markup that follows the PVV code. The row is found by its
# content instead of a hard-coded position, and written with a single .loc call
# so the change lands in df1 itself rather than in a temporary Series.
df1.loc[df1["ICAO"].str.startswith("PVV<"), "ICAO"] = "PVV"

## FIX COUNTRY NAMES

In [11]:
def correctWiki(df, nameWiki, nameCountry):
    """Overwrite every ``Country`` value that matches ``nameWiki``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Country`` column. It is modified in place.
    nameWiki : str
        Pattern searched for inside each ``Country`` value. The search is
        case-insensitive and, as with ``Series.str.contains`` by default, the
        pattern is interpreted as a regular expression.
    nameCountry : str
        Replacement written to every matching row.

    Returns
    -------
    None
    """
    # na=False makes rows with a missing Country count as "no match" instead of
    # putting NaN into the boolean mask.
    matches = df["Country"].str.contains(nameWiki, case=False, na=False)
    # One label-based assignment: it works for any index (not only 0..n-1) and
    # writes into ``df`` itself rather than into a temporary Series.
    df.loc[matches, "Country"] = nameCountry
        

In [12]:
# (pattern, replacement) pairs, applied in this order. Each pattern is matched
# case-insensitively anywhere inside the Country value, so one short pattern
# collapses every spelling variant that contains it.
country_renames = [
    ("congo", "Congo"),
    ("turks", "Turks And Caicos Islands"),
    ("São", "Sao Tome And Principe"),
    ("Republic of Korea", "South Korea"),
    ("Lao", "Laos"),
    ("Trinidad", "Trinidad And Tobago"),
    ("Somali", "Somalia"),
    ("gambia", "Gambia"),
    ("Syria", "Syria"),
    ("Russia", "Russia"),
    ("Myanmar", "Myanmar (Burma)"),
    ("ivory", "Cote D'ivoire (Ivory Coast)"),
    ("ivoire", "Cote D'ivoire (Ivory Coast)"),
    ("Czech", "Czechia"),
    # NOTE(review): spelled "Molodva" -- presumably targets a misspelling in the
    # source table; confirm.
    ("Molodva", "Moldova"),
    ("Canada", "Canada"),
    ("Hong", "Hong Kong"),
    ("Netherlands", "Netherlands"),
    ("Burma", "Myanmar (Burma)"),
    ("Burkino Faso", "Burkina Faso"),
    ("Bosnia and Herzegovina", "Bosnia And Herzegovina"),
    ("Antigua and Barbuda", "Antigua and Barbuda"),
    ("Macedonia", "North Macedonia"),
]

# The corrections are applied to df1, the cleaned frame that is written to
# disk at the end of the notebook; applying them to the raw frame `df` would
# leave the saved file unchanged.
for wiki_pattern, country_name in country_renames:
    correctWiki(df1, wiki_pattern, country_name)

# NOTE(review): this removes label 5053 from the raw frame `df`, not from df1.
# Which row it is meant to target cannot be told from this notebook -- confirm
# whether the statement is still needed.
df = df.drop([5053])

# SAVE

In [13]:
# Persist the cleaned ICAO / Country / Comments frame in Parquet format,
# relative to the current working directory.
output_path = "AirlineCountries.parquet"
df1.to_parquet(output_path)