In [14]:
from bs4 import BeautifulSoup
import requests
import xml.etree.ElementTree as ET
import pymongo
import pandas as pd
from sqlalchemy import create_engine

In [114]:
client = pymongo.MongoClient('192.168.56.30', 27017)
db = client.climate
collection = db.air_pollution_collection_1965_2019

with open('air_pollution_1965_2019.xml', 'rb') as xml_file:
    xml_data = xml_file.read()

# Insert into MongoDB
document = {'air_pollution_1965_2019': xml_data}
collection.insert_one(document)

InsertOneResult(ObjectId('656f6d47b322727d609aae99'), acknowledged=True)

In [115]:
document = collection.find_one()
xml_data = document['air_pollution_1965_2019']
root = ET.fromstring(xml_data)

In [111]:
record_elements = root.findall('.//record')
first_record = record_elements[0] if record_elements else None

# Extract unique column names from the first record
headings = [field.attrib['name'] for field in first_record.findall('.//field')] if first_record else []

data = []

for record in record_elements:
    row_data = [field.text for field in record.findall('.//field')]
    data.append(row_data)

air_poll_df_1965_2019 = pd.DataFrame(data, columns=headings)
air_poll_df_1965_2019


Unnamed: 0,Country or Area,Item,Year,Value
0,Aruba,"PM2.5 air pollution, mean annual exposure (mic...",1960,
1,Aruba,"PM2.5 air pollution, mean annual exposure (mic...",1961,
2,Aruba,"PM2.5 air pollution, mean annual exposure (mic...",1962,
3,Aruba,"PM2.5 air pollution, mean annual exposure (mic...",1963,
4,Aruba,"PM2.5 air pollution, mean annual exposure (mic...",1964,
...,...,...,...,...
16753,Zimbabwe,"PM2.5 air pollution, mean annual exposure (mic...",2018,22.08555546
16754,Zimbabwe,"PM2.5 air pollution, mean annual exposure (mic...",2019,20.83469969
16755,Zimbabwe,"PM2.5 air pollution, mean annual exposure (mic...",2020,
16756,Zimbabwe,"PM2.5 air pollution, mean annual exposure (mic...",2021,


In [118]:
air_poll_df_1965_2019.drop(['Item'], axis=1, inplace=True)
air_poll_df_1965_2019['Value'] = pd.to_numeric(air_poll_df_1965_2019['Value'])
air_poll_df_1965_2019['Value'] = air_poll_df_1965_2019['Value'].apply(lambda x: round(x, 2) if not pd.isna(x) else None)
air_poll_df_1965_2019


Unnamed: 0,Country or Area,Year,Value
0,Aruba,1960,
1,Aruba,1961,
2,Aruba,1962,
3,Aruba,1963,
4,Aruba,1964,
...,...,...,...
16753,Zimbabwe,2018,22.09
16754,Zimbabwe,2019,20.83
16755,Zimbabwe,2020,
16756,Zimbabwe,2021,


In [119]:
air_poll_df_1965_2019.dtypes

Country or Area     object
Year                object
Value              float64
dtype: object

In [120]:
air_poll_df_1965_2019['Year'] = air_poll_df_1965_2019['Year'].astype(int)
air_poll_2010_2019 = air_poll_df_1965_2019[(air_poll_df_1965_2019['Year'] >= 2010) & (air_poll_df_1965_2019['Year'] <= 2019)]
air_poll_2010_2019


Unnamed: 0,Country or Area,Year,Value
50,Aruba,2010,
51,Aruba,2011,
52,Aruba,2012,
53,Aruba,2013,
54,Aruba,2014,
...,...,...,...
16750,Zimbabwe,2015,25.93
16751,Zimbabwe,2016,25.33
16752,Zimbabwe,2017,22.58
16753,Zimbabwe,2018,22.09


In [121]:
print(air_poll_2010_2019.columns)
# air_poll_2010_2019.reset_index(inplace=True)
# air_poll_2010_2019.drop(['index'], axis=1, inplace=True)
# air_poll_2010_2019


Index(['Country or Area', 'Year', 'Value'], dtype='object')


In [122]:

air_poll_10_19 = air_poll_2010_2019.pivot(index='Country or Area', columns='Year', values='Value')
air_poll_10_19 = air_poll_10_19.reset_index()
air_poll_10_19.columns.name = None
air_poll_10_19

Unnamed: 0,Country or Area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,51.82,56.25,54.70,58.79,61.87,60.60,57.20,53.36,52.80,52.42
1,Africa Eastern and Southern,31.97,33.21,33.83,33.12,31.78,33.50,32.47,31.46,31.37,30.88
2,Africa Western and Central,58.31,59.96,61.99,54.89,54.86,68.84,64.27,62.76,63.02,63.36
3,Albania,21.63,23.41,21.56,20.05,19.94,19.54,17.82,19.02,18.82,18.64
4,Algeria,31.94,31.47,32.49,30.35,31.88,33.44,32.79,32.55,32.66,32.83
...,...,...,...,...,...,...,...,...,...,...,...
261,West Bank and Gaza,36.06,33.87,32.78,35.79,34.50,35.32,31.47,32.15,31.89,31.30
262,World,42.97,44.67,46.33,47.42,47.65,46.47,45.70,42.84,43.08,42.81
263,"Yemen, Rep.",44.69,48.89,54.41,51.40,43.70,48.33,44.95,46.44,45.14,44.47
264,Zambia,27.03,28.11,28.36,27.29,27.35,28.26,27.41,26.18,26.35,25.93


In [None]:
# Webscrape table from wikipedia for values for 2020 - 2022 to add on

In [123]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_air_pollution'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
air_pollution_table = tables[0]
root = ET.Element("table_data")

In [124]:
headings = air_pollution_table.find_all('th')
headings_element = ET.SubElement(root, "headings")
for heading in headings:
    heading_text = heading.get_text(strip=True)
    heading_element = ET.SubElement(headings_element, "heading")
    heading_element.text = heading_text

In [125]:
rows = air_pollution_table.find_all('tr')[1:]  # Skip headings
rows_element = ET.SubElement(root, "rows")

for row in rows:
    row_element = ET.SubElement(rows_element, "row")
    cells = row.find_all(['td', 'th'])
    for cell in cells:
        cell_text = cell.get_text(strip=True)
        cell_element = ET.SubElement(row_element, "cell")
        cell_element.text = cell_text

In [126]:
tree = ET.ElementTree(root)

with open('air_pollution.xml', "wb") as xml_f:
    tree.write(xml_f, xml_declaration=True)

In [127]:
client = pymongo.MongoClient('192.168.56.30', 27017)
db = client.climate
collection = db.air_pollution_collection

with open('air_pollution.xml', 'rb') as xml_f:
    xml_data = xml_f.read()
    document = {'air_pollution': xml_data}
    collection.insert_one(document)

In [128]:
document = collection.find_one()
xml_data = document['air_pollution']
root = ET.fromstring(xml_data)
headings = [heading.text for heading in root.findall('.//headings/heading')]

In [129]:
data = []

for row in root.findall('.//rows/row'):
    row_data = [cell.text for cell in row.findall('.//cell')]
    data.append(row_data)

air_poll_19_22 = pd.DataFrame(data, columns=headings)
air_poll_19_22

Unnamed: 0,Rank,Country/Region,2022,2021,2020,2019,2018,Population
0,1,Chad,89.7,75.9,--,--,--,17179740
1,2,Iraq,80.1,49.7,--,39.6,--,43533592
2,3,Pakistan,70.9,66.8,59,65.8,74.3,231402117
3,4,Bahrain,66.6,49.8,39.7,46.8,59.8,1463265
4,5,Bangladesh,65.8,76.9,77.1,83.3,97.1,169356251
...,...,...,...,...,...,...,...,...
126,127,"Bonaire, Saint Eustatius and Saba",3.3,5.1,--,--,--,26221
127,128,Bermuda,3,--,--,--,--,63867
128,129,U.S. Virgin Islands,2.9,4.5,3.7,3.5,--,105870
129,130,French Polynesia,2.5,--,--,--,--,304032


In [131]:
air_poll_df = air_poll_10_19.merge(air_poll_19_22[['Country/Region', '2020', '2021', '2022']], 
                                 left_on='Country or Area', right_on='Country/Region', how='inner')

air_poll_df = air_poll_df.drop(columns='Country/Region')
air_poll_df

Unnamed: 0,Country or Area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Albania,21.63,23.41,21.56,20.05,19.94,19.54,17.82,19.02,18.82,18.64,16,12.5,14.5
1,Algeria,31.94,31.47,32.49,30.35,31.88,33.44,32.79,32.55,32.66,32.83,20.2,20,17.8
2,Andorra,11.26,12.31,11.21,10.22,9.59,10.35,8.94,9.14,9.18,9.07,7.4,7.3,5.4
3,Angola,28.63,28.81,29.11,29.21,28.24,29.10,29.18,28.04,28.51,28.42,13,11,8.8
4,Argentina,13.74,14.19,13.36,14.70,13.78,14.59,14.11,13.61,13.63,13.51,14.2,8.2,7.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,United Arab Emirates,43.53,45.90,47.64,44.35,41.10,45.91,43.22,43.82,43.81,43.67,29.2,36,45.9
108,United Kingdom,12.43,13.00,11.43,11.42,10.89,10.37,10.21,10.36,10.17,10.09,8.3,8.8,8.9
109,Uruguay,9.42,9.68,9.30,10.33,9.89,9.94,9.90,9.54,9.58,9.53,--,14.2,11.3
110,Uzbekistan,32.29,41.44,39.02,39.48,37.50,35.49,35.12,35.07,34.78,34.79,29.9,42.8,33.5


In [None]:
#Upload database to postgres