# W3-pipelines-project

## 1. Import libraries and DataSet: "World Population by Year"

In [1]:
import pandas as pd
import numpy as np
from numpy import nan
import re
import sys


import os
from dotenv import load_dotenv

import requests

from bs4 import BeautifulSoup

In [2]:
sys.path.append('/Users/awalsh/Ironhack/PROYECTOS/W3-pipelines-project/src')
import cleaning_functions as cf

In [3]:
!pwd

/Users/awalsh/IRONHACK/PROYECTOS/W3-pipelines-project/Notebooks


In [4]:
# We import the DataSet "World Population by Year", downloaded from kaggle. 
df_population = pd.read_csv('../Data/WorldPopulation.csv') 

In [5]:
#We explore de DataSet to check its columns, NaNs and shape. 
df_population.keys()

Index(['Year', 'Population', 'ChangePerc', 'NetChange', 'Density', 'Urban',
       'UrbanPerc'],
      dtype='object')

In [6]:
df_population.head()

Unnamed: 0,Year,Population,ChangePerc,NetChange,Density,Urban,UrbanPerc
0,2020,7794798739,1.05,81330639,52,4378993944,56
1,2019,7713468100,1.08,82377060,52,4299438618,56
2,2018,7631091040,1.1,83232115,51,4219817318,55
3,2017,7547858925,1.12,83836876,51,4140188594,55
4,2016,7464022049,1.14,84224910,50,4060652683,54


In [7]:
df_population['Year']

0     2020
1     2019
2     2018
3     2017
4     2016
      ... 
65    1955
66    1954
67    1953
68    1952
69    1951
Name: Year, Length: 70, dtype: int64

In [8]:
df_population.isna().sum()

Year          0
Population    0
ChangePerc    0
NetChange     0
Density       0
Urban         0
UrbanPerc     0
dtype: int64

In [9]:
#We delete the columns we don't need for the hypothesis, and we are left with "Year" and "Population."
df_population.drop(["ChangePerc", "NetChange", "Density", "Urban",
       "UrbanPerc"], axis=1, inplace=True)

In [10]:
df_population

Unnamed: 0,Year,Population
0,2020,7794798739
1,2019,7713468100
2,2018,7631091040
3,2017,7547858925
4,2016,7464022049
...,...,...
65,1955,2773019936
66,1954,2724846741
67,1953,2677608960
68,1952,2630861562


In [11]:
# We set the index in the column "Year".
df_population = df_population.set_index("Year")
df_population.head()

Unnamed: 0_level_0,Population
Year,Unnamed: 1_level_1
2020,7794798739
2019,7713468100
2018,7631091040
2017,7547858925
2016,7464022049


## 2. Web Sraping: "Ocean Temperature" and final df

We definde functions to: 
    a) get the "Ocean Temperature by Decade" from an URL
    b) build a dictionary from the df_population
    c) calculate the average population for a decade in the df_population
    d) build a dictionary with the values: Temperature, Population and Year(in decades) for each Key, theese being the index of the DataFrame.

URL = "https://www.currentresults.com/Environment-Facts/changes-in-earth-temperature.php"
page = requests.get(URL)
results = BeautifulSoup(page.content, "html.parser")
table = results.find('table', class_='articletable tablecol-1-left revcolr')
print(table)

#function to build dictionary "Decade: ocean temperature" from web scraping.
def getAverageOceanTemperatureByDecade():
    URL = "https://www.currentresults.com/Environment-Facts/changes-in-earth-temperature.php"
    page = requests.get(URL)
    results = BeautifulSoup(page.content, "html.parser")
    table = results.find('table', class_='articletable tablecol-1-left revcolr')

    a = {}

    for row in table.tbody.find_all('tr'):
        columns = row.find_all('td')
        a[int(columns[0].text.strip()[:-1])] = columns[1].text.strip()

    return a

#function to build dictionary of population each 10 years. 
def buildDictionaryOfPopulationFrom(decadeStart, decadeEnd, df):
    d = {}
    for i in range(decadeStart, decadeEnd, 10):
        d[i] = average_on_decade(df, i)
    return d


#function to calculate the average of population in each decade.
def average_on_decade(df, decade):
    total_items = 10
    count_total = 0
    for i in range(decade, decade + 9):
        try:
            count_total += df.loc[i].Population
        except KeyError:
            total_items -= 1

    return count_total / total_items

#function to build dictionary with values of average of year (in decades), average population and temperature.  
def buildCompleteDataframe(temperatura: dict, poblacion: dict, since, till):
    dictionary = {}
    index_counter = 0

    for i in range(since, till, 10):
        
        list_of_temp_population = []
        
        list_of_temp_population.append(i)
        
        if temperatura.get(i) is not None:
            list_of_temp_population.append(temperatura.get(i))
        else:
            list_of_temp_population.append("0")
            
            
        if poblacion.get(i) is not None:
            list_of_temp_population.append(poblacion.get(i))
        else:
            list_of_temp_population.append("0")
            
        dictionary[index_counter] = list_of_temp_population
        
        index_counter+=1 

        
    return pd.DataFrame.from_dict(dictionary, orient='index')


We apply al the functions to obtain a final_df.

In [12]:

final_df = cf.buildCompleteDataframe(cf.getAverageOceanTemperatureByDecade(), cf.buildDictionaryOfPopulationFrom(1950, 2020, df_population), 1950, 2010)


In [13]:
display(final_df)

Unnamed: 0,0,1,2
0,1950,13.98,2445756000.0
1,1960,13.99,2954007000.0
2,1970,14.0,3602761000.0
3,1980,14.18,4312541000.0
4,1990,14.31,5094413000.0
5,2000,14.51,5817070000.0


We change the names of the df columns, and we are left with a df that shows the temperature of the ocean and the population in each decade. 

In [14]:
final_df.columns=['Year','Temperature','Population']

In [15]:
display(final_df)

Unnamed: 0,Year,Temperature,Population
0,1950,13.98,2445756000.0
1,1960,13.99,2954007000.0
2,1970,14.0,3602761000.0
3,1980,14.18,4312541000.0
4,1990,14.31,5094413000.0
5,2000,14.51,5817070000.0


In [16]:
#We use ".info()" to explore the df and to know the Dtype of the column "Population".
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         6 non-null      int64  
 1   Temperature  6 non-null      object 
 2   Population   6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 192.0+ bytes


In [17]:
#We use ".astype" to convert the type of "Population" to int.
final_df = final_df.astype({"Population":int})

In [18]:
display(final_df)

Unnamed: 0,Year,Temperature,Population
0,1950,13.98,2445756393
1,1960,13.99,2954006871
2,1970,14.0,3602760834
3,1980,14.18,4312540555
4,1990,14.31,5094413294
5,2000,14.51,5817069827


In [19]:
df_population.to_csv('../Data/clean_WorldPopulation.csv')

In [20]:
final_df.to_csv('../Data/final_df.csv')

# NIVELES CO2


In [21]:
df_co2 = pd.read_csv('../Data/monthly-atm-co2.csv') 

In [22]:
display(df_co2)

Unnamed: 0,Entity,Code,Day,average_co2_concentrations,trend_co2_concentrations
0,World,OWID_WRL,1980-01-15,338.55,337.93
1,World,OWID_WRL,1980-02-15,339.27,338.22
2,World,OWID_WRL,1980-03-15,339.60,338.25
3,World,OWID_WRL,1980-04-15,340.00,338.37
4,World,OWID_WRL,1980-05-15,340.43,338.90
...,...,...,...,...,...
494,World,OWID_WRL,2021-03-15,415.61,413.99
495,World,OWID_WRL,2021-04-15,415.93,414.12
496,World,OWID_WRL,2021-05-15,416.12,414.50
497,World,OWID_WRL,2021-06-15,415.34,414.77


In [23]:
df_co2.keys()

Index(['Entity', 'Code', 'Day', 'average_co2_concentrations',
       'trend_co2_concentrations'],
      dtype='object')

In [24]:
df_co2.isna().sum()

Entity                        0
Code                          0
Day                           0
average_co2_concentrations    0
trend_co2_concentrations      0
dtype: int64

In [25]:
list(df_co2['Entity'].unique())

['World']

In [26]:
list(df_co2['Day'].unique())

['1980-01-15',
 '1980-02-15',
 '1980-03-15',
 '1980-04-15',
 '1980-05-15',
 '1980-06-15',
 '1980-07-15',
 '1980-08-15',
 '1980-09-15',
 '1980-10-15',
 '1980-11-15',
 '1980-12-15',
 '1981-01-15',
 '1981-02-15',
 '1981-03-15',
 '1981-04-15',
 '1981-05-15',
 '1981-06-15',
 '1981-07-15',
 '1981-08-15',
 '1981-09-15',
 '1981-10-15',
 '1981-11-15',
 '1981-12-15',
 '1982-01-15',
 '1982-02-15',
 '1982-03-15',
 '1982-04-15',
 '1982-05-15',
 '1982-06-15',
 '1982-07-15',
 '1982-08-15',
 '1982-09-15',
 '1982-10-15',
 '1982-11-15',
 '1982-12-15',
 '1983-01-15',
 '1983-02-15',
 '1983-03-15',
 '1983-04-15',
 '1983-05-15',
 '1983-06-15',
 '1983-07-15',
 '1983-08-15',
 '1983-09-15',
 '1983-10-15',
 '1983-11-15',
 '1983-12-15',
 '1984-01-15',
 '1984-02-15',
 '1984-03-15',
 '1984-04-15',
 '1984-05-15',
 '1984-06-15',
 '1984-07-15',
 '1984-08-15',
 '1984-09-15',
 '1984-10-15',
 '1984-11-15',
 '1984-12-15',
 '1985-01-15',
 '1985-02-15',
 '1985-03-15',
 '1985-04-15',
 '1985-05-15',
 '1985-06-15',
 '1985-07-