# Data Cleaning

# Imports

In [1]:
# Python specific
import numpy as np
import pandas as pd
import plotly.express as px
# Python interface to R
import rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
from functools import partial
from rpy2.ipython import html

# Notebook settings

In [2]:
%load_ext rpy2.ipython
html.html_rdataframe=partial(html.html_rdataframe, table_class="docutils")
rpy2.ipython.html.init_printing()
pandas2ri.activate()

# R Packages

In [3]:
# Load the R packages that the %%R cells further down rely on: those cells call
# vroom() and the tidyverse verbs (group_by, summarise, mutate, ...) unqualified.
# The value returned by the last call is what this cell displays.
importr('utils')
importr('vroom')
importr('tidyverse')

rpy2.robjects.packages.Package as a <module 'tidyverse'>

# Load Data

In [4]:
%%R
# Read the three raw CSV files with vroom, progress bar switched off:
#   df.temp      - monthly land temperature per city
#   df.carb      - yearly CO2 emissions per entity (country)
#   df.atm.carb  - monthly atmospheric CO2 readings
df.temp     <- vroom("Data/GlobalLandTemperaturesByCity.csv", progress = FALSE)
df.carb     <- vroom("Data/co2_emission.csv", progress = FALSE)
df.atm.carb <- vroom("Data/atm_co.csv", progress = FALSE)

# The 4th emissions column arrives with a long header containing spaces and a
# non-ASCII character; give it a plain name that is easy to reference later.
colnames(df.carb)[4] <- "CO2_emissions"

R[write to console]: [1mRows:[22m 8,599,212
[1mColumns:[22m 7
[1mDelimiter:[22m ","
[31mchr[39m  [4]: City, Country, Latitude, Longitude
[32mdbl[39m  [2]: AverageTemperature, AverageTemperatureUncertainty
[34mdate[39m [1]: dt

[90mUse `spec()` to retrieve the guessed column specification[39m
[90mPass a specification to the `col_types` argument to quiet this message[39m

R[write to console]: [1mRows:[22m 20,853
[1mColumns:[22m 4
[1mDelimiter:[22m ","
[31mchr[39m [2]: Entity, Code
[32mdbl[39m [2]: Year, Annual CO₂ emissions (tonnes )

[90mUse `spec()` to retrieve the guessed column specification[39m
[90mPass a specification to the `col_types` argument to quiet this message[39m

R[write to console]: [1mRows:[22m 720
[1mColumns:[22m 7
[1mDelimiter:[22m ","
[32mdbl[39m [7]: Year, Month, Decimal Date, Carbon Dioxide (ppm), Seasonally Adjusted CO2 (ppm),...

[90mUse `spec()` to retrieve the guessed column specification[39m
[90mPass a specification to t

In [5]:
# Preview the first six rows of the raw city temperature table.
# NOTE(review): `dt` is rendered as a number below although vroom reports it as a
# date column -- presumably days since 1970-01-01 after conversion to pandas; confirm.
%R head(df.temp)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
1,-82606.0,6.068,1.737,Århus,Denmark,57.05N,10.33E
2,-82576.0,,,Århus,Denmark,57.05N,10.33E
3,-82545.0,,,Århus,Denmark,57.05N,10.33E
4,-82514.0,,,Århus,Denmark,57.05N,10.33E
5,-82485.0,,,Århus,Denmark,57.05N,10.33E
6,-82454.0,5.788,3.624,Århus,Denmark,57.05N,10.33E


In [6]:
# Preview the first six rows of the emissions table (4th column already renamed to CO2_emissions).
%R head(df.carb)

Unnamed: 0,Entity,Code,Year,CO2_emissions
1,Afghanistan,AFG,1949.0,14656.0
2,Afghanistan,AFG,1950.0,84272.0
3,Afghanistan,AFG,1951.0,91600.0
4,Afghanistan,AFG,1952.0,91600.0
5,Afghanistan,AFG,1953.0,106256.0
6,Afghanistan,AFG,1954.0,106256.0


In [7]:
# Preview the first six rows of the monthly atmospheric CO2 table; note that
# some months have no `Carbon Dioxide (ppm)` reading.
%R head(df.atm.carb)

Unnamed: 0,Year,Month,Decimal Date,Carbon Dioxide (ppm),Seasonally Adjusted CO2 (ppm),Carbon Dioxide Fit (ppm),Seasonally Adjusted CO2 Fit (ppm)
1,1958.0,1.0,1958.0411,,,,
2,1958.0,2.0,1958.126,,,,
3,1958.0,3.0,1958.2027,315.69,314.42,316.18,314.89
4,1958.0,4.0,1958.2877,317.45,315.15,317.3,314.98
5,1958.0,5.0,1958.3699,317.5,314.73,317.83,315.06
6,1958.0,6.0,1958.4548,,,317.22,315.14


# Cleaning

In [8]:
%%R
# Collapse the monthly atmospheric CO2 readings to one row per year.
# atm_CO2 is the SUM of that year's non-missing monthly concentrations (ppm);
# it is a concentration aggregate, not an emissions figure.
# NOTE(review): a sum of ppm readings scales with how many months were observed
# (the preview of df.atm.carb shows missing months in 1958) -- confirm whether a
# yearly mean was intended before interpreting this column.
#
# Plain `%>%` with an explicit assignment is used instead of `%<>%`: the
# assignment pipe is only available when magrittr itself is attached, and no
# visible cell of this notebook does that.
df.atm.carb <- df.atm.carb %>%
    group_by(Year) %>%
    summarise(atm_CO2 = sum(`Carbon Dioxide (ppm)`, na.rm = TRUE), .groups = "drop")

In [13]:
%%R
# Reduce the monthly city temperatures to one row per (year, city):
#   1. keep only city-years that have a reading for every calendar month,
#   2. keep the December reading of those city-years,
#   3. convert "57.05N" / "10.33E" style coordinates to signed decimal degrees.
#
# Plain `%>%` with an explicit assignment is used instead of `%<>%`, which is
# only available when magrittr itself is attached.
df.temp <- df.temp %>%
  mutate(
    date  = as.Date(dt, "%Y-%m-%d"),
    year  = format(date, "%Y"),
    # Zero-padded month number ("01".."12") rather than "%b" abbreviations:
    # month names depend on the session locale, which would make the December
    # filter below silently match nothing outside an English locale.
    month = as.factor(format(date, "%m"))
  ) %>%
  filter(!is.na(AverageTemperature)) %>%       # drop months without a reading
  group_by(year, City) %>%
  arrange(City) %>%
  complete(month) %>%                          # add an NA row for every month level missing from a city-year
  group_by(year, City) %>%                     # re-assert the grouping after complete()
  filter(!any(is.na(AverageTemperature))) %>%  # keep only city-years with all months present
  ungroup() %>%
  filter(month == "12") %>%                    # keep December only
  # NOTE(review): the completeness check above groups by (year, City) only, so
  # same-named cities in different countries are checked together -- confirm
  # this is acceptable.
  group_by(year, City, Country, Latitude, Longitude) %>%
  # Only December rows remain at this point, so AvgTemp_year is the mean
  # December temperature of that year, not a 12-month average. The column name
  # is kept because later cells refer to it.
  summarise(AvgTemp_year = mean(AverageTemperature), .groups = "drop") %>%
  mutate(
    # South latitudes and West longitudes become negative.
    sign_long = case_when(str_detect(Longitude, "W") ~ -1, TRUE ~ 1),
    sign_lat  = case_when(str_detect(Latitude, "S") ~ -1, TRUE ~ 1),
    # Strip the hemisphere letter, then apply the sign to the numeric degrees.
    lat  = sign_lat * as.numeric(str_remove(Latitude, "[A-Z]")),
    long = sign_long * as.numeric(str_remove(Longitude, "[A-Z]"))
  ) %>%
  select(c(year, City, Country, AvgTemp_year, lat, long))



In [14]:
%%R
# Combine the city temperatures with yearly atmospheric CO2 and with national
# CO2 emissions. Each step is a left join followed by na.omit(), so rows with
# no match in the joined table (or any other missing value) are dropped.

# `year` in df.temp is a character column (built with format()), so the join
# keys of the other two tables are converted to character as well.
df.atm.carb <- df.atm.carb %>%
    mutate(Year = as.character(Year))

# Drop the country-code column by name. The previous positional `[, -2]`
# removed one more column each time the cell was re-run; any_of() makes the
# step a no-op once `Code` is already gone.
df.carb <- df.carb %>%
    select(-any_of("Code")) %>%
    mutate(Year = as.character(Year))

# Attach the yearly atmospheric CO2 value to every city-year.
df <- left_join(df.temp, df.atm.carb, by = c("year" = "Year"))
df <- na.omit(df)

# Attach the national emissions of the city's country for the same year.
df <- left_join(df, df.carb, by = c("Country" = "Entity", "year" = "Year"))
df <- na.omit(df)
head(df)

  year      City Country AvgTemp_year   lat  long atm_CO2 CO2_emissions
1 1958  A Coruña   Spain        9.563 42.59 -8.73 2522.64      55894320
2 1958    Aachen Germany        3.467 50.63  6.34 2522.64     756473104
3 1958   Aalborg Denmark        1.396 57.05 10.33 2522.64      26794832
4 1958       Aba Nigeria       26.777  5.63  8.07 2522.64       3722624
5 1958    Abadan    Iran       14.991 29.74 48.00 2522.64      25230304
6 1958 Abakaliki Nigeria       26.777  5.63  8.07 2522.64       3722624


In [15]:
# Pull the dataframe into python
# `%Rpull df` copies the R variable `df` into the Python namespace; the next
# line then reads the same R object again through `r[...]`, so `df` ends up
# bound to that second result.
# NOTE(review): the two lines look redundant -- confirm both conversions yield
# the same pandas DataFrame before removing either one.
%Rpull df
df = r['df']

In [16]:
# Attach a continent to every row using plotly's bundled gapminder table.
gp = px.data.gapminder()

# gapminder holds one row per country per survey year; reduce it to a single
# country -> continent row so the join below cannot multiply the rows of `df`.
gp = gp[["country", "continent"]].drop_duplicates()

df = (
    df.drop(columns="continent", errors="ignore")  # lets the cell be re-run without a column-overlap error
    .join(gp.set_index("country"), how="left", on="Country")
    .drop_duplicates()
    # Countries whose name has no gapminder match get continent = NaN and are
    # removed here, together with any other row containing a missing value.
    .dropna()
)
df.head()

Unnamed: 0,year,City,Country,AvgTemp_year,lat,long,atm_CO2,CO2_emissions,continent
1,1958,A Coruña,Spain,9.563,42.59,-8.73,2522.64,55894320.0,Europe
2,1958,Aachen,Germany,3.467,50.63,6.34,2522.64,756473104.0,Europe
3,1958,Aalborg,Denmark,1.396,57.05,10.33,2522.64,26794832.0,Europe
4,1958,Aba,Nigeria,26.777,5.63,8.07,2522.64,3722624.0,Africa
5,1958,Abadan,Iran,14.991,29.74,48.0,2522.64,25230304.0,Asia


In [17]:
# Write the cleaned table to disk as CSV, leaving out the DataFrame index.
df.to_csv("Data/cleaned_data.csv", index=False)