# OWID Exploratory Data Analysis

CO2 and Greenhouse Gas Emissions Data by Our World in Data: https://github.com/owid/co2-data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import altair as alt
import math

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Load the OWID CO2 dataset and its codebook from CSV files
# (both paths are relative to the current working directory).
df = pd.read_csv('dataset/owid/owid-co2-data.csv')
codebook = pd.read_csv('dataset/owid/owid-co2-codebook.csv')

In [None]:
df

The codebook provides a description and the source for each variable in the dataset.

In [None]:
codebook.head()

## Dataset Overview

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# change 'year' to string object
df = df.astype({'year': str})
print(df['year'].dtypes)

## Missing value identification

In [None]:
# function to count missing value
def get_nan(df, sort=False):
    """Print how many rows contain missing values and the per-column NaN counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    sort : bool, default False
        When True, the per-column counts are printed in ascending order.

    Returns
    -------
    int or None
        0 when no row contains a missing value (the per-column listing is
        skipped in that case); otherwise None.
    """
    rows = int(df.isna().any(axis=1).sum())
    print(rows, "rows with missing values")

    # A row count can never be negative, so the guard has to test for zero:
    # there is nothing to break down when the frame is complete.
    if rows == 0:
        return 0

    counts = df.isna().sum()
    print(counts.sort_values() if sort else counts)
        

In [None]:
get_nan(df, sort=True)

In [None]:
# function to visualize the proportion of missing value
def visual_nan(df, name="", isSorted=False, savePlot=True):
    """Plot, for every column, the proportion of missing vs. present values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    name : str, default ""
        Label used in the plot title; the part before the first '.' is also
        used as the output file name.
    isSorted : bool, default False
        When True, columns are ordered by ascending number of missing values
        and '_sorted' is appended to the output file name.
    savePlot : bool, default True
        When True, the figure is written to 'plot/<name>[_sorted].png'.

    Returns
    -------
    The object returned by ``sns.displot``.
    """
    import os  # local import: only needed for the optional save step

    sort = ''
    if isSorted:
        # reorder the columns by their number of missing values
        df = df[df.isna().sum().sort_values().keys()]
        sort = '_sorted'

    # long format: one row per cell, flagged True when the cell is missing
    missing_value = df.isna().melt(value_name="missing")

    ax = sns.displot(data=missing_value,
                     y="variable",
                     hue="missing",
                     multiple="fill",
                     height=10,
                     aspect=1.5)
    plt.xlabel("Proportion of missing value")
    plt.title(f"Missing values of {name}")

    if savePlot:
        # savefig does not create missing directories, so make sure the
        # target folder exists before writing the image
        os.makedirs("plot", exist_ok=True)
        filename = name.split('.')
        plt.savefig(f"plot/{filename[0]}{sort}.png")

    return ax

In [None]:
visual_nan(df, 'owid-co2-data.csv', isSorted=True)

In [None]:
# Tabulate the count and percentage of missing values for every column,
# then keep the names of the columns that are more than 70% missing.
missing = df.isna().sum().to_frame('missing').reset_index()
missing = missing.rename(columns={'index': 'col'})
missing['ratio'] = (missing['missing'] / len(df) * 100).round(2)
missing_list = missing.loc[missing['ratio'] > 70, 'col'].tolist()
missing_list

In [None]:
# helper shared by the two functions below
def _missing_table(df):
    """Return one row per column of *df* with its NaN count and NaN percentage.

    The result has the columns 'col' (column label), 'missing' (number of
    NaN cells) and 'ratio' (percentage of rows that are NaN, rounded to two
    decimals). It is built explicitly rather than through ``reset_index`` so
    it does not depend on the name of the frame's column index.
    """
    counts = df.isna().sum()
    table = pd.DataFrame({'col': counts.index.to_numpy(),
                          'missing': counts.to_numpy()})
    table['ratio'] = (table['missing'] / len(df) * 100).round(2)
    return table


# function to display the number of missing values and missing ratio
def missing_ratio_summary(df):
    """Display the per-column missing-value table, sorted by ascending ratio."""
    display(_missing_table(df).sort_values(by=['ratio']))


# function to return a list of columns with more than x% of missing values
def get_X_missing(df, X):
    """Return the labels of the columns whose missing ratio exceeds *X* percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    X : float
        Threshold in percent; the comparison is strict (ratio > X).

    Returns
    -------
    list
        Column labels, in the frame's column order.
    """
    missing = _missing_table(df)
    return missing.loc[missing['ratio'] > X, 'col'].tolist()
    
missing_ratio_summary(df)

... something to do ..

## Outlier detection

In [None]:
# function to plot boxplot
def plot_boxplot(series, title='', xlabel=''):
    """Draw a horizontal seaborn boxplot of *series* and label it.

    Returns the object produced by ``sns.boxplot`` after its title and
    x-axis label have been set.
    """
    axes = sns.boxplot(x=series)
    axes.set(xlabel=xlabel, title=title)
    return axes

# function to plot boxplot for all numeric columns
def check_outlier(df):
    """Show a boxplot for every column reported by ``df.describe()``.

    The plots are laid out three per figure. Slots left over in the last
    figure are blanked out, and every figure (including the last, partially
    filled one) is shown.
    """
    numeric_columns = df.describe().columns
    per_row = 3

    for start in range(0, len(numeric_columns), per_row):
        plt.figure(figsize=(20, 2))

        for slot in range(per_row):
            plt.subplot(1, per_row, slot + 1)
            position = start + slot
            if position < len(numeric_columns):
                # plot the column that belongs to this slot under its own label
                column = numeric_columns[position]
                plot_boxplot(df[column], xlabel=column)
            else:
                # no column left for this slot: hide the empty axes
                plt.axis("off")

        plt.show()

        
check_outlier(df)

In [None]:
# function to plot an interactive boxplot
def check_outlier_px(df, col_name):
    """Show an interactive plotly boxplot of ``df[col_name]``.

    This is a best-effort helper: any error raised while building or showing
    the figure is reported on stdout (with its type and message) instead of
    being propagated.
    """
    try:
        fig = px.box(df, y=col_name)
        fig.show()
    except Exception as ex:
        # keep the best-effort behaviour, but say what actually failed
        print(f'Something went wrong: {type(ex).__name__}: {ex}')

In [None]:
check_outlier_px(df, 'year')

## Categorical data

In [None]:
unique_iso_codes = df.iso_code.unique()
unique_countries = df.country.unique()
unique_years = df.year.unique()

In [None]:
print(unique_iso_codes)
print(len(unique_iso_codes))

In [None]:
print(unique_countries)
print(len(unique_countries))

In [None]:
unique_years.sort()
print(unique_years)
print(len(unique_years))

In [None]:
# plt.figure(figsize=(30, 3))
# sns.countplot(x="iso_code", data=df, palette='rocket')

In [None]:
px.histogram(df, x="iso_code")

In [None]:
px.histogram(df, x="country")

In [None]:
px.histogram(df.sort_values(by=['year']), x="year")

## Top countries that produce the most Carbon Dioxide (CO2)

China, US, India

In [None]:
# function to slice out country from the master df
def get_country_data(df, country_name):
    """Return the rows of *df* whose 'country' equals *country_name*.

    The result is a new frame with a fresh 0-based index; *df* itself is
    left untouched.
    """
    is_country = df['country'] == str(country_name)
    return df.loc[is_country].reset_index(drop=True)

#### China

In [None]:
df_china_raw = get_country_data(df, 'China')
print(df_china_raw.shape)

In [None]:
visual_nan(df_china_raw, name='owid-China', isSorted=True)

In [None]:
missing_ratio_summary(df_china_raw)

### Handling Missing Values

| Percentage of Missing values | Action                                           |
| :--------------------------- | :----------------------------------------------- |
| [ 0%,  70%]                  | Multivariate imputation                          |
| (70%, 100%]                  | Drop column                                      |
  
Columns with *more than* 70% of missing values are dropped. 

Use the `get_X_missing(df, X)` function to get the list of columns to which this applies.

In [None]:
col_to_drop_china = get_X_missing(df_china_raw, 70) # columns that have more than 70% missing values
print(len(col_to_drop_china))
print(col_to_drop_china)

In [None]:
df_china = df_china_raw.drop(col_to_drop_china, axis=1)
df_china

Doing the same for the **United States** and **India**.

#### United States

In [None]:
df_us_raw = get_country_data(df, 'United States')
print(df_us_raw.shape)
visual_nan(df_us_raw, name='owid-United-States', isSorted=True)

In [None]:
col_to_drop_us = get_X_missing(df_us_raw, 70) # columns that have more than 70% missing values
print(len(col_to_drop_us))
print(f"A list of columns to drop: \n{col_to_drop_us}")

df_us = df_us_raw.drop(col_to_drop_us, axis=1)
df_us

#### India

In [None]:
df_india_raw = get_country_data(df, 'India')
print(df_india_raw.shape)
visual_nan(df_india_raw, name='owid-India', isSorted=True)

In [None]:
col_to_drop_india = get_X_missing(df_india_raw, 70) # columns that have more than 70% missing values
print(len(col_to_drop_india))
print(f"A list of columns to drop: \n{col_to_drop_india}")

df_india = df_india_raw.drop(col_to_drop_india, axis=1)
df_india

### Intersecting Columns

Finding the intersecting columns of `df_china`, `df_us`, and `df_india`.

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8d/Intersection_of_3_circles_7.svg/1073px-Intersection_of_3_circles_7.svg.png" width=20%>

In [None]:
print(df_china.shape)
col_china = df_china.columns.tolist()
print(col_china)

print(df_us.shape)
col_us = df_us.columns.tolist()
print(col_us)

print(df_india.shape)
col_india = df_india.columns.tolist()
print(col_india)

In [None]:
china_us = np.intersect1d(col_china, col_us)
china_us_india = np.intersect1d(china_us, col_india)
print(len(china_us_india))
print(china_us_india)

In [None]:
df_china[china_us_india]

In [None]:
df_us[china_us_india]

In [None]:
df_india[china_us_india]

### Multivariate Imputation

In [None]:
df_china

In [None]:
122*0.8

In [None]:
df_china_train = df_china[:100]
df_china_train

In [None]:
df_china_test = df_china[-22:]
df_china_test

In [None]:
df_china_test['year'].to_list()

In [None]:
# Fit the imputer on the training rows only, then use it to fill the gaps
# in the held-out rows.
# iloc[:, 3:] skips the first three columns — presumably the identifier
# columns (iso_code, country, year) that are re-attached to the result
# later; TODO confirm against the CSV's column order.
imp = IterativeImputer(max_iter=10, random_state=2022)
imp.fit(df_china_train.iloc[:,3:])

# NOTE(review): per the scikit-learn documentation, features that are entirely
# missing at fit time are discarded by transform, so the returned array may
# have fewer columns than the input — verify its shape before labelling it.
df_china_predicted_array = imp.transform(df_china_test.iloc[:,3:])

In [None]:
df_china_predicted_array

In [None]:
columns=df_china_test.columns.tolist()[3:]
print(len(columns))
print(columns)

In [None]:
df_china_predicted = pd.DataFrame(df_china_predicted_array, columns=columns)
df_china_predicted

In [None]:
# Re-attach the identifier columns of the rows that were actually imputed.
# The values are passed as plain lists because df_china_predicted has a fresh
# 0-based index, while df_china_test keeps the row labels of df_china;
# assigning the Series directly would align on those labels instead of on
# position.
df_china_predicted['iso_code'] = df_china_test['iso_code'].to_list()
df_china_predicted['country'] = df_china_test['country'].to_list()
df_china_predicted['year'] = df_china_test['year'].to_list() # predicted years

# reindex returns a new frame, so the result must be assigned for the
# original column order to be kept
df_china_predicted = df_china_predicted.reindex(columns=df_china_test.columns.tolist())
df_china_predicted

In [None]:
missing_ratio_summary(df_china_predicted)

No missing values ✌️

In [None]:
df_us = get_country_data(df, 'United States')
df_us

In [None]:
print(len(df_us))
df_us.isnull().any(axis = 1).sum()

In [None]:
df_india = get_country_data(df, 'India')
df_india

In [None]:
print(len(df_india))
df_india.isnull().any(axis = 1).sum()