In [1]:
import numpy as np
import pandas as pd
import math
from functools import reduce


Data source:
###  [WHO report on the global tobacco epidemic 2019](https://www.who.int/tobacco/global_report/en/)


**TODO:** add a link to the tobacco use data: [Tobacco Use By Country]()

**TODO:** add a link to the [Happiness]() data

In [2]:
def generic_clean(df_raw, 
                  columns, 
                  names_dict=None,
                  drop_indices=None,
                  apply_func_cols=None,
                  save_path=None):
    '''
    Select, rename, filter and convert columns of a raw table.

    The steps run in this order: column selection, renaming, dropping the
    rows listed in ``drop_indices``, dropping rows that contain missing
    values, applying the per-column functions, resetting the index and,
    optionally, writing the result to CSV. ``df_raw`` is never modified.

    Parameters
    ----------
    df_raw: pandas.DataFrame
        the raw dataframe input

    columns: list
        list of columns to keep (names as they appear in ``df_raw``)

    names_dict: dict, optional
        dictionary of old_name:new_name to rename column names

    drop_indices: list-like, optional
        row index labels (of ``df_raw``) to be deleted; a list, a numpy
        array or a pandas Index are all accepted

    apply_func_cols: list, optional
        list of (func, cols) pairs; ``func`` is applied element-wise to every
        column named in ``cols``. Because renaming happens first, ``cols``
        must use the new names from ``names_dict``.

    save_path: str, optional
        path to store the final df as CSV (written without the index)

    Returns
    -------
    pandas.DataFrame
            Processed dataframe with a fresh 0..n-1 index
    '''

    # Work on an explicit copy so that the assignments below can never write
    # into (or warn about writing into) a selection of df_raw.
    df = df_raw[columns].copy()
    if names_dict is not None:
        df = df.rename(columns=names_dict)

    # `is not None` (rather than `!= None`) keeps this test a plain boolean
    # even when drop_indices is an array or a pandas Index.
    if drop_indices is not None:
        df = df.drop(drop_indices)
    df = df.dropna()

    if apply_func_cols is not None:
        for func, cols in apply_func_cols:
            for col in cols:
                df[col] = df[col].apply(func)
    df = df.reset_index(drop=True)

    if save_path is not None:
        df.to_csv(save_path, index=False)

    return df

In [3]:
# Reusable (func, cols) pair for generic_clean: trims surrounding whitespace
# from every value of the 'Country' column.
country_str_strip = (
    lambda value: value.strip(),
    ['Country'],
)

## Tobac_use cleaning

In [4]:
def split_row(row):
    """Return the first whitespace-separated token of a string as a float.

    Values that are not strings are handed back unchanged.
    """
    if not isinstance(row, str):
        return row
    leading_token = row.split()[0]
    return float(leading_token)

In [5]:
# Load the raw tobacco-use table and assemble the arguments for generic_clean.
tobac_use_raw = pd.read_csv('data/raw/tobac_use_by_country.csv')

# Keep every column except the first one.
tobac_use_cols = tobac_use_raw.columns[1:].tolist()
tobac_use_names = {"Male": "Tobac_Use_M", "Female": "Tobac_Use_F"}

# Converters are looked up by the renamed column names.
use_func_cols = [
    (split_row, ["Tobac_Use_M", "Tobac_Use_F"]),
    country_str_strip,
]

# Index labels of all rows whose Year is not 2015; these rows get dropped.
drop_indices_use = tobac_use_raw.loc[tobac_use_raw['Year'] != 2015].index.tolist()

In [6]:
# Clean the tobacco-use table: keep the selected columns, rename Male/Female,
# drop the non-2015 rows, convert the values and save the result as CSV.
tobac_use = generic_clean(df_raw=tobac_use_raw,
             columns=tobac_use_cols,
             names_dict=tobac_use_names,
             drop_indices=drop_indices_use,
             apply_func_cols=use_func_cols,
             save_path='data/clean/tobac_use_by_country.csv')

In [7]:
# Preview the first rows of the cleaned tobacco-use table.
tobac_use.head()

Unnamed: 0,Country,Year,Tobac_Use_M,Tobac_Use_F
0,Albania,2015,51.2,7.6
1,Andorra,2015,37.2,27.8
2,Argentina,2015,29.5,18.4
3,Armenia,2015,52.3,1.5
4,Australia,2015,16.7,13.1


## Tobacco Tax

In [8]:
def extract_precent(row):
    """Convert a percentage string such as '4.48%' to the float 4.48.

    Surrounding whitespace and a trailing percent sign are removed before the
    conversion, so '4.48%', ' 4.48% ' and '4.48' all give 4.48. (Previously the
    last character was dropped unconditionally, which silently turned '12.5'
    into 12.0.) Values that are not strings are returned unchanged.

    Raises
    ------
    ValueError
        if the remaining text is not a valid number.
    """
    if isinstance(row, str):
        # Only strip an actual '%' suffix; never eat a digit.
        return float(row.strip().rstrip('%'))
    else:
        return row

In [9]:
# Load the raw tax table and assemble the arguments for generic_clean.
tax_raw = pd.read_csv('data/raw/Taxes-and-retail-price-for-a-pack-of-20-cigarette-most-sold-brand.csv')
tax_cols = ['2016', '2014', 'COUNTRY']

# Rows that hold a '. . .' or '—' placeholder in any of the kept columns.
not_avail = tax_raw.loc[(tax_raw[tax_cols] == '. . .').any(axis=1)]
no_data = tax_raw.loc[(tax_raw[tax_cols] == '—').any(axis=1)]
drop_indices_tax = [*not_avail.index, *no_data.index]

tax_col_names = {'COUNTRY': 'Country'}
# The two year columns (everything in tax_cols but the last entry) are parsed as percentages.
tax_func_cols = [
    country_str_strip,
    (extract_precent, tax_cols[:-1]),
]

In [10]:
# Clean the tax table: drop the placeholder rows, rename COUNTRY to Country,
# strip country names and parse the 2016/2014 percentage strings.
# No save_path is given, so nothing is written to disk here.
tax_data = generic_clean(df_raw=tax_raw,
                          columns=tax_cols,
                          drop_indices=drop_indices_tax,
                          apply_func_cols=tax_func_cols,
                          names_dict=tax_col_names)

In [11]:
# Tax_2015 is the mean of the 2016 and 2014 columns.
tax_data['Tax_2015'] = tax_data['2016'].add(tax_data['2014']).div(2)

# Only the country and the derived value are kept downstream.
tobac_tax = tax_data[
    ['Country', 'Tax_2015']
]

In [12]:
# Save the Country / Tax_2015 table as CSV, without the index column.
tobac_tax.to_csv("data/clean/tobac_tax_2015.csv", index=False)

In [13]:
# Preview the first rows of the tax table.
tobac_tax.head()

Unnamed: 0,Country,Tax_2015
0,Afghanistan,3.425
1,Albania,65.195
2,Algeria,36.135
3,Andorra,71.35
4,Antigua and Barbuda,15.05


## Happiness

In [14]:
# Load the 2015 happiness table and assemble the arguments for generic_clean.
happiness_raw = pd.read_csv('data/raw/happiness/world-happiness/2015.csv')
# Columns to keep from the raw table.
happiness_cols = ['Country', 'Happiness Score']
# Rename to an underscore-style column name.
happiness_names = {"Happiness Score": "Happiness_Score"}
# Only the country names need converting (whitespace stripping).
happiness_func_cols = [country_str_strip]

In [15]:
# Clean the happiness table and save it as CSV.
# No drop_indices are passed, so only rows with missing values are removed.
happiness_data = generic_clean(df_raw=happiness_raw, 
                         columns=happiness_cols, 
                         names_dict=happiness_names,
                         apply_func_cols=happiness_func_cols,
                         save_path="data/clean/happiness_2015.csv")

In [16]:
# Preview the first rows of the cleaned happiness table.
happiness_data.head()

Unnamed: 0,Country,Happiness_Score
0,Switzerland,7.587
1,Iceland,7.561
2,Denmark,7.527
3,Norway,7.522
4,Canada,7.427


## Affordability

In [17]:
# Load the raw affordability table and preview its first rows.
afford_raw = pd.read_csv('data/raw/Affordability.csv')
afford_raw.head()

Unnamed: 0,COUNTRY,2008,2010,2012,2014,2016,2018,TREND GROWTH RATE IN AFFORDABILITY,Unnamed: 8,CIGARETTES LESS AFFORDABLE SINCE 2008 +++,CIGARETTES LESS AFFORDABLE IN 2018 THAN IN 2016,Unnamed: 11
0,Afghanistan,,4.48%,4.05%,4.45%,5.54%,7.89%,7.44%,6.79%,**,Yes,No*
1,Albania,,3.55%,3.52%,3.70%,4.14%,4.48%,4.17%,2.35%,**,Yes,No*
2,Algeria,,2.19%,2.55%,1.97%,3.41%,4.88%,5.99%,10.74%,**,Yes,Yes
3,Andorra,,. . .,0.80%,1.04%,1.10%,1.04%,. . .,. . .,,. . .,. . .
4,Angola,,1.63%,. . .,3.00%,3.62%,. . .,4.66%,10.44%,**,Yes,. . .


In [18]:
# Arguments for cleaning the affordability table.
afford_cols = ['2016', '2014', 'COUNTRY']

# Rows that hold a '. . .' or '—' placeholder in any of the kept columns.
not_avail = afford_raw.loc[(afford_raw[afford_cols] == '. . .').any(axis=1)]
no_data = afford_raw.loc[(afford_raw[afford_cols] == '—').any(axis=1)]
drop_indices_afford = [*not_avail.index, *no_data.index]

afford_col_names = {'COUNTRY': 'Country'}
# The two year columns (everything in afford_cols but the last entry) are parsed as percentages.
afford_func_cols = [
    country_str_strip,
    (extract_precent, afford_cols[:-1]),
]

In [19]:
# Clean the affordability table, derive Afford_2015 as the mean of the 2016
# and 2014 columns, and keep only the country and the derived value.
afford_data = (
    generic_clean(
        df_raw=afford_raw,
        columns=afford_cols,
        names_dict=afford_col_names,
        drop_indices=drop_indices_afford,
        apply_func_cols=afford_func_cols,
    )
    .assign(Afford_2015=lambda frame: (frame['2016'] + frame['2014'])/2)
    [['Country', 'Afford_2015']]
)

In [20]:
# Save the Country / Afford_2015 table as CSV, without the index column.
afford_data.to_csv("data/clean/affordability_2015.csv", index=False)

In [21]:
# Preview the first rows of the affordability table.
afford_data.head()

Unnamed: 0,Country,Afford_2015
0,Afghanistan,4.995
1,Albania,3.92
2,Algeria,2.69
3,Andorra,1.07
4,Angola,3.31


## Nominal Data

In [22]:
# Load the policy tables. They are read from data/clean/ and are not passed
# through generic_clean in this notebook.
# NOTE(review): presumably these files were prepared elsewhere; confirm their origin.
ads_ban_dir = pd.read_csv('data/clean/Bans-on-direct-advertising.csv')
ads_ban_indir = pd.read_csv('data/clean/Bans-on-indirect-advertising.csv')
add_ads_ban_indir = pd.read_csv('data/clean/Additional-bans-on-indirect-advertising.csv')
health_warn = pd.read_csv('data/clean/Characteristics-of-health-warnings-on-cigarette-packages.csv')
smokefree_places = pd.read_csv('data/clean/Public-places-with-smoke-free-legislation.csv')

## Merging different datasets based on country

Country names are not spelled the same way across the different datasets.

- Prepare a list of the countries whose names match across all datasets.



### Unifying country names

In [23]:
# Happiness-table spelling -> spelling used by the other tables.
happiness_map_countries = {
    "Bolivia": "Bolivia (Plurinational State of)",
    "Bosnia & Herzegovina": "Bosnia and Herzegovina",
    "Brunei": "Brunei Darussalam",
    "Cape Verde": "Cabo Verde",
    "Czech Republic": "Czechia",
    "Iran": "Iran (Islamic Republic of)",
    "United Kingdom": "United Kingdom of Great Britain and Northern Ireland",
    "United States": "United States of America",
    "Vietnam": "Viet Nam",
}

# cost_map_countries = {"Democratic Republic of the Congo":"Congo",}

In [24]:
# Rewrite the happiness table's country names using happiness_map_countries.
# The replacement is called on the whole frame, not only on the Country column.
# NOTE(review): data/clean/happiness_2015.csv was written by the earlier
# generic_clean call, i.e. before this renaming; confirm that is intended.
happiness_data = happiness_data.replace(happiness_map_countries)
# tobac_cost[tobac_cost['Country']=="Democratic Republic of the Congo"]

In [25]:
# Distinct country names found in each table (evaluated in the order listed).
(use_countries,
 tax_countries,
 happiness_countries,
 afford_countries,
 ads_dir_countries,
 ads_indr_countries,
 ads_add_countries,
 health_countries,
 smokefree_countries) = (
    table['Country'].unique()
    for table in (tobac_use, tobac_tax, happiness_data, afford_data,
                  ads_ban_dir, ads_ban_indir, add_ads_ban_indir,
                  health_warn, smokefree_places)
)

# Report how many countries each table has; the two indirect-advertising
# tables are not part of the printed report.
print(*(f"Number of countries in {label} table: {len(names)}"
        for label, names in (("tobacco use", use_countries),
                             ("tobacco tax", tax_countries),
                             ("happiness", happiness_countries),
                             ("afford", afford_countries),
                             ("ads_dir", ads_dir_countries),
                             ("health warn", health_countries),
                             ("smokefree places", smokefree_countries))),
      sep="\n")

Number of countries in tobacco use table: 127
Number of countries in tobacco tax table: 182
Number of countries in happiness table: 158
Number of countries in afford table: 181
Number of countries in ads_dir table: 195
Number of countries in health warn table: 194
Number of countries in smokefree places table: 195


In [26]:

# Countries whose name is spelled identically in all nine tables.
# np.intersect1d returns the sorted, unique values common to two arrays, so
# folding it over the per-table arrays gives the overall intersection.
# Fix: ads_indr_countries used to be listed twice while ads_add_countries was
# missing, so the additional-indirect-ads table was left out of the check.
same_countries = reduce(np.intersect1d, (use_countries,
                                         tax_countries,
                                         happiness_countries,
                                         afford_countries,
                                         ads_dir_countries,
                                         ads_indr_countries,
                                         ads_add_countries,
                                         health_countries,
                                         smokefree_countries))
print(f"Number of countries with same names: {len(same_countries)}")

Number of countries with same names: 103


In [27]:
# Show the country names shared by the tables.
same_countries

array(['Albania', 'Argentina', 'Armenia', 'Australia', 'Austria',
       'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium',
       'Benin', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada', 'Chile', 'China',
       'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Czechia',
       'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia',
       'Ethiopia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana',
       'Greece', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia',
       'Iran (Islamic Republic of)', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kyrgyzstan',
       'Latvia', 'Lebanon', 'Liberia', 'Lithuania', 'Luxembourg',
       'Malaysia', 'Mali', 'Malta', 'Mauritania', 'Mauritius', 'Mexico',
       'Mongolia', 'Morocco', 'Mozambique', 'Myanmar', 'Nepal',
       'Netherlands', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan',
       'Panama', 'Paraguay', '

In [28]:
# All tables that go into the final dataset, in the order they will be merged.
df_list = [tobac_use, 
           tobac_tax, 
           happiness_data, 
           afford_data, 
           ads_ban_dir, 
           ads_ban_indir, 
           add_ads_ban_indir, 
           health_warn, 
           smokefree_places]

In [31]:
# Fold the tables into one frame, joining each next table on 'Country'.
# merge defaults to an inner join, so only countries present in every table remain.
tobacco_data = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['Country']),
    df_list,
)

In [32]:
# Preview the first rows of the merged dataset.
tobacco_data.head()

Unnamed: 0,Country,Year,Tobac_Use_M,Tobac_Use_F,Tax_2015,Happiness_Score,Afford_2015,Ban_Score_Dir_Ads,Ban_Score_Indr_Ads,Ban_Score_add_indir_ads,Warn_Score,Ban_Score_places
0,Albania,2015,51.2,7.6,65.195,4.959,3.92,8,8,3,50,8
1,Argentina,2015,29.5,18.4,75.045,6.574,1.31,7,10,5,50,8
2,Armenia,2015,52.3,1.5,34.165,4.35,3.945,5,2,0,50,3
3,Australia,2015,16.7,13.1,58.515,7.284,2.285,6,2,0,83,6
4,Austria,2015,35.5,34.8,74.835,7.2,1.225,7,8,5,65,2


In [33]:
# Save the merged dataset as CSV, without the index column.
tobacco_data.to_csv("data/tobacco_data.csv", index=False)