# Cleansing Nominal Data into Numeric

In [1]:
import numpy as np
import pandas as pd
import math
from functools import reduce

In [10]:
def nominal_clean(df_raw,
                  columns,
                  names_dict=None,
                  drop_indices=None,
                  save_path=None):
    '''
    Turn a table of nominal "Yes"/other answers into a per-country ban score.

    The score of a row is the number of cells equal to "Yes" among
    ``columns``. The input frame is not modified.

    Parameters
    ----------
    df_raw: pandas.DataFrame
        the raw dataframe input; must contain a 'COUNTRY' or 'Country'
        column plus every column named in ``columns``

    columns: list
        list of columns to calculate the ban score; every entry is counted
        (a column that never holds "Yes" simply contributes 0)

    names_dict: dict
        dictionary of old_name:new_name applied to the columns of the
        returned frame ('index', 'Country', 'Ban Score')

    drop_indices: list
        list of row indices (labels of ``df_raw``'s index) to be deleted

    save_path: str
        path to store the final df as CSV (written without the index)

    Returns
    -------
    pandas.DataFrame
            Processed dataframe with columns 'index', 'Country' and
            'Ban Score' (after any ``names_dict`` renaming)

    Raises
    ------
    KeyError
        if a requested column, the country column, or a label in
        ``drop_indices`` is missing
    '''
    # rename() returns a new frame, so the caller's data is never mutated.
    df = df_raw.rename(columns={'COUNTRY': 'Country'})

    # Vectorised count of "Yes" answers per row over all requested columns.
    df['Ban Score'] = df_raw[list(columns)].eq("Yes").sum(axis=1)
    df = df[['Country', 'Ban Score']]

    if drop_indices is not None:
        df = df.drop(index=drop_indices)
    df = df.dropna()

    # First renumber the surviving rows 0..n-1, then expose that numbering
    # as a regular 'index' column in the result.
    df = df.reset_index(drop=True).reset_index()

    if names_dict is not None:
        df = df.rename(columns=names_dict)

    if save_path is not None:
        df.to_csv(save_path, index=False)

    return df

### Clean Bans On Direct Advertising
The final score is the number of "Yes" answers across the direct-advertising ban categories, so a higher score means more bans on tobacco advertising are in place.

In [3]:
bans_dir = pd.read_csv('data/raw/Bans-on-direct-advertising.csv')

In [None]:
# Columns whose "Yes" answers count towards the direct-advertising ban score.
cols_selected = [
    'NATIONAL TV AND RADIO',
    'INTERNATIONAL TV AND RADIO',
    'LOCAL MAGAZINES AND NEWSPAPERS',
    'INTERNATIONAL MAGAZINES AND NEWSPAPERS',
    'BILLBOARD AND OUTDOOR ADVERTISING',
    'POINT OF SALE',
    'INTERNET',
    'FINES FOR VIOLATIONS OF BANS ON DIRECT ADVERTISING',
]

# `columns` is a required positional argument of nominal_clean; calling it
# with only the dataframe raises TypeError.
nominal_clean(bans_dir, cols_selected)

In [4]:
bans_dir.head()

Unnamed: 0,COUNTRY,NATIONAL TV AND RADIO,INTERNATIONAL TV AND RADIO,LOCAL MAGAZINES AND NEWSPAPERS,INTERNATIONAL MAGAZINES AND NEWSPAPERS,BILLBOARD AND OUTDOOR ADVERTISING,POINT OF SALE,INTERNET,FINES FOR VIOLATIONS OF BANS ON DIRECT ADVERTISING,OVERALL COMPLIANCE OF BAN ON DIRECT ADVERTISING *
0,Afghanistan,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,8
1,Albania,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,10
2,Algeria,Yes,Yes,Yes,Yes,Yes,No,No,No,5
3,Andorra,No,No,No,No,No,No,No,No,—
4,Angola,No,No,No,No,No,No,No,No,—


In [5]:
bans_dir.drop(['OVERALL COMPLIANCE OF BAN ON DIRECT ADVERTISING *'], axis=1, inplace=True)

In [6]:
# Columns whose "Yes" answers count towards the direct-advertising ban score.
cols_selected = [
    'NATIONAL TV AND RADIO',
    'INTERNATIONAL TV AND RADIO',
    'LOCAL MAGAZINES AND NEWSPAPERS',
    'INTERNATIONAL MAGAZINES AND NEWSPAPERS',
    'BILLBOARD AND OUTDOOR ADVERTISING',
    'POINT OF SALE',
    'INTERNET',
    'FINES FOR VIOLATIONS OF BANS ON DIRECT ADVERTISING',
]
# Ban score = number of "Yes" cells per row across the selected columns.
bans_dir['Ban_Score'] = (bans_dir[cols_selected] == "Yes").sum(axis=1)

In [7]:
cols = bans_dir.columns[[0,-1]]

In [8]:
bans_dir = bans_dir[cols]

In [None]:
# NOTE(review): neither `df` nor `names_dict` is assigned at notebook level in
# the cells shown here (both appear only inside nominal_clean), so running this
# cell as-is would raise NameError. It looks like a scratch line left over from
# drafting nominal_clean -- confirm before relying on it.
df.rename(columns=names_dict)

In [9]:
bans_dir.head()

Unnamed: 0,COUNTRY,Ban_Score
0,Afghanistan,8
1,Albania,8
2,Algeria,5
3,Andorra,0
4,Angola,0


In [51]:
bans_dir.to_csv('data/clean/Bans-on-direct-advertising.csv')

### Clean Bans On Indirect Advertising
The final score is the number of "Yes" answers across the indirect-advertising (promotion and sponsorship) ban categories, so a higher score means more bans are in place.

In [52]:
bans_indir = pd.read_csv('data/raw/Bans-on-indirect-advertising.csv')

In [53]:
bans_indir.head()

Unnamed: 0,COUNTRY,FREE DISTRIBUTION IN MAIL OR THROUGH OTHER MEANS,PROMOTIONAL DISCOUNTS,NON-TOBACCO PRODUCTS IDENTIFIED WITH TOBACCO BRAND NAMES,BRAND NAME OF NON-TOBACCO PRODUCTS USED FOR TOBACCO PRODUCT,TOBACCO BRANDS (PRODUCT PLACEMENT),TOBACCO PRODUCTS,"PRESCRIBED ANTI-TOBACCO ADS REQUIRED FOR ANY VISUAL ENTERTAINMENT MEDIA PRODUCT THAT DEPICTS TOBACCO PRODUCTS, USE OR IMAGES",COMPLETE BAN ON SPONSORSHIP,"ANY FORM OF CONTRIBUTION (FINANCIAL OR OTHER SUPPORT) TO ANY EVENT, ACTIVITY OR INDIVIDUAL","BAN ON THE PUBLICITY OF FINANCIAL OR OTHER SPONSORSHIP OR SUPPORT BY THE TOBACCO INDUSTRY OF EVENTS, ACTIVITIES, INDIVIDUALS",FINES FOR VIOLATIONS OF BANS ON PROMOTION AND SPONSORSHIP,OVERALL COMPLIANCE OF BANS ON PROMOTION AND SPONSORSHIP *
0,Afghanistan,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes,–,Yes,3
1,Albania,Yes,Yes,Yes,Yes,Yes,Yes,–,No,No,Yes,Yes,8
2,Algeria,No,No,No,No,No,No,No,No,No,No,–,—
3,Andorra,No,No,No,No,No,No,No,No,No,No,–,—
4,Angola,No,No,No,No,No,No,No,No,No,No,–,—


In [54]:
bans_indir.drop(['OVERALL COMPLIANCE OF BANS ON PROMOTION AND SPONSORSHIP *'], axis=1, inplace=True)

In [55]:
cols_selected = bans_indir.columns[1:]

In [56]:
cols_selected

Index(['FREE DISTRIBUTION IN MAIL OR THROUGH OTHER MEANS',
       'PROMOTIONAL DISCOUNTS',
       'NON-TOBACCO PRODUCTS IDENTIFIED WITH TOBACCO BRAND NAMES',
       'BRAND NAME OF NON-TOBACCO PRODUCTS USED FOR TOBACCO PRODUCT',
       'TOBACCO BRANDS (PRODUCT PLACEMENT)', 'TOBACCO PRODUCTS',
       'PRESCRIBED ANTI-TOBACCO ADS REQUIRED FOR ANY VISUAL ENTERTAINMENT MEDIA PRODUCT THAT DEPICTS TOBACCO PRODUCTS, USE OR IMAGES',
       'COMPLETE BAN ON SPONSORSHIP',
       'ANY FORM OF CONTRIBUTION (FINANCIAL OR OTHER SUPPORT) TO ANY EVENT, ACTIVITY OR INDIVIDUAL',
       'BAN ON THE PUBLICITY OF FINANCIAL OR OTHER SPONSORSHIP OR SUPPORT BY THE TOBACCO INDUSTRY OF EVENTS, ACTIVITIES, INDIVIDUALS',
       'FINES FOR VIOLATIONS OF BANS ON PROMOTION AND SPONSORSHIP'],
      dtype='object')

In [57]:
# Ban score = number of "Yes" cells per row across the selected columns.
bans_indir['Ban_Score'] = (bans_indir[cols_selected] == "Yes").sum(axis=1)

In [58]:
cols = bans_indir.columns[[0,-1]]

In [59]:
bans_dir = bans_indir[cols]

In [60]:
bans_dir.to_csv('data/clean/Bans-on-indirect-advertising.csv')


### Additional Bans on Indirect Advertising

In [61]:
bans_add = pd.read_csv('data/raw/Additional-bans-on-indirect-advertising.csv')

In [62]:
bans_add.head()

Unnamed: 0,COUNTRY,CSR ACTIVITIES,TOBACCO COMPANIES/TOBACCO INDUSTRY PUBLICIZING THEIR CSR ACTIVITIES,ENTITIES OTHER THAN TOBACCO COMPANIES/TOBACCO INDUSTRY PUBLICIZING THE CSR ACTIVITIES OF THE TOBACCO COMPANIES,TOBACCO COMPANIES FUNDING OR MAKING CONTRIBUTIONS TO SMOKING PREVENTION MEDIA CAMPAIGNS,LAW EXPLICITELY BANS TOBACCO PRODUCTS DISPLAY AT POINT OF SALE,LAW BANS TOBACCO VENDING MACHINES,LAW BANS INTERNET SALES OF TOBACCO PRODUCTS,"SUBNATIONAL BANS ON ADVERTISING, PROMOTION AND SPONSORSHIP EXIST§"
0,Afghanistan,No,No,No,No,No,No,No,—
1,Albania,No,Yes,Yes,No,No,Yes,No,—
2,Algeria,No,No,No,No,No,No,No,—
3,Andorra,No,No,No,No,No,No,No,—
4,Angola,No,No,No,No,No,No,No,—


In [63]:
bans_add.drop(['SUBNATIONAL BANS ON ADVERTISING, PROMOTION AND SPONSORSHIP EXIST§'], axis=1, inplace=True)

In [64]:
cols_selected = bans_add.columns[1:]

In [65]:
# Ban score = number of "Yes" cells per row across the selected columns.
bans_add['Ban_Score'] = (bans_add[cols_selected] == "Yes").sum(axis=1)

In [66]:
# Keep the first column and the freshly computed Ban_Score of bans_add.
# Selecting from bans_indir here would re-export the indirect-advertising
# scores under the "Additional bans" file name.
cols = bans_add.columns[[0, -1]]
# The export cell that follows writes `bans_dir`, so the result keeps that name.
bans_dir = bans_add[cols]

In [67]:
bans_dir.to_csv('data/clean/Additional-bans-on-indirect-advertising.csv')

### Health Warning on Cigarette Packages

In [68]:
health_warn = pd.read_csv('data/raw/Characteristics-of-health-warnings-on-cigarette-packages.csv')

In [69]:
health_warn.head()

Unnamed: 0,Country,HEALTH WARNINGS ARE MANDATED,AVERAGE OF FRONT AND REAR\n%,FRONT\n%,REAR\n%,DOES THE LAW MANDATE SPECIFIC HEALTH WARNINGS ON PACKAGES?,HOW MANY HEALTH WARNINGS ARE APPROVED BY THE LAW?,DO HEALTH WARNINGS APPEAR ON EACH PACKAGE AND ANY OUTSIDE PACKAGING AND LABELLING USED IN THE RETAIL SALE?,DO HEALTH WARNINGS DESCRIBE THE HARMFUL EFFECTS OF TOBACCO USE ON HEALTH?,"DOES THE LAW MANDATE FONT STYLE, FONT SIZE AND COLOUR OF HEALTH WARNINGS?",ARE THE HEALTH WARNINGS ROTATING?,ARE THE HEALTH WARNINGS WRITTEN IN THE PRINCIPAL LANGUAGE(S) OF THE COUNTRY?,DO THE HEALTH WARNINGS INCLUDE A PHOTOGRAPH OR GRAPHIC?
0,Afghanistan,Yes,—,—,—,No,—,Yes,No,No,No,No,No
1,Albania,Yes,50,50,50,Yes,16,Yes,Yes,Yes,Yes,Yes,No
2,Algeria,Yes,15,15,15,Yes,6,Yes,Yes,No,Yes,Yes,No
3,Andorra,No,—,—,—,—,—,—,—,—,—,—,—
4,Angola,No,—,—,—,—,—,—,—,—,—,—,—


In [70]:
mandate_index = health_warn['HEALTH WARNINGS ARE MANDATED'].eq('Yes')

0       True
1       True
2       True
3      False
4      False
       ...  
190     True
191     True
192     True
193     True
194     True

In [71]:
# For rows where warnings are mandated, the score is the average front/rear
# coverage; all other rows are left missing by index alignment.
health_warn['warn_score'] = health_warn.loc[mandate_index, 'AVERAGE OF FRONT AND REAR\n%']

In [72]:
no_mandate = health_warn['HEALTH WARNINGS ARE MANDATED'].eq('No')
no_mandate

0      False
1      False
2      False
3       True
4       True
       ...  
190    False
191    False
192    False
193    False
194    False

In [73]:
health_warn.loc[no_mandate, 'warn_score'] = 0

In [74]:
health_warn.head()

Unnamed: 0,Country,HEALTH WARNINGS ARE MANDATED,AVERAGE OF FRONT AND REAR\n%,FRONT\n%,REAR\n%,DOES THE LAW MANDATE SPECIFIC HEALTH WARNINGS ON PACKAGES?,HOW MANY HEALTH WARNINGS ARE APPROVED BY THE LAW?,DO HEALTH WARNINGS APPEAR ON EACH PACKAGE AND ANY OUTSIDE PACKAGING AND LABELLING USED IN THE RETAIL SALE?,DO HEALTH WARNINGS DESCRIBE THE HARMFUL EFFECTS OF TOBACCO USE ON HEALTH?,"DOES THE LAW MANDATE FONT STYLE, FONT SIZE AND COLOUR OF HEALTH WARNINGS?",ARE THE HEALTH WARNINGS ROTATING?,ARE THE HEALTH WARNINGS WRITTEN IN THE PRINCIPAL LANGUAGE(S) OF THE COUNTRY?,DO THE HEALTH WARNINGS INCLUDE A PHOTOGRAPH OR GRAPHIC?,warn_score
0,Afghanistan,Yes,—,—,—,No,—,Yes,No,No,No,No,No,—
1,Albania,Yes,50,50,50,Yes,16,Yes,Yes,Yes,Yes,Yes,No,50
2,Algeria,Yes,15,15,15,Yes,6,Yes,Yes,No,Yes,Yes,No,15
3,Andorra,No,—,—,—,—,—,—,—,—,—,—,—,0
4,Angola,No,—,—,—,—,—,—,—,—,—,—,—,0


In [75]:
drop_index = health_warn[health_warn['warn_score'] == '—'].index

In [76]:
health_warn.drop(drop_index, inplace=True)

In [77]:
health_warn.head()

Unnamed: 0,Country,HEALTH WARNINGS ARE MANDATED,AVERAGE OF FRONT AND REAR\n%,FRONT\n%,REAR\n%,DOES THE LAW MANDATE SPECIFIC HEALTH WARNINGS ON PACKAGES?,HOW MANY HEALTH WARNINGS ARE APPROVED BY THE LAW?,DO HEALTH WARNINGS APPEAR ON EACH PACKAGE AND ANY OUTSIDE PACKAGING AND LABELLING USED IN THE RETAIL SALE?,DO HEALTH WARNINGS DESCRIBE THE HARMFUL EFFECTS OF TOBACCO USE ON HEALTH?,"DOES THE LAW MANDATE FONT STYLE, FONT SIZE AND COLOUR OF HEALTH WARNINGS?",ARE THE HEALTH WARNINGS ROTATING?,ARE THE HEALTH WARNINGS WRITTEN IN THE PRINCIPAL LANGUAGE(S) OF THE COUNTRY?,DO THE HEALTH WARNINGS INCLUDE A PHOTOGRAPH OR GRAPHIC?,warn_score
1,Albania,Yes,50,50,50,Yes,16,Yes,Yes,Yes,Yes,Yes,No,50
2,Algeria,Yes,15,15,15,Yes,6,Yes,Yes,No,Yes,Yes,No,15
3,Andorra,No,—,—,—,—,—,—,—,—,—,—,—,0
4,Angola,No,—,—,—,—,—,—,—,—,—,—,—,0
5,Antigua and Barbuda,Yes,50,50,50,No,—,Yes,No,Yes,Yes,Yes,No,50


In [78]:
# Export the cleaned health-warning frame. `bans_dir` still holds
# advertising-ban scores from an earlier section at this point, so writing it
# here would put the wrong data into this file.
# NOTE(review): unlike the other sections, every column is written, not just
# the country and score columns -- confirm that is the intended output.
health_warn.to_csv('data/clean/Characteristics-of-health-warnings-on-cigarette-packages.csv')

### Legislation of smoking in public places

`Re` stands for restricted permission to smoke.

In [91]:
leg_pl = pd.read_csv('data/raw/Public-places-with-smoke-free-legislation.csv')

In [92]:
leg_pl.head()

Unnamed: 0,COUNTRY,HEALTH-CARE FACILITIES,EDUCATIONAL FACILITIES EXCEPT UNIVERSITIES,UNIVERSITIES,GOVERNMENT FACILITIES,INDOOR OFFICES,RESTAURANTS,PUBS AND BARS,PUBLIC TRANSPORT,ALL OTHER INDOOR PUBLIC PLACES
0,Afghanistan,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No
1,Albania,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes
2,Algeria,Yes,Yes,Yes,No,No,No,Yes,No,—
3,Andorra,Yes,Yes,Yes,Yes,Re,Re,Re,Yes,—
4,Angola,Yes,Yes,Yes,Yes,No,No,No,Yes,—


In [93]:
# This column is dropped because most of its entries are blank
leg_pl.drop(['ALL OTHER INDOOR PUBLIC PLACES'], axis=1, inplace=True)

In [94]:
cols_selected = leg_pl.columns[1:]

In [95]:
# Ban score = number of "Yes" cells per row; any other value (including "Re")
# does not count.
leg_pl['Ban_Score'] = (leg_pl[cols_selected] == "Yes").sum(axis=1)

In [96]:
cols = leg_pl.columns[[0,-1]]
leg_pl = leg_pl[cols]

In [97]:
# Export the smoke-free legislation scores. `bans_dir` belongs to the
# advertising sections; `leg_pl` is the frame reduced to its first column and
# Ban_Score in the previous cell.
leg_pl.to_csv('data/clean/Public-places-with-smoke-free-legislation.csv')