In [232]:
import pandas as pd
import re
import os

## Import Data from Multiple .csv Files

In [233]:
# file directory to import from
data_files = os.listdir('../data_tues/agencies/')
data_files

['2019-media-agencies.csv',
 '2020-effective-agencies.csv',
 '2018-effective-agencies.csv',
 '2019-effective-agencies.csv',
 '2018-creative-agencies.csv',
 '2020-media-agencies.csv',
 '2020-creative-agencies-.csv',
 '2019-creative-agencies.csv',
 '2018-media-agencies.csv']

In [234]:
# function to import data from csv and add columns for year and category from file name
def load_files(filenames):
    regex = re.compile(r'^(\d{4})-(\w+)-agencies\.csv$')
    matches = [m for m in map(regex.match, filenames) if m is not None]

    for match in matches:
	    yield (
            pd.read_csv('../data_tues/agencies/' + match.group(0))
            .assign(year=match.group(1), ranking_category=match.group(2)) 
            .astype({'year': 'int', 'ranking_category': 'string'}) 
        )
		
agencies_data = pd.concat(load_files(data_files))
agencies_data

Unnamed: 0,Rank,Agency Name,City,Agency Type,Agency Location,Independent?,Product Category,Points,year,ranking_category
0,1,MediaCom Connections,Tel Aviv,Creative Agency,Israel,,"Household & Domestic, Toiletries & Cosmetics",201.7,2019,media
1,2,Touché!,Montreal,Creative Agency,Canada,,"Retail, Soft Drinks, Transport & Tourism",194.0,2019,media
2,3,Mindshare,Shanghai,Creative Agency,China (Mainland),,"Clothing & Accessories, Food, Retail",186.0,2019,media
3,4,UM,Sydney,Creative Agency,Australia,,"Leisure & Entertainment, Soft Drinks, Telecoms...",182.7,2019,media
4,5,MediaCom,London,Creative Agency,UK,,"Business & Industrial, Food, Non-profit, publi...",150.4,2019,media
...,...,...,...,...,...,...,...,...,...,...
1604,,,,,,,,,2018,media
1605,,,,,,,,,2018,media
1606,,,,,,,,,2018,media
1607,,,,,,,,,2018,media


## Clean up the Data

In [235]:
# remove na rows (from the bottom of the excel files)
agencies_data = agencies_data.dropna(subset=['Agency Name'])
agencies_data = agencies_data.drop(columns=['Rank'])

In [236]:
# set "Independent" column to boolean fill with true and false vals
d = {'X': True}
agencies_data["Independent?"] = agencies_data["Independent?"].replace(d)
agencies_data['Independent?'] = agencies_data["Independent?"].fillna(False)

In [237]:
#rename the columns
agencies_data = agencies_data.rename(columns={'Agency Name': "name", 'City': 'city', 'Agency Type': 'type', 'Agency Location': 'country', 'Independent?': 'independent', 'Product Category': 'product_category', 'Points': 'points'})

agencies_data

Unnamed: 0,name,city,type,country,independent,product_category,points,year,ranking_category
0,MediaCom Connections,Tel Aviv,Creative Agency,Israel,False,"Household & Domestic, Toiletries & Cosmetics",201.7,2019,media
1,Touché!,Montreal,Creative Agency,Canada,False,"Retail, Soft Drinks, Transport & Tourism",194.0,2019,media
2,Mindshare,Shanghai,Creative Agency,China (Mainland),False,"Clothing & Accessories, Food, Retail",186.0,2019,media
3,UM,Sydney,Creative Agency,Australia,False,"Leisure & Entertainment, Soft Drinks, Telecoms...",182.7,2019,media
4,MediaCom,London,Creative Agency,UK,False,"Business & Industrial, Food, Non-profit, publi...",150.4,2019,media
...,...,...,...,...,...,...,...,...,...
430,The Electric Factory,Montevideo,Creative Agency,Uruguay,True,Toiletries & Cosmetics,3.0,2018,media
431,Wunderman Phantasia,Lima,Creative Agency,Peru,False,Telecoms & Utilities,3.0,2018,media
432,Laundry Service,London,Creative Agency,UK,True,Technology & Electronics,2.9,2018,media
433,m/SIX,London,Creative Agency,UK,False,Transport & Tourism,2.9,2018,media


## Extract Data for sql Tables and set IDs

### Product Categories Table

In [238]:
# get a list of product categories
product_categories = [val.strip() for sublist in agencies_data["product_category"].dropna().str.split(",").tolist() for val in sublist]
product_categories = list(set(product_categories))
product_categories = pd.DataFrame(product_categories,columns=['name'])
product_categories.insert(0, 'id', range(1, 1 + len(product_categories)))
# product_categories.set_index('id', inplace=True)
product_categories


Unnamed: 0,id,name
0,1,Non-profit
1,2,Food
2,3,Pharma & Healthcare
3,4,Clothing & Accessories
4,5,Household & Domestic
5,6,Leisure & Entertainment
6,7,Tobacco
7,8,Soft Drinks
8,9,Transport & Tourism
9,10,Politics


### Agency Table
1. Get a unique set of agencies - this is a composite of name, city, and type. 
2. Create an "id" for each unique agency.
3. Merge the "id"'s back into the original table.
4. Remove superfluous columns

In [239]:
# get a unique list of agencies
agency = agencies_data.groupby(['name' ,'city' , 'type']).agg({'country': 'first', 'independent': 'first'}).reset_index()

#create a unique id for the composite
agency.insert(0, 'id', range(1, 1 + len(agency)))
# agency.set_index('id', inplace=True)
agency


Unnamed: 0,id,name,city,type,country,independent
0,1,!eatme,Warsaw,Digital/Specialist Agency,Poland,True
1,2,&Co,Copenhagen,Creative Agency,Denmark,True
2,3,&Rosas,Barcelona,Creative Agency,Spain,True
3,4,( anonimo ),Mexico City,Creative Agency,Mexico,True
4,5,(anónimo),Mexico City,Creative Agency,Mexico,True
...,...,...,...,...,...,...
4141,4142,zMessenger,Colombo,Creative Agency,Sri Lanka,True
4142,4143,Åkestam Holst,Stockholm,Creative Agency,Sweden,True
4143,4144,Öppet,Stockholm,Creative Agency,Sweden,True
4144,4145,Über Dijital,Istanbul,Digital/Specialist Agency,Turkey,True


In [240]:
merged_df = pd.merge(agencies_data, agency, how="left", on=["name", "city", "type"])
merged_df = merged_df.drop(columns=['country_y', 'independent_y' ])
merged_df = merged_df.rename(columns={'country_x': "country", 'independent_x': 'independent'})
id_col = merged_df['id']
merged_df.drop(labels=['id'], axis=1,inplace = True)
merged_df.insert(0, 'id', id_col)
merged_df

Unnamed: 0,id,name,city,type,country,independent,product_category,points,year,ranking_category
0,2161,MediaCom Connections,Tel Aviv,Creative Agency,Israel,False,"Household & Domestic, Toiletries & Cosmetics",201.7,2019,media
1,3559,Touché!,Montreal,Creative Agency,Canada,False,"Retail, Soft Drinks, Transport & Tourism",194.0,2019,media
2,2273,Mindshare,Shanghai,Creative Agency,China (Mainland),False,"Clothing & Accessories, Food, Retail",186.0,2019,media
3,3646,UM,Sydney,Creative Agency,Australia,False,"Leisure & Entertainment, Soft Drinks, Telecoms...",182.7,2019,media
4,2122,MediaCom,London,Creative Agency,UK,False,"Business & Industrial, Food, Non-profit, publi...",150.4,2019,media
...,...,...,...,...,...,...,...,...,...,...
6358,3482,The Electric Factory,Montevideo,Creative Agency,Uruguay,True,Toiletries & Cosmetics,3.0,2018,media
6359,3939,Wunderman Phantasia,Lima,Creative Agency,Peru,False,Telecoms & Utilities,3.0,2018,media
6360,1787,Laundry Service,London,Creative Agency,UK,True,Technology & Electronics,2.9,2018,media
6361,4112,m/SIX,London,Creative Agency,UK,False,Transport & Tourism,2.9,2018,media


### Create the Agency Table

In [241]:
# create the agency table
agency_df = merged_df.filter(["id", "name", "city", "type", "country", "independent"], axis=1)
agency_df

Unnamed: 0,id,name,city,type,country,independent
0,2161,MediaCom Connections,Tel Aviv,Creative Agency,Israel,False
1,3559,Touché!,Montreal,Creative Agency,Canada,False
2,2273,Mindshare,Shanghai,Creative Agency,China (Mainland),False
3,3646,UM,Sydney,Creative Agency,Australia,False
4,2122,MediaCom,London,Creative Agency,UK,False
...,...,...,...,...,...,...
6358,3482,The Electric Factory,Montevideo,Creative Agency,Uruguay,True
6359,3939,Wunderman Phantasia,Lima,Creative Agency,Peru,False
6360,1787,Laundry Service,London,Creative Agency,UK,True
6361,4112,m/SIX,London,Creative Agency,UK,False


### Create Agency Market Rank Table

In [242]:
# create the agency_market_rank dataframe
agency_market_rank = merged_df.filter(["ranking_category", "year", "id", "points"], axis=1)
agency_market_rank = agency_market_rank.rename(columns={'id': "agency_id"})
agency_market_rank


Unnamed: 0,ranking_category,year,agency_id,points
0,media,2019,2161,201.7
1,media,2019,3559,194.0
2,media,2019,2273,186.0
3,media,2019,3646,182.7
4,media,2019,2122,150.4
...,...,...,...,...
6358,media,2018,3482,3.0
6359,media,2018,3939,3.0
6360,media,2018,1787,2.9
6361,media,2018,4112,2.9


## Agency Product Category Table
This table contains an id for the agency and an id for the product category

1. From "merged_df" split out the product categories and save each of them as a row along with the "id" of the agency.
2. Merge these with "product_categories" df on the "name".
2. Drop, Rename and reorder columns.

In [243]:
# create a series by concatenating each split value then iterate rows to write df
agency_product_category_df = pd.concat([pd.Series(row['id'], row['product_category'].split(','))              
    for _, row in merged_df.iterrows()]).reset_index()

agency_product_category_df = agency_product_category_df.rename(columns={0:"agency_id", "index": "name"})

# strip any leading or trailing spaces
agency_product_category_df['name'] = agency_product_category_df['name'].str.strip()

agency_product_category_df
                    

Unnamed: 0,name,agency_id
0,Household & Domestic,2161
1,Toiletries & Cosmetics,2161
2,Retail,3559
3,Soft Drinks,3559
4,Transport & Tourism,3559
...,...,...
10688,Toiletries & Cosmetics,3482
10689,Telecoms & Utilities,3939
10690,Technology & Electronics,1787
10691,Transport & Tourism,4112


In [245]:
# merge and clean up
# merged_product_category = pd.merge(agency_product_category_df, product_categories, how="left", on=["name"])
merged_product_category = merged_product_category.rename(columns={'id': "product_category_id"})
merged_product_category = merged_product_category.drop(columns=['name'])

merged_product_category

Unnamed: 0,agency_id,product_category_id
0,2161,5
1,2161,19
2,3559,17
3,3559,8
4,3559,9
...,...,...
10688,3482,19
10689,3939,15
10690,1787,11
10691,4112,9


## Brand Market Rank Table
1. Import .csv files
2. Clean up and add brand id.
3. Create product_category id.

In [246]:
# file directory to import from
brand_files = os.listdir('../data_tues/brands/')
brand_files

['2018-media-brands.csv',
 '2019-effective-brands.csv',
 '2019-creative-brands.csv',
 '2018-creative-brands.csv',
 '2019-media-brands.csv',
 '2020-effective-brands.csv',
 '2018-effective-brands.csv',
 '2020-creative-brands.csv',
 '2020-media-brands.csv']

In [250]:
# function to import data from csv and add columns for year and category from file name
def load_files(filenames):
    regex = re.compile(r'^(\d{4})-(\w+)-brands\.csv$')
    matches = [m for m in map(regex.match, filenames) if m is not None]

    for match in matches:
	    yield (
            pd.read_csv('../data_tues/brands/' + match.group(0))
            .assign(year=match.group(1), ranking_category=match.group(2)) 
            .astype({'year': 'int', 'ranking_category': 'string'}) 
        )
		
brands_data = pd.concat(load_files(brand_files))
brands_data = brands_data.rename(columns={'Brand': "brand", 'Product Category': 'product_category', "Points": "points"})

brands_data = brands_data.filter(["ranking_category", "year", "brand", "product_category", "points"], axis=1)

brands_data = brands_data.dropna(subset=['brand'])


brands_data

Unnamed: 0,ranking_category,year,brand,product_category,points
0,media,2018,Nike,Clothing & Accessories,297.5
1,media,2018,McDonald's,Retail,263.7
2,media,2018,Snickers,Food,227.9
3,media,2018,Dove,Toiletries & Cosmetics,204.5
4,media,2018,Netflix,Media & Publishing,194.5
...,...,...,...,...,...
506,media,2020,Eclipse,Food,2.1
507,media,2020,BBVA,Financial Services,2.0
508,media,2020,Gasco,Business & Industrial,2.0
509,media,2020,Asociación Mutual Israelita Argentina,"Non-profit, public sector & education",2.0


In [253]:
# merge in the product category id and drop the product category column
# strip any leading or trailing spaces
brands_data['name'] = brands_data['product_category'].str.strip()

# merge tables
merged_brands_df = pd.merge(brands_data, product_categories, how="left", left_on=["product_category"], right_on=["name"])

merged_brands_df = merged_brands_df.rename(columns={'id': "product_category_id"})

merged_brands_df = merged_brands_df.filter(["ranking_category", "year", "brand", "product_category_id", "points"], axis=1)

merged_brands_df

Unnamed: 0,ranking_category,year,brand,product_category_id,points
0,media,2018,Nike,4.0,297.5
1,media,2018,McDonald's,17.0,263.7
2,media,2018,Snickers,2.0,227.9
3,media,2018,Dove,19.0,204.5
4,media,2018,Netflix,12.0,194.5
...,...,...,...,...,...
4812,media,2020,Eclipse,2.0,2.1
4813,media,2020,BBVA,14.0,2.0
4814,media,2020,Gasco,20.0,2.0
4815,media,2020,Asociación Mutual Israelita Argentina,,2.0
