In [7]:
import os

from airtable import Airtable
import pandas as pd
import numpy as np
import seaborn as sns

from airtable_forms.common import config

# Make one dataframe with all CSV files

In [8]:
os.listdir('dictionaries-2lines')

['Brazil_2018_dictionary.csv',
 'EU_2019_dictionary.csv',
 'UK_2019_dictionary.csv',
 'UK_2017_dictionary.csv',
 'Germany_2017_dictionary.csv',
 'Mexico_2018_dictionary.csv',
 'Sweden_2018_dictionary.csv',
 'US_Midterm_2018_dictionary.csv']

In [14]:
def read_dict(fn):
    df = pd.read_csv('dictionaries-2lines/'+fn, keep_default_na=False, na_values=['_'])
    df['dictionary'] = fn.replace('_dictionary.csv','').replace('_', ' ')
    return df

dfs = [ read_dict(x) for x in os.listdir('dictionaries-2lines') ]
df = pd.concat(dfs)

# Quick cleanup

In [15]:
df[df['base_url'].str.contains(r'[^-a-z0-9.]')]

Unnamed: 0,base_url,code,dictionary


In [16]:
df = df[~df['base_url'].str.contains(r'[^-a-z0-9.]')]

In [17]:
df[pd.isna(df['code'])]

Unnamed: 0,base_url,code,dictionary


# Some descriptive stats

In [18]:
len(df)

8

In [19]:
df['dictionary'].value_counts()

Sweden 2018        1
Germany 2017       1
Brazil 2018        1
EU 2019            1
Mexico 2018        1
US Midterm 2018    1
UK 2019            1
UK 2017            1
Name: dictionary, dtype: int64

In [20]:
df['code'].value_counts()

JN     2
PN     1
PP     1
CC     1
L      1
PB     1
PN1    1
Name: code, dtype: int64

In [21]:
vc = df['base_url'].value_counts()
vc[vc>5]

Series([], Name: base_url, dtype: int64)

In [22]:
pd.DataFrame(vc.values)[0].value_counts()

1    8
Name: 0, dtype: int64

In [23]:
len(df['base_url'].unique())

8

# Import dictionaries (called "Projects" for now)

In [24]:
airtable = Airtable(config["base-key"], "Projects", api_key=config["api-key"])

In [25]:
dicts = df['dictionary'].unique()
dicts

array(['Brazil 2018', 'EU 2019', 'UK 2019', 'UK 2017', 'Germany 2017',
       'Mexico 2018', 'Sweden 2018', 'US Midterm 2018'], dtype=object)

In [26]:
for d in dicts:
    airtable.insert({'Name':d})

# Import categories

In [27]:
airtable = Airtable(config["base-key"], "Categories", api_key=config["api-key"])

In [28]:
df.groupby('code')['dictionary'].unique()

code
CC                       [Mexico 2018]
JN     [Germany 2017, US Midterm 2018]
L                            [UK 2017]
PB                       [Brazil 2018]
PN                       [Sweden 2018]
PN1                          [UK 2019]
PP                           [EU 2019]
Name: dictionary, dtype: object

In [29]:
for code, dicts in df.groupby('code')['dictionary'].unique().items():
    airtable.insert({'Code': code, 'Project': list(dicts)}, typecast=True)

# Import sources (base_urls)

In [30]:
airtable = Airtable(config["base-key"], "Sources", api_key=config["api-key"])

In [31]:
df['base_url'].unique()

array(['abuladomercado.com.br', '0vinz.wordpress.com', '20minutes.fr',
       '1203pl.puheenvuoro.uusisuomi.fi', '04091965.de',
       '10pormexiconatural.mx', '101kgb.iheart.com',
       '100percentfedup.com'], dtype=object)

In [32]:
for source in df['base_url'].unique():
    airtable.insert({'Base URL': source})

# Insert coding decisions

In [33]:
airtable = Airtable(config["base-key"], "Coding decisions", api_key=config["api-key"])

In [34]:
for idx, row in df.iterrows():
    airtable.insert(
        {'Source': row['base_url'], 'Category': row['code'], 'Project': row['dictionary']},
        typecast=True
    )