In [1]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import wikipediaapi
# Shared client used for every Wikipedia lookup in this notebook; 'en' is
# passed as the first positional argument.
# NOTE(review): recent Wikipedia-API releases reportedly expect a user agent
# string as the first argument -- verify this call against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('en')

In [3]:
def get_articles(cat_main, _visited=None):
    """Recursively collect the article pages found under a category.

    Members in the main (article) namespace are collected.  Members that are
    themselves categories are descended into only when their title contains
    'painter' or 'Painter'; every other member is ignored.

    Parameters
    ----------
    cat_main : category page object
        Must expose ``title`` and ``categorymembers`` (a mapping whose values
        are member pages, each with ``namespace`` and ``title``).
    _visited : set of str, optional
        Titles of the categories already expanded during this crawl.  This is
        internal bookkeeping for the recursion; callers should leave it unset.

    Returns
    -------
    list
        Article page objects in traversal order.  An article that is listed
        in several different categories appears once per category.

    Notes
    -----
    Each category is expanded at most once per top-level call.  Without this
    guard, a subcategory that lists one of its own ancestors would make the
    recursion run forever, and a subcategory reachable through several
    parents would be crawled repeatedly.
    """
    if _visited is None:
        _visited = set()
    _visited.add(cat_main.title)

    articles_all = []

    for p in cat_main.categorymembers.values():

        if p.namespace == wikipediaapi.Namespace.CATEGORY:
            is_painter_cat = 'painter' in p.title or 'Painter' in p.title
            # Skip categories already expanded in this crawl (cycle guard).
            if is_painter_cat and p.title not in _visited:
                articles_all.extend(get_articles(p, _visited))

        elif p.namespace == wikipediaapi.Namespace.MAIN:
            articles_all.append(p)

    return articles_all

In [4]:
def get_df_for_cat_value(value_cat_lst, clean_func, valueName):
    """Build one row per article with the list of category values it belongs to.

    For every category in ``value_cat_lst`` the articles underneath it are
    collected with ``get_articles`` and tagged with ``clean_func(category)``.
    Articles reached from several categories are then merged by page title.

    Parameters
    ----------
    value_cat_lst : iterable
        Category page objects to crawl.
    clean_func : callable
        Maps a category page object to the value stored for its articles.
    valueName : str
        Name of the output column that holds the list of values.

    Returns
    -------
    pandas.DataFrame
        Columns ``'page name'``, ``'page'`` (the first page object seen for
        that title) and ``valueName`` (list of distinct values, in the order
        they were first encountered).  Rows are sorted by page name.

    Notes
    -----
    The distinct values are de-duplicated in first-seen order rather than via
    ``set``, so the content of each list is reproducible from run to run.
    """
    pages_all = []
    pages_name_all = []
    value_all = []

    for value in tqdm(value_cat_lst):

        pages_oneV = get_articles(value)
        value_clean = clean_func(value)

        pages_all.extend(pages_oneV)
        pages_name_all.extend(p.title for p in pages_oneV)
        value_all.extend([value_clean] * len(pages_oneV))

    print('Number of pages before deleting duplicates: ', len(pages_all))

    df = pd.DataFrame({'page': pages_all,
                       'page name': pages_name_all,
                       valueName: value_all})

    # dict.fromkeys keeps insertion order, giving a deterministic
    # de-duplicated list for every page.
    df = df.groupby('page name').agg({'page': 'first',
                                      valueName: lambda values: list(dict.fromkeys(values))}).reset_index()

    return df

## Nationalities

In [5]:
def clean_nationality(nat_cat):
    """Derive a short nationality label from a category page's title.

    The text after the ``'Category:'`` prefix is used; if the title has no
    such prefix the whole title is used instead of raising ``IndexError``.

    Two special cases are mapped explicitly because their titles do not
    follow the ``'<Nationality> painters'`` pattern:

    * titles containing ``'from Georgia'`` give ``'Georgian'``
    * titles containing ``'from the Principality'`` give ``'Liège'``

    Otherwise the label is everything before the first ``' painters'``.

    Parameters
    ----------
    nat_cat : object with a ``title`` attribute (str)

    Returns
    -------
    str
    """
    title = nat_cat.title

    parts = title.split('Category:')
    # Keep the original "text after the prefix" rule, but tolerate titles
    # that do not carry the prefix at all.
    nat = parts[1] if len(parts) > 1 else title

    if 'from Georgia' in nat:
        return 'Georgian'
    if 'from the Principality' in nat:
        return 'Liège'
    return nat.split(' painters')[0]

In [6]:
# Members of the "Painters by nationality" category, in the order returned
# by the API, with the first 8 entries dropped.
# NOTE(review): the offset 8 is hard-coded; presumably it skips entries that
# are not per-nationality categories -- confirm against the live category.
nationalities_cat_lst = list(
    wiki_wiki.page("Category:Painters_by_nationality").categorymembers.values()
)[8:]

In [7]:
# Crawl every nationality category (network-bound) and merge the results
# into one row per page with its list of nationalities.
df_nationality = get_df_for_cat_value(
    nationalities_cat_lst,
    clean_nationality,
    'nationality',
)

100%|████████████████████████████████████████████████████████████████████████████████| 170/170 [05:19<00:00,  1.88s/it]


Number of pages before deleting duplicates:  131442


In [8]:
# Preview the merged table: one row per page name with its list of nationalities.
df_nationality

Unnamed: 0,page name,page,nationality
0,'Abd al-Hayy,"'Abd al-Hayy (id: ??, ns: 0)","[Persian, Iranian]"
1,'the other' Jan van Kessel,"'the other' Jan van Kessel (id: ??, ns: 0)",[Flemish]
2,108 (artist),"108 (artist) (id: ??, ns: 0)",[Italian]
3,3Steps,"3Steps (id: ??, ns: 0)",[German]
4,A. A. Raiba,"A. A. Raiba (id: ??, ns: 0)",[Indian]
...,...,...,...
37045,Živorad Nastasijević,"Živorad Nastasijević (id: ??, ns: 0)",[Serbian]
37046,Ștefan Câlția,"Ștefan Câlția (id: ??, ns: 0)",[Romanian]
37047,Ștefan Dimitrescu,"Ștefan Dimitrescu (id: ??, ns: 0)",[Romanian]
37048,Ștefan Luchian,"Ștefan Luchian (id: ??, ns: 0)",[Romanian]


## Dates and Other Info

In [9]:
def get_dates(page):
    """Read birth/death years, URL and category titles from a page object.

    Every category title containing 'births' (resp. 'deaths') is scanned for
    its first run of four digits; when found, that run becomes the birth
    (resp. death) year.  If several titles qualify, the last one wins.

    Parameters
    ----------
    page : object exposing ``categories`` (a mapping keyed by category title)
        and ``fullurl``.

    Returns
    -------
    list
        ``[birth_year, death_year, page.fullurl, category_titles]`` where the
        years are 4-character strings or ``None`` and ``category_titles`` is
        a list of all category titles of the page.
    """
    category_titles = page.categories.keys()
    found = {'births': None, 'deaths': None}

    for title in category_titles:
        for keyword in ('births', 'deaths'):
            if keyword not in title:
                continue
            hit = re.search(r'\d{4}', title)
            if hit is not None:
                found[keyword] = hit[0]

    return [found['births'], found['deaths'], page.fullurl, list(category_titles)]

In [20]:

    # Per-page results, kept as four parallel lists (one entry per fetched page).
    births, deaths, urls, categories_all = [], [], [], []

    # NOTE(review): only rows from position 21719 onward are fetched; the
    # hard-coded offset looks like a resume point from an interrupted run --
    # confirm, because the lists then cover only part of df_nationality.
    pages_to_fetch = df_nationality[21719:]['page']

    # A plain loop (rather than a comprehension) keeps the partial results
    # available if a request fails midway.
    for page in tqdm(pages_to_fetch):
        birth_year, death_year, page_url, page_cats = get_dates(page)

        births.append(birth_year)
        deaths.append(death_year)
        urls.append(page_url)
        categories_all.append(page_cats)
        
        

100%|██████████████████████████████████████████████████████████████████████████| 15331/15331 [1:41:27<00:00,  2.52it/s]


In [21]:
# Sanity check: number of pages for which dates were collected.
len(births)

15331

## Other Info

## Save

In [11]:
def clean_title(title):
    """Return the title with any trailing ' (...)' qualifier removed.

    Everything from the first occurrence of ``' ('`` onward is dropped; a
    title without that marker is returned unchanged.
    """
    head, _sep, _rest = title.partition(' (')
    return head

In [22]:
columns = ['page', 'page name', 'painter name', 'url', 'birth', 'death', 'nationality', 'categories']

# The date/URL lists are filled by a loop that starts from a row offset of
# df_nationality, so they may cover only its last rows.  Assigning them
# against the full frame fails with a length mismatch; select the matching
# tail explicitly instead.
n_fetched = len(urls)
if not (len(births) == len(deaths) == len(categories_all) == n_fetched):
    raise ValueError('births, deaths, urls and categories_all must have the same length')
if n_fetched > len(df_nationality):
    raise ValueError('more fetched records than rows in df_nationality')

# Explicit start position: a plain iloc[-n:] would return the whole frame for n == 0.
nationality_rows = df_nationality.iloc[len(df_nationality) - n_fetched:]

# Series entries supply the row index; the plain lists are laid out in the
# same row order.
df = pd.DataFrame(
    {
        'page': nationality_rows['page'],
        'page name': nationality_rows['page name'],
        'painter name': nationality_rows['page name'].apply(clean_title),
        'url': urls,
        'birth': births,
        'death': deaths,
        'nationality': nationality_rows['nationality'],
        'categories': categories_all,
    },
    columns=columns,
)

df

Unnamed: 0,page,page name,painter name,url,birth,death,nationality,categories
21719,"Katharina Rapp (id: 1628554, ns: 0)",Katharina Rapp,Katharina Rapp,https://en.wikipedia.org/wiki/Katharina_Rapp,1948,,[German],"[Category:1948 births, Category:20th-century G..."
21720,"Katharine Carl (id: 8544717, ns: 0)",Katharine Carl,Katharine Carl,https://en.wikipedia.org/wiki/Katharine_Carl,1865,1938,[American],"[Category:1865 births, Category:1938 deaths, C..."
21721,"Katharine Church (id: 56389959, ns: 0)",Katharine Church,Katharine Church,https://en.wikipedia.org/wiki/Katharine_Church,1910,1999,[British],"[Category:1910 births, Category:1999 deaths, C..."
21722,"Katharine Emma Maltwood (id: 10726642, ns: 0)",Katharine Emma Maltwood,Katharine Emma Maltwood,https://en.wikipedia.org/wiki/Katharine_Emma_M...,1878,1961,[British],"[Category:1878 births, Category:1961 deaths, C..."
21723,"Katharine Lane Weems (id: 11172863, ns: 0)",Katharine Lane Weems,Katharine Lane Weems,https://en.wikipedia.org/wiki/Katharine_Lane_W...,1899,1989,[American],"[Category:1899 births, Category:1989 deaths, C..."
...,...,...,...,...,...,...,...,...
37045,"Živorad Nastasijević (id: 62254857, ns: 0)",Živorad Nastasijević,Živorad Nastasijević,https://en.wikipedia.org/wiki/%C5%BDivorad_Nas...,1893,1966,[Serbian],"[Category:1893 births, Category:1966 deaths, C..."
37046,"Ștefan Câlția (id: 13409891, ns: 0)",Ștefan Câlția,Ștefan Câlția,https://en.wikipedia.org/wiki/%C8%98tefan_C%C3...,1942,,[Romanian],"[Category:1942 births, Category:All stub artic..."
37047,"Ștefan Dimitrescu (id: 6000470, ns: 0)",Ștefan Dimitrescu,Ștefan Dimitrescu,https://en.wikipedia.org/wiki/%C8%98tefan_Dimi...,1886,1933,[Romanian],"[Category:1886 births, Category:1933 deaths, C..."
37048,"Ștefan Luchian (id: 2475912, ns: 0)",Ștefan Luchian,Ștefan Luchian,https://en.wikipedia.org/wiki/%C8%98tefan_Luchian,1868,1917,[Romanian],"[Category:1868 births, Category:1917 deaths, C..."


In [23]:
# Write the assembled table to a CSV file in the current working directory.
# NOTE(review): the 'page' column holds page objects and the 'nationality' /
# 'categories' columns hold Python lists; in a CSV these are presumably stored
# as their text form only -- confirm that whatever reloads the file parses them back.
df.to_csv('Data_GoogleArts_V1.csv')

## Load Test