In [1]:
import pandas as pd
import numpy as np
import os

In [3]:
# Load the full artwork table from a pickle file under the relative 'data' directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle(os.path.join('data', 'artwork_data.pickle'))

In [4]:
# ITERATION
# Work on a small positional slice: rows 49980 up to (but not including) 50019, all columns.
# .copy() makes small_df independent of df, so assigning into small_df does not touch df.
small_df = df.iloc[49980:50019, :].copy()
small_df

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16412,Wols,[no title],Etching on paper,1937.0,1983.0,130.0,80.0,mm
16413,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,187.0,102.0,mm
16414,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,248.0,168.0,mm
16415,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,149.0,102.0,mm
16416,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,203.0,120.0,mm
16417,Wols,[no title],Etching on paper,1937.0,1983.0,130.0,79.0,mm
16418,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,124.0,98.0,mm
16419,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,140.0,89.0,mm
16420,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,140.0,86.0,mm
16421,Wols,[no title],Etching and drypoint on paper,1937.0,1983.0,83.0,79.0,mm


In [6]:
# Group the rows of small_df by the value in their 'artist' column.
# The result is a GroupBy object rather than a DataFrame; displaying it only shows its repr.
grouped = small_df.groupby('artist')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000189FCDFDB80>

In [7]:
# Confirm the class of the object returned by groupby.
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [10]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
# Print the first pair only, with a separator line between key and rows, then stop.
for name, group_df in grouped:
    print(name, '----------------------', group_df, sep='\n')
    break

Frost, Sir Terry
----------------------
                artist            title               medium    year  \
id                                                                     
4704  Frost, Sir Terry        Blue Moon  Lithograph on paper  1952.0   
4705  Frost, Sir Terry      Boat Shapes     Linocut on paper  1952.0   
4706  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954.0   
4707  Frost, Sir Terry      Boat Shapes     Linocut on paper  1954.0   
4708  Frost, Sir Terry            Leeds    Drypoint on paper  1956.0   
4709  Frost, Sir Terry  Camping, Anduze     Etching on paper  1979.0   
4710  Frost, Sir Terry     Umea, Sweden     Etching on paper  1979.0   
4711  Frost, Sir Terry    Self-Portrait     Etching on paper  1980.0   

      acquisitionYear width height units  
id                                        
4704           1983.0   355    273    mm  
4705           1983.0   132    143    mm  
4706           1983.0   131    155    mm  
4707           1983.0   

In [11]:
# Aggregate
# Mins: for each artist group, print the smallest 'acquisitionYear' found in that group.
for name, group_df in small_df.groupby('artist'): 
    # Series.min() reduces this group's column to a single value.
    min_year = group_df['acquisitionYear'].min()
    print("{}: {}".format(name, min_year))

Frost, Sir Terry: 1983.0
Phillips, Esq Tom: 1983.0
Wols: 1983.0


In [12]:
# Transform
# Equivalent of editing by hand.
# To create a case where there is no data to infer a value from, uncomment the next line:
# small_df.loc[[11838, 16441], 'medium'] = np.nan
def fill_values(series):
    """Fill the missing entries of ``series`` with its most frequent value.

    Parameters
    ----------
    series : pandas.Series
        Values to complete. Missing entries are ignored when counting.

    Returns
    -------
    pandas.Series
        ``series`` itself, unchanged, when it holds no non-missing value to
        infer from; otherwise a new Series with the gaps filled.
    """
    counts = series.value_counts()
    if not counts.empty:
        # value_counts() orders by descending frequency, so the first
        # index label is the most common value.
        return series.fillna(counts.index[0])
    # Nothing to infer from: hand the input back untouched.
    return series

In [13]:
def transform_df(source_df, group_col='artist', fill_col='medium'):
    """Return a new frame with gaps in ``fill_col`` filled group by group.

    The frame is split on ``group_col``; within each group the missing
    entries of ``fill_col`` are replaced through ``fill_values``, and the
    per-group pieces are concatenated back together.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Input frame; it is not modified.
    group_col : str, default 'artist'
        Column whose values define the groups.
    fill_col : str, default 'medium'
        Column whose missing entries are filled.

    Returns
    -------
    pandas.DataFrame
        The filled groups stacked in group iteration order, so row order can
        differ from ``source_df``. Rows whose ``group_col`` value is missing
        belong to no group under groupby's default settings and are therefore
        absent from the result. When there are no groups at all, an empty
        frame with the same columns is returned.
    """
    group_dfs = []
    for _, group_df in source_df.groupby(group_col):
        # Copy so the assignment below never writes into source_df.
        filled_df = group_df.copy()
        filled_df[fill_col] = fill_values(group_df[fill_col])
        group_dfs.append(filled_df)

    if not group_dfs:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame that keeps the original columns instead.
        return source_df.iloc[0:0].copy()
    return pd.concat(group_dfs)

In [14]:
# Now check the result: run the per-artist fill on the small sample and display it.
filled_df = transform_df(small_df)
filled_df

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4704,"Frost, Sir Terry",Blue Moon,Lithograph on paper,1952.0,1983.0,355.0,273.0,mm
4705,"Frost, Sir Terry",Boat Shapes,Linocut on paper,1952.0,1983.0,132.0,143.0,mm
4706,"Frost, Sir Terry",Boat Shapes,Linocut on paper,1954.0,1983.0,131.0,155.0,mm
4707,"Frost, Sir Terry",Boat Shapes,Linocut on paper,1954.0,1983.0,193.0,267.0,mm
4708,"Frost, Sir Terry",Leeds,Drypoint on paper,1956.0,1983.0,125.0,167.0,mm
4709,"Frost, Sir Terry","Camping, Anduze",Etching on paper,1979.0,1983.0,257.0,209.0,mm
4710,"Frost, Sir Terry","Umea, Sweden",Etching on paper,1979.0,1983.0,,,
4711,"Frost, Sir Terry",Self-Portrait,Etching on paper,1980.0,1983.0,277.0,200.0,mm
11838,"Phillips, Esq Tom",[colophon],Etching on paper,1979.0,1983.0,292.0,204.0,mm
16412,Wols,[no title],Etching on paper,1937.0,1983.0,130.0,80.0,mm


In [15]:
# BUILT-INS
# Transform
# Selecting one column on the GroupBy gives a SeriesGroupBy over 'medium', grouped by 'artist'.
grouped_mediums = small_df.groupby('artist')['medium']
grouped_mediums

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000189B074C0D0>

In [16]:
# Apply fill_values to each artist's 'medium' values with the built-in transform,
# then write the result back into small_df's 'medium' column (this modifies small_df).
small_df.loc[:, 'medium'] = grouped_mediums.transform(fill_values)

In [17]:
# Min
# Smallest value per artist, using the built-in 'min' aggregation.
# The columns are selected explicitly. Aggregating every column relied on
# pandas silently dropping the ones that cannot be reduced; the saved output
# of this cell kept only 'title' and 'acquisitionYear'. That dropping is
# deprecated and later pandas versions raise instead.
df.groupby('artist')[['title', 'acquisitionYear']].agg('min')

  df.groupby('artist').agg(np.min)


Unnamed: 0_level_0,title,acquisitionYear
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
?British School,"Portrait of a Gentleman, probably of the West ...",1927.0
"Abakanowicz, Magdalena",Abakan Orange,2009.0
"Abbey, Edwin Austin",Illustration to ‘Judith Shakespeare’,1924.0
"Abbott, Berenice",Dinty Moore Antiques,2010.0
"Abbott, Lemuel Francis","Henry Byne, of Carshalton",1885.0
...,...,...
"Zuloaga, Ignacio",View of the Escorial,1923.0
"Zyw, Aleksander",Light,1962.0
"di Suvero, Mark",Tetra,2004.0
"van Elk, Ger",Lunch II,1980.0
