In [4]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import geopandas as gpd # GeoPandas library for spatial analytics
from shapely.geometry import shape

from urllib.parse import urlencode
import urllib.request, json 

import pyproj 

import matplotlib.pyplot as plt
import seaborn as sns # visualization styling package
%matplotlib inline 

from stop_words import get_stop_words
import matplotlib.pyplot as plt
from pprint import pprint

In [5]:
# Load the cleaned photograph records from the project-relative CSV file.
dataset = pd.read_csv(filepath_or_buffer='Data/photo_cleandata.csv')
# Bare last expression so the notebook renders the frame as the cell output.
dataset

Unnamed: 0,objectid,classification,endyear,medium,title,uuid,iiifurl,iiifthumburl,width,height,maxpixels,created
0,92376,Photograph,1820-1840,photogenic drawing,Leaf Study,13bd075b-7384-4066-9e2c-3c983848f110,https://api.nga.gov/iiif/13bd075b-7384-4066-9e...,https://api.nga.gov/iiif/13bd075b-7384-4066-9e...,2791,3407,,2007-02-21 12:32:01-05
1,212101,Photograph,1840-1860,salted paper print,"Winter Trees, Reflected in a Pond",16297e08-a8d1-457e-9a8d-2dbe15421f07,https://api.nga.gov/iiif/16297e08-a8d1-457e-9a...,https://api.nga.gov/iiif/16297e08-a8d1-457e-9a...,5482,4708,,2018-02-06 10:23:38-05
2,95789,Photograph,1840-1860,salted paper print,"An Ancient Door in Magdalen College, Oxford",3d09b7a8-22b3-408d-a218-e12601cdd2b8,https://api.nga.gov/iiif/3d09b7a8-22b3-408d-a2...,https://api.nga.gov/iiif/3d09b7a8-22b3-408d-a2...,4051,3173,,2011-01-21 15:48:21-05
3,102589,Photograph,1840-1860,salted paper print,The Boulevards of Paris,bb029ac1-a5eb-4d1e-99b8-887a63dc14c0,https://api.nga.gov/iiif/bb029ac1-a5eb-4d1e-99...,https://api.nga.gov/iiif/bb029ac1-a5eb-4d1e-99...,3867,3173,,2011-01-21 15:47:05-05
4,106469,Photograph,1840-1860,salted paper print,Orléans Cathedral,7bd02093-815b-47bb-ba46-353c777777bb,https://api.nga.gov/iiif/7bd02093-815b-47bb-ba...,https://api.nga.gov/iiif/7bd02093-815b-47bb-ba...,4014,3245,,2011-01-21 16:15:39-05
...,...,...,...,...,...,...,...,...,...,...,...,...
19746,224750,Photograph,2020-2040,inkjet print,Florida,f46a143e-143a-44f3-a147-1756d5d88094,https://api.nga.gov/iiif/f46a143e-143a-44f3-a1...,https://api.nga.gov/iiif/f46a143e-143a-44f3-a1...,8400,12247,640.0,2022-12-01 22:07:01-05
19747,224920,Photograph,2020-2040,inkjet print,Untitled (Insurrection),926ff908-dbbb-4aa7-a76a-f63d721121be,https://api.nga.gov/iiif/926ff908-dbbb-4aa7-a7...,https://api.nga.gov/iiif/926ff908-dbbb-4aa7-a7...,12001,7989,,2022-12-01 20:03:52-05
19748,224771,Photograph,2020-2040,gelatin silver print,The Brown Sisters,f6705397-8ae2-43b9-822c-9755ace25131,https://api.nga.gov/iiif/f6705397-8ae2-43b9-82...,https://api.nga.gov/iiif/f6705397-8ae2-43b9-82...,8599,6866,640.0,2022-06-09 12:35:39-04
19749,225842,Photograph,2020-2040,inkjet print,Flatboat,c046bf90-032d-4232-b3d0-d74d5ad0740c,https://api.nga.gov/iiif/c046bf90-032d-4232-b3...,https://api.nga.gov/iiif/c046bf90-032d-4232-b3...,6000,4884,640.0,2022-07-06 14:41:05-04


In [6]:
# Count records for every (medium, endyear) pair. `count()` tallies the
# non-null `objectid` values in each group; that tally is exposed as `counts`.
df_group = (
    dataset[['objectid', 'medium', 'endyear']]
    .groupby(['medium', 'endyear'])
    .count()
    .reset_index()
    .rename(columns={'objectid': 'counts'})
)
# Largest groups first.
df_group = df_group.sort_values(by=['counts'], ascending=False)
df_group.head(10)

Unnamed: 0,medium,endyear,counts
223,gelatin silver print,1940-1960,3442
177,contact sheet,1940-1960,2711
224,gelatin silver print,1960-1980,2176
222,gelatin silver print,1920-1940,2082
225,gelatin silver print,1980-2000,1654
169,collotype,1880-1900,729
105,albumen print,1860-1880,478
338,inkjet print,2000-2020,415
147,chromogenic print,1980-2000,397
226,gelatin silver print,2000-2020,380


In [7]:
# Number of records (non-null `objectid` values) in each `endyear` bin,
# ordered by the bin label in ascending order.
df_gp_year = (
    dataset[['endyear', 'objectid']]
    .groupby(['endyear'])
    .count()
    .reset_index()
    .rename(columns={'objectid': 'counts'})
    .sort_values(by='endyear', ascending=True)
)
df_gp_year

Unnamed: 0,endyear,counts
0,1820-1840,1
1,1840-1860,512
2,1860-1880,749
3,1880-1900,1330
4,1900-1920,853
5,1920-1940,2449
6,1940-1960,6413
7,1960-1980,3510
8,1980-2000,2724
9,2000-2020,1203


In [8]:
# Media recorded for the '1860-1880' bin, most frequent first.
top_list = df_group.loc[df_group['endyear'] == '1860-1880'].sort_values(
    by='counts', ascending=False)
# First three rows by position; the index labels are the ones inherited from
# df_group, so positional selection is spelled out with .iloc.
top3 = top_list['medium'].iloc[0:3]
top3

105                      albumen print
110    albumen print (carte-de-visite)
508        stereoscopic albumen prints
Name: medium, dtype: object

In [9]:
# Collect the three most frequent media for every year bin. Bins are visited
# in the order they first appear in df_group.
year_list = df_group['endyear'].unique()

top3_frames = []
for year in year_list:
    # Rows of this bin, highest count first.
    top_list = df_group[df_group['endyear'] == year].sort_values(
        by='counts', ascending=False)
    top3_frames.append(top_list.head(3))

# Concatenate once instead of growing a frame inside the loop: repeated
# pd.concat re-copies everything gathered so far on each iteration and starts
# from an empty, column-less frame. With no bins at all, fall back to an empty
# frame that still has df_group's columns.
top3_data_all = pd.concat(top3_frames) if top3_frames else df_group.head(0)

# The original called top3_data_all.reset_index() without using the result,
# which changed nothing; the call is dropped and df_group's row labels are
# kept, exactly as before.
top3_data_all

Unnamed: 0,medium,endyear,counts
223,gelatin silver print,1940-1960,3442
177,contact sheet,1940-1960,2711
14,2 1/4 inch gelatin silver prints mounted on board,1940-1960,53
224,gelatin silver print,1960-1980,2176
359,offset lithograph,1960-1980,265
178,contact sheet,1960-1980,254
222,gelatin silver print,1920-1940,2082
366,palladium print,1920-1940,96
390,photogravure,1920-1940,68
225,gelatin silver print,1980-2000,1654


In [10]:
# Collect the three most frequent media for every year bin, visiting the bins
# in ascending label order.
year_list = df_group['endyear'].unique()

top3_frames = []
for year in sorted(year_list):
    # Rows of this bin, highest count first.
    top_list = df_group[df_group['endyear'] == year].sort_values(
        by='counts', ascending=False)
    top3_frames.append(top_list.head(3))

# Concatenate once instead of growing a frame inside the loop. With no bins at
# all, fall back to an empty frame that still has df_group's columns so the
# sort below keeps working.
top3_data_all = pd.concat(top3_frames) if top3_frames else df_group.head(0)

# Order by bin and renumber the rows 0..n-1. kind='stable' matters: every bin
# contributes several rows with the same 'endyear', and the default sort
# algorithm does not promise to keep tied rows in their incoming order, which
# could scramble the count ranking inside a bin.
top3_data_all = (
    top3_data_all
    .sort_values(by='endyear', kind='stable')
    .reset_index(drop=True)
)

top3_data_all


Unnamed: 0,medium,endyear,counts
0,photogenic drawing,1820-1840,1
1,albumen print,1840-1860,232
2,salted paper print,1840-1860,137
3,daguerreotype with applied color,1840-1860,35
4,albumen print,1860-1880,478
5,albumen print (carte-de-visite),1860-1880,107
6,stereoscopic albumen prints,1860-1880,23
7,collotype,1880-1900,729
8,platinum print,1880-1900,209
9,albumen print,1880-1900,90


In [11]:
# Relabel the columns as source / target / value: the year bin becomes
# `source`, the medium becomes `target` and the count becomes `value`.
top3_data_all = top3_data_all.rename(
    columns={
        'endyear': 'source',
        'medium': 'target',
        'counts': 'value',
    }
)

top3_data_all

Unnamed: 0,target,source,value
0,photogenic drawing,1820-1840,1
1,albumen print,1840-1860,232
2,salted paper print,1840-1860,137
3,daguerreotype with applied color,1840-1860,35
4,albumen print,1860-1880,478
5,albumen print (carte-de-visite),1860-1880,107
6,stereoscopic albumen prints,1860-1880,23
7,collotype,1880-1900,729
8,platinum print,1880-1900,209
9,albumen print,1880-1900,90


In [12]:
# Keep only the text before the first '-' in each `source` value, so a bin
# label such as '1820-1840' is reduced to '1820'.
top3_data_all['source'] = top3_data_all['source'].str.split('-').str.get(0)

top3_data_all

Unnamed: 0,target,source,value
0,photogenic drawing,1820,1
1,albumen print,1840,232
2,salted paper print,1840,137
3,daguerreotype with applied color,1840,35
4,albumen print,1860,478
5,albumen print (carte-de-visite),1860,107
6,stereoscopic albumen prints,1860,23
7,collotype,1880,729
8,platinum print,1880,209
9,albumen print,1880,90


In [13]:
# Write the frame to 'top3_data_all.csv' in the working directory, leaving out
# the row index.
top3_data_all.to_csv(path_or_buf='top3_data_all.csv', index=False)


In [14]:
# Keep only the text before the first '-' in each `source` value. On values
# that contain no '-' the split yields a single piece, so the column comes out
# unchanged.
top3_data_all['source'] = top3_data_all['source'].str.split('-').str.get(0)

top3_data_all

Unnamed: 0,target,source,value
0,photogenic drawing,1820,1
1,albumen print,1840,232
2,salted paper print,1840,137
3,daguerreotype with applied color,1840,35
4,albumen print,1860,478
5,albumen print (carte-de-visite),1860,107
6,stereoscopic albumen prints,1860,23
7,collotype,1880,729
8,platinum print,1880,209
9,albumen print,1880,90


In [15]:
# Keep only the text before the first '-' in each `source` value. On values
# that contain no '-' the split yields a single piece, so the column comes out
# unchanged.
top3_data_all['source'] = top3_data_all['source'].str.split('-').str.get(0)

top3_data_all

Unnamed: 0,target,source,value
0,photogenic drawing,1820,1
1,albumen print,1840,232
2,salted paper print,1840,137
3,daguerreotype with applied color,1840,35
4,albumen print,1860,478
5,albumen print (carte-de-visite),1860,107
6,stereoscopic albumen prints,1860,23
7,collotype,1880,729
8,platinum print,1880,209
9,albumen print,1880,90


In [16]:
# Write the frame to 'top3_data_all.csv' in the working directory, leaving out
# the row index.
top3_data_all.to_csv(path_or_buf='top3_data_all.csv', index=False)

In [19]:
import pandas as pd

# Load the cleaned photograph records; the path is relative to the notebook's
# working directory.
df = pd.read_csv('Data/photo_cleandata.csv')

# Rows whose title mentions "Georgia O'Keeffe", ignoring case. na=False makes a
# missing title count as "no match"; without it a NaN in the mask would make
# the boolean indexing below raise.
georgia_mask = df['title'].str.contains(
    "Georgia O'Keeffe", case=False, na=False)
georgia_rows = df[georgia_mask]

# Number of matching titles.
georgia_count = georgia_rows.shape[0]

# Print the count.
print(f"Georgia O'Keeffe 出現的次數: {georgia_count}")

# Number of matching titles for each medium, reusing the filtered rows instead
# of evaluating the same filter a second time.
medium_counts = georgia_rows.groupby('medium').size()
print("\n各媒體對應的標題數量:")
print(medium_counts)

Georgia O'Keeffe 出現的次數: 331

各媒體對應的標題數量:
medium
gelatin silver print             205
palladium print                  100
palladium print with mercury       3
platinum print                    20
platinum with palladium print      1
silver-platinum print              2
dtype: int64


In [22]:
import pandas as pd

# Load the cleaned photograph records; the path is relative to the notebook's
# working directory.
df = pd.read_csv('Data/photo_cleandata.csv')

# Rows whose title contains "portrait", ignoring case. na=False makes a missing
# title count as "no match"; without it a NaN in the mask would make the
# boolean indexing below raise.
portrait_mask = df['title'].str.contains("portrait", case=False, na=False)
portrait_rows = df[portrait_mask]

# Number of matching titles.
portrait_count = portrait_rows.shape[0]

# Print the count.
print(f"portrait 出現的次數: {portrait_count}")

# Number of matching titles for each medium, reusing the filtered rows instead
# of evaluating the same filter a second time.
medium_counts = portrait_rows.groupby('medium').size()

# Order the media from most to fewest matching titles.
sorted_medium_counts = medium_counts.sort_values(ascending=False)

# Print the sorted per-medium counts.
print("\n各媒體對應的標題數量（由多到少排序）:")
print(sorted_medium_counts)

portrait 出現的次數: 676

各媒體對應的標題數量（由多到少排序）:
medium
gelatin silver print                                                                                            423
daguerreotype with applied color                                                                                 31
albumen print (carte-de-visite)                                                                                  30
tintype with applied color                                                                                       25
daguerreotype                                                                                                    21
ambrotype with applied color                                                                                     15
contact sheet                                                                                                    15
salted paper print with applied color                                                                            12
photogravure            