# Analysis of museum collection

### Import all the required modules

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.tag.stanford import NERTagger

### Import the data from the website

In [None]:
# Download the open-access MetObjects.csv catalogue from GitHub into a DataFrame.
collection_df = pd.read_csv("https://media.githubusercontent.com/media/metmuseum/openaccess/master/MetObjects.csv")
# Bare expression: displays the frame as the cell output when run in a notebook.
collection_df

## Data cleaning

The data now looks fine. We intend to do the following data-cleaning work:
+ Check "Object Begin Date" and "Object End Date" against "Object Date"
+ Check "Artist Begin Date" and "Artist End Date" against "Artist Display Bio"
+ Check "Artist Display Name" against "Artist Alpha Sort", then delete the latter.
+ Merge "Artist Role" with "Artist Prefix"


In [None]:
## Join the whitespace-separated words of every column label with underscores
## so that columns can be reached by attribute access (e.g. collection_df.Object_Date).
collection_df.columns = collection_df.columns.map(lambda label: '_'.join(label.split()))

In [None]:
## Check "Object Begin Date" and "Object End Date" with "Object Date"
def check_Object_Date(row):
    """Return True when the row's begin date does not come after its end date.

    `row` is any object exposing `Object_Begin_Date` and `Object_End_Date`.
    """
    begin, end = row.Object_Begin_Date, row.Object_End_Date
    return end >= begin

def modify_Object_Date(df):
    """Tighten Object_Begin_Date / Object_End_Date using the free-text Object_Date.

    For every row with a non-missing Object_Date, a (begin, end) span is parsed
    from the text. Object_Begin_Date is raised to the parsed begin when it is
    lower, and Object_End_Date is lowered to the parsed end when it is higher.
    Rows whose text cannot be parsed are left untouched and counted; the count
    is printed once at the end instead of aborting the whole pass.

    The frame is modified in place and also returned.
    """
    unparsed = 0
    for i in df.index[df.Object_Date.notna()]:
        bounds = _parse_object_date(df.at[i, 'Object_Date'])
        if bounds is None:
            unparsed += 1
            continue
        bdate, edate = bounds
        # Write through df.at[row, column]: assigning to an attribute of
        # df.loc[i] only changes a temporary copy of the row, never the frame.
        if df.at[i, 'Object_Begin_Date'] < bdate:
            df.at[i, 'Object_Begin_Date'] = bdate
        if df.at[i, 'Object_End_Date'] > edate:
            df.at[i, 'Object_End_Date'] = edate
    if unparsed:
        print(unparsed, 'Object_Date value(s) could not be parsed and were left unchanged')
    return df


def _century_bounds(century, is_bc):
    """Return (first_year, last_year) of an ordinal century.

    Uses the same arithmetic for every caller: the 18th century is 1700..1799,
    and the 5th century B.C. is -499..-400.
    """
    first = -century * 100 + 1 if is_bc else (century - 1) * 100
    return first, first + 99


def _parse_object_date(obdate):
    """Heuristically turn an Object_Date string into a (begin, end) year pair.

    Only the first one or two digit runs in the text are used. Returns None
    when the text has no digits, when a range has fewer than two numbers, when
    both 'B.C.' and 'A.D.' appear (mixed-era span), or when the shape of the
    numbers is not one of the recognised forms.
    """
    digits = re.findall(r"\d+", obdate)
    if not digits:
        return None
    is_bc = 'B.C.' in obdate
    if is_bc and 'A.D.' in obdate:
        # A span crossing the era boundary cannot be handled with one sign.
        return None
    is_century = 'century' in obdate
    # Accept the en dash as well as the ASCII hyphen as a range separator.
    is_range = '-' in obdate or '\u2013' in obdate

    if is_range:
        if len(digits) < 2:
            # e.g. "mid-18th century": a hyphen but only one number.
            return None
        first, second = digits[0], digits[1]
        if is_century:
            return (_century_bounds(int(first), is_bc)[0],
                    _century_bounds(int(second), is_bc)[1])
        if is_bc:
            # Plain B.C. years: negate, do not scale by 100.
            return -int(first), -int(second)
        if len(first) == 4 and len(second) == 4:
            return int(first), int(second)
        if len(first) == 4 and len(second) == 2:
            # Abbreviated end year such as "1850-60": reuse the begin century.
            begin = int(first)
            return begin, begin // 100 * 100 + int(second)
        return None

    first = digits[0]
    if is_century:
        return _century_bounds(int(first), is_bc)
    if is_bc:
        return -int(first), -int(first)
    if len(first) == 4:
        return int(first), int(first)
    return None

#     a = test_df[ ["century" in date for date in test_df.Object_Date]][['Object_Date','Object_Begin_Date','Object_End_Date']]
#     return a[-a.apply(check_Object_Date,axis = 1)][['Object_Date','Object_Begin_Date','Object_End_Date']]

# Run the Object_Date check over the whole collection; the returned frame is the cell output.
modify_Object_Date(collection_df)

In [None]:
## Check "Object Begin Date" and "Object End Date" with "Object Date"
def check_Object_Date(row):
    """Return True when the row's end date is strictly earlier than its begin date.

    `row` is any object exposing `Object_Begin_Date` and `Object_End_Date`.
    """
    begin, end = row.Object_Begin_Date, row.Object_End_Date
    return end < begin

def modify_Object_Date(df):
    """Return the rows of `df` whose Object_End_Date is earlier than Object_Begin_Date.

    Despite its name this function does not modify `df`; it only selects the
    inconsistent rows. The comparison is done column-wise rather than with a
    row-by-row `apply`, which gives the same selection, is far cheaper on a
    large frame, and also yields an empty frame for empty input.
    """
    inverted = df.Object_End_Date < df.Object_Begin_Date
    return df[inverted]

# Show only the three date columns of the rows returned by modify_Object_Date.
modify_Object_Date(collection_df)[['Object_Date','Object_Begin_Date','Object_End_Date']]

In [None]:
# List the rows whose Object_Date contains a hyphen but fewer than two numbers,
# i.e. hyphenated dates that do not spell out both ends of a range.
test_df = collection_df[collection_df.Object_Date.notna()]
# Raw string: "\d" in a normal literal is an invalid escape sequence.
# Compiled once instead of once per row.
digit_runs = re.compile(r"\d+")
test_df[["-" in i and len(digit_runs.findall(i)) < 2 for i in test_df.Object_Date]][['Object_Date','Object_Begin_Date','Object_End_Date']]

In [None]:
## Check "Artist Display Name" against "Artist Alpha Sort", then delete the latter
def check_name(row):
    """Return True when Artist_Display_Name and Artist_Alpha_Sort look like the same name.

    The two names are lower-cased and split into word tokens. They are treated
    as matching when more than 60% of the tokens of either name also occur in
    the other. A missing Artist_Alpha_Sort counts as a match (nothing to
    contradict). A name with no word tokens at all contributes an overlap of
    zero instead of raising ZeroDivisionError.

    `row.Artist_Display_Name` must be a string; callers are expected to filter
    out missing display names beforehand.
    """
    if pd.isna(row.Artist_Alpha_Sort):
        return True
    display_words = re.findall(r'\w+', row.Artist_Display_Name.lower())
    sort_words = re.findall(r'\w+', row.Artist_Alpha_Sort.lower())
    return (_overlap_ratio(display_words, sort_words) > 0.6
            or _overlap_ratio(sort_words, display_words) > 0.6)


def _overlap_ratio(words, others):
    """Return the fraction of `words` that also appear in `others` (0.0 if `words` is empty)."""
    if not words:
        return 0.0
    known = set(others)
    return sum(1 for word in words if word in known) / len(words)
# Score every row that has a display name. Rows without one are absent from the
# applied result, so index alignment leaves them as NaN in the new column.
# NOTE(review): check_name returns True when the two names agree, so True in
# 'Wrong_Artist_Name' means the name looks consistent — confirm the column name is intended.
collection_df['Wrong_Artist_Name'] = collection_df[collection_df.Artist_Display_Name.notna()].apply(check_name, axis=1)
# Fixed the misspelled frame name, which raised NameError.
collection_df.Wrong_Artist_Name.value_counts(dropna=False)

In [None]:
# Eyeball ten random rows of the name-check flag next to the two name columns it compares.
collection_df[['Wrong_Artist_Name','Artist_Display_Name','Artist_Alpha_Sort']].sample(10)

We intend to check the number of NaNs in every column first, and then keep only those columns in which fewer than 10% of the values are NaN.

In [None]:
# Number of missing values in each column.
collection_df.isna().sum()


In [None]:
# Data type pandas assigned to each column.
collection_df.dtypes

In [None]:
# Keep only the columns whose NaN count is below 10% of the number of rows.
filtered_df = collection_df.loc[
    :,
    collection_df.isna().sum(axis=0).lt(len(collection_df.index) * 0.1),
]
filtered_df

## Data analysis & Data Visualization

In [None]:
## explore the type of object
# Count every Object_Name (missing names form their own bucket) and keep the
# ten most frequent; value_counts already sorts by descending count.
top_ten_collections = collection_df['Object_Name'].value_counts(dropna=False).iloc[:10]
top_ten_collections

In [None]:
# Horizontal bar chart of the ten most frequent object names.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(
    x=top_ten_collections.values,
    y=top_ten_collections.index,
    alpha=0.8,
    orient='h',
    ax=ax,
)
ax.set_ylabel('Collection Types', fontsize=12)
ax.set_xlabel('Number of Occurrences', fontsize=12)
ax.set_title('Top Ten Collection Types in The Museum')
plt.show()

In [None]:
## explore the departments
# Number of objects per department (missing departments are not counted).
departments = collection_df.Department.value_counts()
departments

In [None]:
# Horizontal bar chart of the number of objects held by each department.
plt.figure(figsize=(15,5))
sns.barplot(x = departments.values, y = departments.index, alpha=0.8, orient='h')
# The bars are counts per department, not quantiles, so the title says so.
plt.title('Quantities of Collections in Different Departments')
plt.xlabel('Number of Occurrences', fontsize=12)
plt.ylabel('Names of Departments', fontsize=12)
plt.show()

In [None]:
# Number of objects per Repository value.
collection_df.Repository.value_counts()

In [None]:
## explore Credit Line
# Split every non-missing credit line on commas. The last piece is presumably
# the year of the gift/purchase, going by the variable name — confirm against the data.
credit_name = collection_df[collection_df.Credit_Line.notna()]['Credit_Line'].str.split(',')
year_counts = {}
# Iterate the values directly: after the notna() filter the index has gaps, so
# looking rows up by 0..len-1 raises KeyError. Each last piece is collected as
# one whole string (extend() on a string would add it character by character),
# with the whitespace left over from the comma split removed.
contribute_year = [parts[-1].strip() for parts in credit_name]
contribute_year
# from collection import Counter
# for key in range(len(contribute_year)):
#     if contribute_year.count(key) > 1:
#         year_counts[key] = contribute_year.count(key)
#         year_counts = sorted(year_counts.items(), key = lambda item: item[0])
# print(contribute_year)

## Analysis conclusion