In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the MoMA collection export (requires network access).
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')

# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so the column assignments below neither emit
# SettingWithCopyWarning nor risk writing to a temporary object.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                     'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']].copy()

# Convert URLs to booleans: True where a URL is present, False where missing.
# This must happen before dropna() so that a missing URL does not drop the row.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky departments in a single pass.
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
artworks = artworks[~artworks['Department'].isin(excluded_departments)]

# Drop rows with any remaining missing value. The original index labels are
# kept (no reset_index), and .copy() lets later cells assign columns safely.
artworks = artworks.dropna().copy()

# Get data types (displayed as the cell output).
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [3]:
# Convert the acquisition date to datetime and derive a numeric year feature.
artworks['DateAcquired'] = pd.to_datetime(artworks['DateAcquired'])
artworks['YearAcquired'] = artworks['DateAcquired'].dt.year

# Collapse multi-valued fields into a single catch-all category.
# The pattern matches the literal text ") (", which appears when several
# parenthesised values are listed for one work. Raw strings avoid Python's
# invalid-escape-sequence warnings; the label values (backslashes included)
# are byte-identical to what the non-raw literals produced.
multiple_genders = artworks['Gender'].str.contains(r'\) \(')
multiple_nationalities = artworks['Nationality'].str.contains(r'\) \(')
multiple_artists = artworks['Artist'].str.contains(',')
artworks.loc[multiple_genders, 'Gender'] = r'\(multiple_persons\)'
artworks.loc[multiple_nationalities, 'Nationality'] = r'\(multiple_nationalities\)'
artworks.loc[multiple_artists, 'Artist'] = 'Multiple_Artists'

# Reduce free-text dates to the first four-digit year to cut down the number
# of distinct values. Rows without a four-digit year become NaN.
# (A trailing [:-1] slice used to drop the last extracted value, which silently
# set the final row's Date to NaN on index-aligned assignment.)
artworks['Date'] = artworks['Date'].str.extract(r'([0-9]{4})', expand=False)

# Drop the target and the columns that are one-hot encoded separately.
# Keyword `columns=` replaces the positional axis argument, which newer pandas
# versions no longer accept.
X = artworks.drop(columns=['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'])

# Create dummies separately. `artists` is built but not added to X below.
# get_dummies ignores NaN by default, so rows with no parsed year simply get
# all-zero date columns.
artists = pd.get_dummies(artworks['Artist'])
nationalities = pd.get_dummies(artworks['Nationality'])
dates = pd.get_dummies(artworks['Date'])

# One-hot encode the remaining categorical columns, then append the
# nationality and date dummies (all frames share the same row index).
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
Y = artworks['Department']

In [None]:
# Import the model and the cross-validation helper together at the top of the cell.
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# Establish and fit the model: three hidden layers (100, 10, 5 units) with
# logistic activation and L2 penalty alpha=0.05. A fixed random_state makes
# weight initialisation and shuffling repeatable, so the scores below can be
# reproduced on a fresh kernel.
mlp = MLPClassifier(hidden_layer_sizes=(100, 10, 5), activation='logistic',
                    alpha=0.05, random_state=42)
mlp.fit(X, Y)

# NOTE: this is accuracy on the training data, not a generalisation estimate.
print(mlp.score(X, Y))

# 5-fold cross-validated accuracy; cross_val_score fits a fresh clone of the
# estimator per fold, so the fit above does not leak into these scores.
print(cross_val_score(mlp, X, Y, cv=5))

0.556396661945
[ 0.55346634  0.56237141  0.58222881  0.52423793  0.52679939]


In [None]:
# Different parameter.
mlp = MLPClassifier(hidden_layer_sizes=(5000,))
mlp.fit(X, Y)
print(mlp.score(X, Y))
print(cross_val_score(mlp, X, Y, cv=5))



0.661093672243
