# Supervised Neural Networks - Playing with layers

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import scipy
import sklearn
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the MoMA artworks CSV, reading it in chunks.
# The chunks are collected and concatenated in a single call: growing a
# DataFrame with pd.concat inside the loop re-copies every previously loaded
# row on each iteration, and seeding that loop with an empty DataFrame relies
# on empty-frame concat behaviour that newer pandas versions deprecate.
t0=time.time()
reader = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv',
                     chunksize=500000,
                     low_memory=False)
# Chunk indices continue from one chunk to the next, so the combined frame
# keeps the row labels of the file without needing ignore_index.
df = pd.concat(list(reader))
t1=time.time()
print('Time to load in chunks: {:.5f}'.format(t1-t0))

Time to load in chunks: 276.84537


In [3]:
# Quick look at the raw frame: (rows, columns) first, then the column labels.
raw_shape, raw_columns = df.shape, df.columns
print(raw_shape, raw_columns, sep='\n')

(135804, 29)
Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')


In [4]:
# Select columns.
# The explicit .copy() makes the selection an independent frame, so the
# column assignments below are not chained assignments on a slice of the raw
# data (the resulting SettingWithCopyWarning would otherwise be hidden by the
# notebook-wide warnings filter).
df = df[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
         'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']].copy()

# Convert URLs to booleans: True where a URL is present.
df['URL'] = df['URL'].notnull()
df['ThumbnailURL'] = df['ThumbnailURL'].notnull()

# Drop films and some other tricky rows in a single pass.
# Rows with a missing Department are kept by this filter and are removed by
# dropna() below.
excluded_departments = ['Film', 'Media and Performance Art', 'Fluxus Collection']
df = df[~df['Department'].isin(excluded_departments)]

# Drop missing data.
df = df.dropna()
print(df.head())
print(df.shape)

                     Artist Nationality  Gender  Date             Department  \
0               Otto Wagner  (Austrian)  (Male)  1896  Architecture & Design   
1  Christian de Portzamparc    (French)  (Male)  1987  Architecture & Design   
2                Emil Hoppe  (Austrian)  (Male)  1903  Architecture & Design   
3           Bernard Tschumi          ()  (Male)  1980  Architecture & Design   
4                Emil Hoppe  (Austrian)  (Male)  1903  Architecture & Design   

  DateAcquired   URL  ThumbnailURL  Height (cm)  Width (cm)  
0   1996-04-09  True          True      48.6000    168.9000  
1   1995-01-17  True          True      40.6401     29.8451  
2   1997-01-15  True          True      34.3000     31.8000  
3   1995-01-17  True          True      50.8000     50.8000  
4   1997-01-15  True          True      38.4000     19.1000  
(106031, 10)


In [5]:
# Parse the acquisition date strings once, store the parsed column back, and
# derive the acquisition year from it as a numeric feature.
acquired_on = pd.to_datetime(df['DateAcquired'])
df['DateAcquired'] = acquired_on
df['YearAcquired'] = acquired_on.dt.year

# Displayed as the cell result: dtype of the new year column.
df['YearAcquired'].dtype

dtype('int64')

In [6]:
# Collapse multiple nationalities, genders, and artists into single
# placeholder categories. A field holding several parenthesised entries
# contains ") (" between them; the pattern is a raw string so the escaped
# parentheses are regex escapes rather than invalid Python string escapes.
multi_entry = r'\) \('
# The replacement labels are plain values, not regexes, so they carry no
# backslashes and match the "(...)" form of the other category values.
df.loc[df['Gender'].str.contains(multi_entry), 'Gender'] = '(multiple_persons)'
df.loc[df['Nationality'].str.contains(multi_entry), 'Nationality'] = '(multiple_nationalities)'
df.loc[df['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'


# Convert dates to start date (first 4-digit run), cutting down the number of
# distinct examples. The extracted Series is assigned whole: slicing off its
# last element would leave the final row's Date as NaN after index alignment.
df['Date'] = df['Date'].str.extract(r'([0-9]{4})', expand=False)

# Final column drops. Rows whose Date has no 4-digit year keep a NaN Date here.
X = df.drop(columns=['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'])

# Create dummies separately.
artists = pd.get_dummies(df['Artist'])
nationalities = pd.get_dummies(df['Nationality'])
dates = pd.get_dummies(df['Date'])

# Concat with other variables, but artists slows this way down so we'll keep it out for now.
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
Y = df['Department']

# Sample down to 10% (fixed seed) to try and get the model to run.
# axis / columns are passed by keyword: recent pandas releases no longer
# accept them positionally.
XY = pd.concat([X, df['Department']], axis=1)
XY = XY.sample(frac=0.1, random_state=42)
Xsam = XY.drop(columns='Department')
Ysam = XY['Department']

In [11]:
# Data prep is finished, so fit the first network.
# Training a neural network is computationally heavy; expect this cell to
# take a while.

# Bring in the estimator.
from sklearn.neural_network import MLPClassifier

# Baseline architecture: a single hidden layer of 1000 units, fixed seed.
started = time.time()
mlp = MLPClassifier(random_state=42, hidden_layer_sizes=(1000,))
mlp.fit(Xsam, Ysam)
elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Time: 53.14585


In [13]:
# Evaluate the single-layer model: score on the sample it was fitted on,
# the share of each department in that sample, and 5-fold cross-validation.
started = time.time()

sample_score = mlp.score(Xsam, Ysam)
print('Score: ', sample_score)

department_share = Ysam.value_counts() / len(Ysam)
print(department_share)

from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(mlp, Xsam, Ysam, cv=5)
print(fold_scores)

elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Score:  0.6134112986890503
Prints & Illustrated Books    0.526266
Photography                   0.228992
Architecture & Design         0.111195
Drawings                      0.100821
Painting & Sculpture          0.032727
Name: Department, dtype: float64
[0.61168709 0.60508954 0.42149929 0.61367925 0.60339943]
Time: 147.44063


##  Play around

In [16]:
# Deep variant: ten hidden layers with 100 units each, same seed as before.
started = time.time()
ten_by_hundred = (100,) * 10
mlp100 = MLPClassifier(random_state=42, hidden_layer_sizes=ten_by_hundred)
mlp100.fit(Xsam, Ysam)
elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Time: 39.14951


In [17]:
# Evaluate the 10x100 model: score on the sample it was fitted on,
# the share of each department in that sample, and 5-fold cross-validation.
started = time.time()

sample_score = mlp100.score(Xsam, Ysam)
print('Score: ', sample_score)

department_share = Ysam.value_counts() / len(Ysam)
print(department_share)

from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(mlp100, Xsam, Ysam, cv=5)
print(fold_scores)

elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Score:  0.544845798358955
Prints & Illustrated Books    0.526266
Photography                   0.228992
Architecture & Design         0.111195
Drawings                      0.100821
Painting & Sculpture          0.032727
Name: Department, dtype: float64
[0.65739868 0.66776626 0.65912306 0.63584906 0.62511804]
Time: 363.08619


In [18]:
# Shallow variant with a logistic activation: one hidden layer of 100 units.
started = time.time()
logistic_settings = {
    'hidden_layer_sizes': (100,),
    'random_state': 42,
    'activation': 'logistic',
}
mlplog = MLPClassifier(**logistic_settings)
mlplog.fit(Xsam, Ysam)
elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Time: 10.43977


In [19]:
# Evaluate the logistic model: score on the sample it was fitted on,
# the share of each department in that sample, and 5-fold cross-validation.
started = time.time()

sample_score = mlplog.score(Xsam, Ysam)
print('Score: ', sample_score)

department_share = Ysam.value_counts() / len(Ysam)
print(department_share)

from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(mlplog, Xsam, Ysam, cv=5)
print(fold_scores)

elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Score:  0.5932283316042629
Prints & Illustrated Books    0.526266
Photography                   0.228992
Architecture & Design         0.111195
Drawings                      0.100821
Painting & Sculpture          0.032727
Name: Department, dtype: float64
[0.65928369 0.65739868 0.67326733 0.66037736 0.67469311]
Time: 82.43266


In [27]:
# Tapering ("serial") variant: ten logistic hidden layers shrinking from
# 100 units down to 10 in steps of 10.
started = time.time()

tapering_layers = tuple(range(100, 0, -10))  # (100, 90, ..., 20, 10)
mlpserial = MLPClassifier(activation='logistic',
                          random_state=42,
                          hidden_layer_sizes=tapering_layers)
mlpserial.fit(Xsam, Ysam)
elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Time: 9.20899


In [28]:
# Evaluate the tapering model: score on the sample it was fitted on, the share
# of each department in that sample, 5-fold cross-validation, and how often
# each department is predicted.
started = time.time()

sample_score = mlpserial.score(Xsam, Ysam)
print('Score: ', sample_score)

department_share = Ysam.value_counts() / len(Ysam)
print(department_share)

from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(mlpserial, Xsam, Ysam, cv=5)
print(fold_scores)

predicted_departments = pd.Series(mlpserial.predict(Xsam))
print(predicted_departments.value_counts())

elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Score:  0.5262661510893143
Prints & Illustrated Books    0.526266
Photography                   0.228992
Architecture & Design         0.111195
Drawings                      0.100821
Painting & Sculpture          0.032727
Name: Department, dtype: float64
[0.52591894 0.52591894 0.5261669  0.52641509 0.52691218]
Prints & Illustrated Books    10603
dtype: int64
Time: 41.22320


In [24]:
# How often the single-layer model predicts each department on the sample.
mlp_predictions = pd.Series(mlp.predict(Xsam))
mlp_predictions.value_counts()

Prints & Illustrated Books    10040
Architecture & Design           314
Painting & Sculpture            249
dtype: int64

In [25]:
# How often the 10x100 model predicts each department on the sample.
mlp100_predictions = pd.Series(mlp100.predict(Xsam))
mlp100_predictions.value_counts()

Prints & Illustrated Books    10040
Architecture & Design           314
Painting & Sculpture            249
dtype: int64

In [26]:
# How often the logistic model predicts each department on the sample.
mlplog_predictions = pd.Series(mlplog.predict(Xsam))
mlplog_predictions.value_counts()

Prints & Illustrated Books    5599
Photography                   4010
Architecture & Design          767
Painting & Sculpture           227
dtype: int64

In [31]:
# Logistic activation with alpha raised to 1e-2, on two hidden layers
# (1000 units, then 100 units).
started = time.time()

alpha_settings = {
    'hidden_layer_sizes': (1000, 100),
    'random_state': 42,
    'activation': 'logistic',
    'alpha': 1e-2,
}
mlpalpha = MLPClassifier(**alpha_settings)
mlpalpha.fit(Xsam, Ysam)
elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

# Evaluate: score on the sample it was fitted on, the share of each department
# in that sample, 5-fold cross-validation, and how often each department is
# predicted.
started = time.time()

sample_score = mlpalpha.score(Xsam, Ysam)
print('Score: ', sample_score)

department_share = Ysam.value_counts() / len(Ysam)
print(department_share)

from sklearn.model_selection import cross_val_score
fold_scores = cross_val_score(mlpalpha, Xsam, Ysam, cv=5)
print(fold_scores)

predicted_departments = pd.Series(mlpalpha.predict(Xsam))
print(predicted_departments.value_counts())

elapsed = time.time() - started
print("Time: {:0.5f}".format(elapsed))

Time: 31.12621
Score:  0.5439026690559275
Prints & Illustrated Books    0.526266
Photography                   0.228992
Architecture & Design         0.111195
Drawings                      0.100821
Painting & Sculpture          0.032727
Name: Department, dtype: float64
[0.67012253 0.67624882 0.68363979 0.62641509 0.55712937]
Prints & Illustrated Books    9834
Architecture & Design          657
Painting & Sculpture           112
dtype: int64
Time: 907.47797


I don't think I have included enough data in the model for the actual predictions to be accurate. This model runs too slowly for me to really experiment with the parameters or to include more data.