In [43]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
# Load the raw catalogue, then keep only the rows whose file was downloaded.
# `where` blanks every non-matching row to NaN and `dropna(how='all')` removes
# those fully-empty rows; the index is renumbered from 0 afterwards.
pa_latin_art = pd.read_csv('../data_samples/pa_latin_art.csv')
stored_latin_art = (
    pa_latin_art
    .where(pa_latin_art['file_downloaded'] == True)
    .dropna(how='all')
    .reset_index(drop=True)
)

## Exploratory Data Analysis

### Fraction of NA/Null Values per Feature in the Dataset

In [45]:
# Boolean frame with the same shape as the data: True wherever a value is missing.
num_nulls = stored_latin_art.isna()

In [56]:
# The column-wise mean of the boolean mask is the fraction of missing rows per column.
num_nulls = num_nulls.mean()

In [61]:
# Rank columns from most to least missing, then keep only those with any missing values.
num_nulls = num_nulls.sort_values(ascending=False).loc[lambda fractions: fractions > 0]

In [62]:
# Show the missing-value fraction for every column that has at least one null.
num_nulls

assistivetext     1.000000
markings          0.756098
parentid          0.737805
series            0.682927
portfolio         0.539634
provenancetext    0.506098
inscription       0.500000
dimensions        0.307927
ulanid            0.201220
maxpixels         0.152439
lastname          0.009146
dtype: float64

### Number of Artworks in Dataset by Individual Artists, Distribution of Works by Artists in Dataset

In [3]:
# Number of stored artworks per artist. `size()` counts the rows of each group
# directly, replacing the slower Python-level `apply(len)` over every sub-frame
# (which also triggers the grouping-column deprecation warning in pandas >= 2.2).
works_by_artist = stored_latin_art.groupby('preferreddisplayname').size()

In [4]:
# Artists ranked by number of works, followed by summary statistics of those counts.
display(
    works_by_artist.sort_values(ascending=False),
    works_by_artist.describe(),
)

preferreddisplayname
Méndez, Leopoldo         28
Abularach, Rodolfo       22
Iturbide, Graciela       21
Matta                    18
Álvarez Bravo, Manuel    17
                         ..
Lam, Wifredo              1
Larez, Francisco          1
Anguiano, Raúl            1
Meinel, Javier Silva      1
Lovera, Héctor Rondón     1
Length: 68, dtype: int64

count    68.000000
mean      4.823529
std       5.992386
min       1.000000
25%       1.000000
50%       2.000000
75%       6.000000
max      28.000000
dtype: float64

## Feature Engineering

### Converting titles from Spanish to English to match the preloaded English classification model

In [6]:
# Load the pre-translated titles and discard the unnamed column stored in the CSV.
# NOTE(review): this path is relative to the notebook's working directory, unlike
# the '../data_samples/' path used for the main dataset — confirm the file location.
english_titles = pd.read_csv('en_titles.csv').drop(columns='Unnamed: 0')

In [7]:
# english_titles is a DataFrame (read from en_titles.csv), not a Series.
# NOTE(review): this presumably relies on the frame having exactly one column,
# and on its row labels lining up with stored_latin_art's 0..n-1 index — confirm
# that the CSV rows are in the same order as the filtered dataset.
stored_latin_art['en_translation'] = english_titles

In [9]:
# Count how often each lower-cased token occurs across all translated titles.
# Titles are split on single spaces, so consecutive spaces yield empty-string
# tokens, which are counted under the key ''.
text_dict = {}
for entry in stored_latin_art.en_translation:
    for word in entry.split(' '):
        # Trim brackets, commas, slashes, backslashes and periods from both ends.
        # Raw string: the previous non-raw literal contained the invalid escape
        # `\.`, which Python flags with a warning; the stripped characters are the same.
        word = word.strip(r')([],\./').lower()
        text_dict[word] = text_dict.get(word, 0) + 1

In [10]:
# Turn the word -> count mapping into a Series and rank it, most frequent first.
title_occurrences = pd.Series(text_dict)
title_occurrences = title_occurrences.sort_values(ascending=False)

In [11]:
# The index holds the distinct title words, in descending order of frequency.
title_occurrences.index

Index(['the', 'untitled', 'of', 'garden', 'a', 'in', 'woman', 'fontamara',
       'city', 'and',
       ...
       'have', 'photographs', 'blondes', 'world', 'third', 'desolation',
       'bravo', 'álvarez', 'manuel', 'maderista'],
      dtype='object', length=449)

In [12]:
import torch
import torch.nn.functional as fun
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Work on an independent copy of the translated titles so that cleaning them
# does not modify the column stored in the main frame.
en_titles = stored_latin_art['en_translation'].copy()

In [14]:
import re
def clean_words(text):
    """Lower-case a title and trim punctuation from the edges of every word.

    The text is split on single spaces. For each piece, the leading and
    trailing runs of non-word characters (anything that is not a letter,
    digit or underscore) are removed and the piece is lower-cased; characters
    in the middle of a piece, such as inner hyphens, are left alone.

    Parameters
    ----------
    text : str
        The title to normalise.

    Returns
    -------
    str
        The cleaned pieces joined by single spaces. A piece made up only of
        punctuation becomes an empty string, so it leaves two adjacent spaces.
    """
    # Raw string so the regex escape reaches `re` untouched; the previous
    # non-raw literal relied on the invalid string escape `\W`, which Python
    # now reports with a warning. `\W` already covers `{` and `}`, so the
    # character class the original wrapped around it was redundant.
    edge_non_word = r'^\W+|\W+$'
    return ' '.join(
        re.sub(edge_non_word, '', word).lower() for word in text.split(' ')
    )

In [15]:
# Normalise every title (lower-case, punctuation trimmed from word edges) so that
# the same word is written identically wherever it appears.
en_titles = en_titles.map(clean_words)

### Converting text into tokens and one-hot encoding them so words generalize across titles, medium, country of origin, and artist name

In [16]:
# binary=True records only whether a token occurs in a text (0/1), not how often.
one_hot_vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the cleaned titles and encode them in one step.
one_hot = one_hot_vectorizer.fit_transform(en_titles)

In [17]:
# Indicator frame for the titles: one column per vocabulary token.
# The dense array is handed to pandas directly; converting it to nested Python
# lists first only added work and produced the same frame.
title_oh = pd.DataFrame(one_hot.toarray(), columns=one_hot_vectorizer.get_feature_names_out())



In [65]:
# One-hot encode the tokens of the medium descriptions.
# NOTE: this re-fits the shared `one_hot_vectorizer`, so from here on its
# vocabulary describes this column only, no longer the titles.
one_hot = one_hot_vectorizer.fit_transform(stored_latin_art.medium)
# The dense array goes straight into pandas; the former `.tolist()` round-trip
# through nested Python lists was unnecessary.
medium_oh = pd.DataFrame(one_hot.toarray(), columns=one_hot_vectorizer.get_feature_names_out())

In [68]:
# One-hot encode the tokens of the country names (multi-word names contribute
# one column per word).
# NOTE: this re-fits the shared `one_hot_vectorizer`, so from here on its
# vocabulary describes this column only.
one_hot = one_hot_vectorizer.fit_transform(stored_latin_art['Country Name'])
# The dense array goes straight into pandas; the former `.tolist()` round-trip
# through nested Python lists was unnecessary.
country_oh = pd.DataFrame(one_hot.toarray(), columns=one_hot_vectorizer.get_feature_names_out())

In [70]:
# One-hot encode the tokens of the artist display names (each name part becomes
# its own column).
# NOTE: this re-fits the shared `one_hot_vectorizer`, so from here on its
# vocabulary describes this column only.
one_hot = one_hot_vectorizer.fit_transform(stored_latin_art['preferreddisplayname'])
# The dense array goes straight into pandas; the former `.tolist()` round-trip
# through nested Python lists was unnecessary.
artist_oh = pd.DataFrame(one_hot.toarray(), columns=one_hot_vectorizer.get_feature_names_out())

In [78]:
# Place the four indicator frames side by side: title, medium, artist, country.
# NOTE(review): a token present in more than one vocabulary (e.g. 'of') yields
# duplicate column labels in the combined frame — confirm downstream code copes.
one_hot_frames = [title_oh, medium_oh, artist_oh, country_oh]
oh_collection = pd.concat(one_hot_frames, axis='columns')

In [79]:
# Preview the first rows of the combined indicator frame.
oh_collection.head()

Unnamed: 0,17,1962,1972,1977,20,25,5x,abstraction,accident,aid,...,colombia,cuba,guatemala,mexico,nicaragua,of,peru,republic,uruguay,venezuela
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Feature Engineering of Numerical Data

### Feature containing the fraction of the dataset created by each individual artist

In [84]:
import scipy.stats as stats

In [35]:
# Fraction of all stored works attributed to each artist (the values sum to 1).
# `normalize` takes a boolean; the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have normalised too).
percent_by_artist = stored_latin_art.preferreddisplayname.value_counts(normalize=True)

In [87]:
# Standardise each artist's share of the dataset (population z-score, scipy's default ddof=0).
# The result is rebuilt as a Series indexed by artist name because it is later
# looked up by name; scipy documents only an array-like return, so depending on
# it handing back a pandas object for Series input is fragile.
artist_zscore = pd.Series(
    stats.zscore(percent_by_artist.to_numpy()),
    index=percent_by_artist.index,
    name=percent_by_artist.name,
)

In [90]:
# Attach to every artwork the z-score of its artist's share of the dataset.
# `map` does the name -> value lookup in one vectorised pass instead of calling
# a Python lambda per row; a name missing from the lookup would give NaN.
stored_latin_art['artist_zscore'] = stored_latin_art.preferreddisplayname.map(artist_zscore)

In [37]:
# Attach to every artwork the fraction of the dataset produced by its artist.
# `map` does the name -> value lookup in one vectorised pass instead of calling
# a Python lambda per row; a name missing from the lookup would give NaN.
stored_latin_art['percent_by_artist'] = stored_latin_art.preferreddisplayname.map(percent_by_artist)

In [91]:
# Columns that make up the numeric feature block: three columns from the source
# data (presumably image dimensions and a pixel limit — TODO confirm) plus the
# two engineered per-artist features.
numerical_cols = ['width', 'height', 'maxpixels', 'percent_by_artist', 'artist_zscore']

In [92]:
# Pull the numeric feature columns (all rows) out into their own frame.
numerical_data = stored_latin_art.loc[:, numerical_cols]

In [93]:
# Inspect the numeric features.
# NOTE(review): maxpixels has missing values (NaN) that presumably need imputing
# or dropping before modelling — confirm.
numerical_data

Unnamed: 0,width,height,maxpixels,percent_by_artist,artist_zscore
0,2963.0,4000.0,640.0,0.003049,-0.642809
1,5297.0,4171.0,640.0,0.018293,0.197787
2,2926.0,4000.0,640.0,0.054878,2.215217
3,4000.0,3101.0,640.0,0.054878,2.215217
4,4000.0,3084.0,640.0,0.054878,2.215217
...,...,...,...,...,...
323,4949.0,7513.0,,0.015244,0.029668
324,2873.0,4000.0,,0.015244,0.029668
325,2726.0,4000.0,,0.015244,0.029668
326,4000.0,3169.0,,0.015244,0.029668


## Creating the Data Pipeline

## Text Classification Model

In [20]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")