# Museum of Modern Art data analysis:

Import the necessary libraries in order to provide a connection to the Postgres database:

In [21]:
import pandas as pd
from assets.sql_wrapper import SQLConnection
import numpy as np
import plotly.express as px

## Data pre processing:

In [2]:
import os
import dotenv

# Read the .env file into the process environment; override=True is passed so
# values from the file replace any that are already set.
dotenv.load_dotenv(override=True)

# Database connection settings. Indexing os.environ directly means a missing
# key raises KeyError here rather than failing later at connection time.
username, host, password, db, port = (
    os.environ[key]
    for key in ('SQL_USERNAME', 'SQL_HOST', 'SQL_PASSWORD', 'DBNAME', 'port')
)

In [3]:
# NOTE(review): `host` and `port` are read from the environment in the previous cell
# but are not passed here — confirm SQLConnection obtains them some other way.
sql = SQLConnection(db, username, password) # connection handle; queries below are run through sql.q(...)

Create a function to list the tables in the database:

In [4]:
def list_tables():
    """Return the names of the base tables in the ``public`` schema.

    The lookup goes through the notebook-level ``sql`` connection, and the
    result is whatever ``sql.q`` returns for the query.
    """
    query = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE' AND TABLE_SCHEMA='public'"
    return sql.q(query)

In [5]:
# Display the base tables found in the public schema.
list_tables()

  res = pd.read_sql_query(q.strip(), con)


Unnamed: 0,table_name
0,artist
1,artwork


Selecting the first 10 rows of each table so that we may preview some of the data that resides inside the tables:

In [6]:
# Preview 10 rows of the artist table (no ORDER BY, so the row order is not guaranteed).
sql.q("""SELECT * FROM artist LIMIT 10""")

  res = pd.read_sql_query(q.strip(), con)


Unnamed: 0,artist_id,artist_name,nationality,gender,year_start,year_end
0,304,John Baldessari,American,Male,1931,2020
1,710,Louise Bourgeois,American,Female,1911,2010
2,871,Chris Burden,American,Male,1946,2015
3,1048,Vija Celmins,American,Female,1938,0
4,1156,Chuck Close,American,Male,1940,2021
5,1652,Carroll Dunham,American,Male,1949,0
6,2002,Lee Friedlander,American,Male,1934,0
7,2281,Dan Graham,American,Male,1942,2022
8,2907,Bill Jensen,American,Male,1945,0
9,2923,Jasper Johns,American,Male,1930,0


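Here we can see that the artist table gives each artist's name, nationality, gender, and the start and end of their career (year_start and year_end), all indexed by an artist_id. Note that the sample values (e.g. 1911–2010 for Louise Bourgeois) look more like birth and death years than career dates, with 0 appearing where no end year is recorded. This will be useful later as we can try to filter the artists by nationality and gender.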

In [7]:
# Preview 10 rows of the artwork table (no ORDER BY, so the row order is not guaranteed).
sql.q("""SELECT * FROM artwork LIMIT 10""")

  res = pd.read_sql_query(q.strip(), con)


Unnamed: 0,artwork_id,title,year_completed,department,artist_id
0,1,Cane,2000,Drawings & Prints,4758
1,2,Untitled,2000,Drawings & Prints,7639
2,3,"The Brown Sisters, Eastham, Massachusetts",2000,Photography,4315
3,4,Untitled (for Parkett no. 59),2000,Drawings & Prints,8330
4,5,Self-Portrait (for Parkett no. 60),2000,Drawings & Prints,1156
5,6,Untitled (Thiers knives III),2000,Drawings & Prints,7447
6,7,Untitled (panhandled colander),2000,Drawings & Prints,7447
7,8,The Man in Black/Drone Harness,2000,Drawings & Prints,7005
8,9,Self-Portrait,2000,Drawings & Prints,1156
9,10,Self-Portrait/Scribble/Etching,2000,Drawings & Prints,1156


In [8]:
# Total number of artworks; COUNT(artwork_id) counts only rows whose artwork_id is not NULL.
sql.q("""SELECT COUNT(artwork_id) FROM artwork""")

  res = pd.read_sql_query(q.strip(), con)


Unnamed: 0,count
0,6325


Here we can see that the artwork table gives the title, year completed, department and artist_id, indexed by an artwork_id. We can already see that, of the first 10 rows, 4 have titles containing 'Untitled'; we shall proceed to remove these as they are not included in the visualisations.

In [9]:
# Count artworks whose title contains the substring 'Untitled' anywhere.
# NOTE(review): LIKE is case-sensitive in PostgreSQL, so a lowercase 'untitled' would not match — confirm that is intended.
sql.q("""SELECT COUNT(*) FROM artwork WHERE title LIKE '%Untitled%'""")

  res = pd.read_sql_query(q.strip(), con)


Unnamed: 0,count
0,2201


As we can see, out of 6325 rows, 2201 contain pieces that are untitled.

Selecting only the rows which do not have 'Untitled' in the title:

In [10]:
# Keep only artworks whose title does not contain 'Untitled'.
# NOTE(review): rows with a NULL title also fail a NOT LIKE test in SQL, so they would be excluded too — confirm whether any exist.
no_untitled = sql.q("""SELECT * FROM artwork WHERE title NOT LIKE '%Untitled%'""")
# Bare name as the last expression so the notebook renders the result.
no_untitled

  res = pd.read_sql_query(q.strip(), con)


Unnamed: 0,artwork_id,title,year_completed,department,artist_id
0,1,Cane,2000,Drawings & Prints,4758
1,3,"The Brown Sisters, Eastham, Massachusetts",2000,Photography,4315
2,5,Self-Portrait (for Parkett no. 60),2000,Drawings & Prints,1156
3,8,The Man in Black/Drone Harness,2000,Drawings & Prints,7005
4,9,Self-Portrait,2000,Drawings & Prints,1156
...,...,...,...,...,...
4119,6321,#nyc,2018,Photography,132145
4120,6322,#nyc,2018,Photography,132145
4121,6323,#nyc,2018,Photography,132145
4122,6324,#nyc,2018,Photography,132145


## Data processing:

Now we have removed the untitled pieces we may proceed to process the data according to the following requirements:

- View the number of artworks available in the collection
- See a breakdown of artist demographics (i.e. gender, nationality)

### View number of artworks available in the collection:

In [None]:
# Keep one row per distinct title (the last occurrence) so repeated titles are counted once.
# `subset` must name the column exactly: 'title', with no trailing comma inside the string,
# otherwise drop_duplicates raises KeyError for the unknown column.
no_untitled_dup = no_untitled.drop_duplicates(subset='title', keep="last")

In [20]:
# Number of rows in the title-deduplicated frame.
len(no_untitled_dup['title'])

2779

Here we can see, using the length of the dataframe with distinct titles, that there are currently 2779 pieces available in the collection. However, more than one department may have worked on the same piece, so we must take this into account in the following analysis.

#### Number of artworks completed by nationality (have separate filters for date range and department)

#### Number of artworks completed by gender (have separate filters for date range and department)

### See a breakdown of artist demographics:

Since the dashboard requires us to filter by department and date range, we will also take this into account and create graphs for each:

In [23]:
# Load the whole artist table; the demographic charts below are built from this frame.
artist_db = sql.q("""SELECT * FROM artist""")

  res = pd.read_sql_query(q.strip(), con)


#### Proportion of artist gender

In [42]:
# Split the artist table by recorded gender; rows with any other value land in neither frame.
male_artists = artist_db.loc[artist_db['gender'].eq('Male')]
female_artists = artist_db.loc[artist_db['gender'].eq('Female')]

# Labels and counts are kept in the same order so they pair up slice by slice in the pie chart.
m_f_list = ['Male', 'Female']
m_f_artists = [len(subset) for subset in (male_artists, female_artists)]


In [45]:
# Two-slice pie: slice sizes come from m_f_artists, slice labels from m_f_list.
fig = px.pie(
    names=m_f_list,
    values=m_f_artists,
    color_discrete_sequence=['#4E0250', '#8fe388'],
    title='Proportion of artist gender',
)

In [46]:
# Render the gender pie chart built in the previous cell.
fig.show()

#### Artist nationality:

In [57]:
# Per-nationality counts: after count(), each remaining column holds the number of
# non-null values in that column for the nationality, so 'artist_name' is used as
# the artist count.
artist_nationality = artist_db.groupby('nationality').count()

# Bar chart of artists per nationality. A title and an explicit y-axis label are set
# so the figure stands alone; without the label the axis would read 'artist_name'
# even though it shows a count.
nationality_quantity = px.bar(
    artist_nationality,
    x=artist_nationality.index,
    y='artist_name',
    color=artist_nationality.index,
    color_discrete_sequence=['#4E0250', '#8fe388'],
    title='Number of artists by nationality',
    labels={'artist_name': 'Number of artists'},
)

##### Total artist nationality

In [58]:
# Render the nationality bar chart built in the previous cell.
nationality_quantity.show()

##### Artist nationality filtered by department

##### Artist nationality filtered by date range