# Data Preprocessing

This notebook contains the preprocessing of the dataset I collected (1946-1963) and the code used to merge Mona Alaert's dataset (1893-1934), Elisabeth Jansen's dataset (1934-1946) and mine. Ultimately creating a dataset combining all the collected data (1893-1963).

## Cleaning up the data

In [1]:
# Loading my dataset (1946-1963)

import pandas as pd

df = pd.read_csv('dataset.csv', sep=';')
df.sample(10)

Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language
1837,Scan 2024-02-26 1157-35.jpeg,1949-1950,,,,ned
60,Scan 2024-02-14 1132-11.jpeg,1946-1947,,,,ned
4211,Scan 2024-03-11 1158-39.jpeg,1953-1954,,,,ned
8777,Scan 2024-03-25 1621-193.jpeg,1959-1960,,,,ned
3749,Scan 2024-02-28 1354-5.jpeg,1952-1953,,,,ned
9862,Scan 2024-04-29 0943-57.jpeg,1960-1961,,,,ned
9180,Scan 2024-03-25 1248-278.jpeg,1959-1960,,,,ned
8168,Scan 2024-03-25 1550-10.jpeg,1959-1960,Fidelio,3-11-1959;5-11-1959,galavoorstelling,ned
10714,Scan 2024-04-29 1150-1.jpeg,1960-1961,,,,ned
3282,Scan 2024-02-28 1142-174.jpeg,1951-1952,,,,ned


In [2]:
# Checking the size of the loaded dataset

print(df.shape)

(12451, 6)


In [3]:
# Removing irrelevant pages, i.e. the production and the date of performance are missing

df = df.dropna(subset=['Production', 'Date'], how='all')
print(f"Dataset now has {len(df)} rows!")

Dataset now has 3366 rows!


In [4]:
# Cells with multiple dates are split so that each row has one date and can be easily converted to datetime
df['Date'] = df['Date'].astype(str)
df['Date'] = df['Date'].str.split(';')
df = df.explode('Date')
df['Date'] = df['Date'].str.strip()

In [5]:
# Converting Date column to datetime

df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

In [6]:
df.sample(10)

Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language
3091,Scan 2024-02-28 1109-167.jpeg,1951-1952,Don Pasquale,1952-04-24,,ned
4220,Scan 2024-03-11 1158-48.jpeg,1953-1954,Lohengrin,1954-01-21,,ned
5279,Scan 2024-03-11 1549-378.jpeg,1954-1955,Een Sotternij + Kobolt + Wals + Nonsens + Joha...,1955-06-17,,ned
10976,Scan 2024-04-29 1242-100.jpeg,1960-1961,Cavalleria Rusticana + Paljas,1961-04-15,,ned
11550,Scan 2024-04-29 1403-272.jpeg,1961-1962,Het Driemeisjeshuis,1962-01-07,,ned
238,Scan 2024-02-14 1204-90.jpeg,1946-1947,Madame Butterfly + Haute Couture,1947-02-23,,ned
117,Scan 2024-02-14 1150-18.jpeg,1946-1947,Faust,1946-12-03,,ned
4427,Scan 2024-03-11 1233-104.jpeg,1953-1954,Enoch Arden,1954-04-11,,ned
6898,Scan 2024-03-18 1556-86.jpeg,1957-1958,Lohengrin,1958-02-11,galavoorstelling,ned
9703,Scan 2024-03-25 1354-301.jpeg,1959-1960,De Barbier van Sevilla,1960-06-11,,ned


In [7]:
# Delete trailing and leading whitespaces

df['Image'] = df['Image'].str.strip() 
df['Collection'] = df['Collection'].str.strip() 
df['Production'] = df['Production'].str.strip() 
df['Occasion'] = df['Occasion'].str.strip()
df['Leaflet Language'] = df['Leaflet Language'].str.strip()

In [8]:
# Create a new column (boolean) to indicate whether the performance was part of a multiple bill (True) or not (False)

multiple_bills = []
for row in df['Production']:
    if '+' in str(row):
        multiple_bills.append(True)
    else:
        multiple_bills.append(False)


df['Multiple Bill'] = multiple_bills
df['Multiple Bill'].value_counts()

False    3154
True      444
Name: Multiple Bill, dtype: int64

In [9]:
# Create a new column assigning a random ID to each performance

import random
df['Performance_ID'] = random.sample(range(100000, 999999), df.shape[0])
df.head()

Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language,Multiple Bill,Performance_ID
12,Scan 2024-02-14 1120-13.jpeg,1946-1947,Othello,1946-09-28,openingsvoorstelling,ned,False,667941
20,Scan 2024-02-14 1120-21.jpeg,1946-1947,Othello,1946-09-28,openingsvoorstelling,ned,False,661197
22,Scan 2024-02-14 1120-23.jpeg,1946-1947,Traviata,1946-09-29,,ned,False,989553
24,Scan 2024-02-14 1120-25.jpeg,1946-1947,Faust,1946-09-29,,ned,False,170892
27,Scan 2024-02-14 1120-28.jpeg,1946-1947,De Vogelhandelaar,1946-10-01,galavoorstelling,ned,False,909508


In [10]:
# Cells that include a '+' , i.e. a multiple bill are split so that each row only contains one performance
# Using Performance_ID to keep track of those performances that were part of a multiple bill

df['Production'] = df['Production'].astype(str)
df['Production'] = df['Production'].str.split('+')
df = df.explode('Production')
df['Production'] = df['Production'].str.strip()
df = df.reset_index(drop=True)
df.sample(5)

Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language,Multiple Bill,Performance_ID
3300,Scan 2024-03-25 1621-143.jpeg,1959-1960,Het Rijngoud,1960-01-23,,ned,False,618117
943,Scan 2024-02-26 1206-9.jpeg,1949-1950,De Toverfluit,1950-06-04,,ned,False,603186
2794,Scan 2024-03-18 1529-119.jpeg,1957-1958,Lakmé,1957-10-29,,ned,False,559814
3437,Scan 2024-03-25 1354-238.jpeg,1959-1960,Het Marialeven,1960-05-31,,ned,False,434883
196,Scan 2024-02-14 1232-19.jpeg,1946-1947,Ballet Suite,1947-05-26,,ned,True,635831


In [11]:
print(f"Dataset now has {len(df)} rows")

Dataset now has 4116 rows


In [12]:
# Spelling normalization: dictionary to keep track of spelling variations

VariatieC = {
    "Bohème" : ["Boheme", "Bohême", "La Boheme"],
    "Het Land van den Glimlach" : ["Het Land van de Glimlach", "Het land van de glimlach"],
    "Norma" : ["La Norma"],
    "De Schim van de Roos" : ["De schim van de Roos"],
    "De Troubadour" : ["Il Trobadore", "Il Trovatore"],
    "Khowanstchina" : ["Chowansjtsjina", "Khovanstsjina"],
    "Taglioni bij Mussette" : ["Taglioni bij Musette"],
    "De Goochelaar van O.L. Vrouw" : ["de goochelaar van O.L. Vrouw"],
    "Pièces Brillantes" : ["Pieces Brillantes"],
    "Prélude à l'après-midi d'un Faune" : ["Prélude à l'après-midi d'un faune", "Prélude à L'Après -midi d'un Faune", "Prélude à L'Après-Midi d'un Faune", "Prélude à L'Après-midi d'un Faune"],
    "Scheherazade" : ["Sheherazade"],
    "De Blauwe Vogel" : ["De blauwe Vogel"],
    "Assepoes" : ["Cinderella"],
    "L'Arlesiana" : ["Arlequinade"],
    "Thaïs" : ["Thais"],
    "De Barbier van Sevilla" : ["Il Barbiere di Seviglia"],
    "Romeo en Julia" : ["Romeo et Juliette", "Romeo en Juliette"],
    "De Graaf van Luxemburg" : ["Graaf van Luxemburg"],
    "Lucia van Lammermoor" : ["Lucie de Lammermoor", "Lucia di Lammermoor"],
    "Cosi fan Tutte" : ["Cosi Fan Tutte"],
    "André Chénier" : ["Andrea Chenier"],
    "La Grande Tentation de Saint-Antoine" : ["De grote verzoeking van Sint Antonius", "De grote Verzoeking van Sint Antonius", "De Grote Verzoeking van Sint Antonius"],
    "Driemeisjeshuis" : ["Drie-Meisjeshuis", "Het Driemeisjeshuis"],
    "Het Gemaskerd Bal" : ["Un Ballo Il Maschera"],
    "Quatuor" : ["Quator"],
    "Tannhauser" : ["Tannhäuser"],
    "Tooverfluit" : ["De Toverfluit", "De Tooverfluit"],
    "Aïda" : ["Aida"],
    "Don Giovanni" : ["Don Juan"],
    "De Parelvisschers" : ["De Parelvissers"],
    "Lakme" : ["Lakmé"],
    "Madame Butterfly" : ["Madama Butterfly"],
    "Salomé" : ["Salome"],
    "Hoffmann's Vertellingen" : ["Hoffmanns Vertellingen", "Hoffmans Vertellingen"],
    "Czaar en Timmerman" : ["Tsaar en Timmerman"],
    "Ballet-suite" : ["Ballet Suite", "Ballet-Suite"],
    "Het Rijngoud" : ["Rijngoud"],
    "Willem Tell" : ["Wilhelm Tell"],
    "De Walkure" : ["De Walküre"],
    "De Doode Oogen" : ["De Dode Ogen"],
    "Boris Godouwnow" : ["Boris Godounow", "Boris Godoenov"],
    "De Vliegende Hollander" : ["De vliegende Hollander"],
    "Namouna" : ["Namouma"],
    "De Wonderbare Mandarijn" : ["De wonderbare Mandarijn"],
    "Tosca" : ["La Tosca"],
    "De Meesterzangers van Nurenberg" : ["De Meesterzangers van Neurenberg"],
    "Thijl Uilenspiegel" : ["Thyl Uilenspiegel"],
    "Traviata" : ["La Traviata"],
    "Czarsdasvorstin" : ["De Czardasvorstin"],
    "De Heilige van Bleecker Street" : ["De Heilige van de Bleecker Street"],
    "In het Witte Paard" : ["In 't Witte Paard"],
    "De Rozenkavalier" : ["De Rozencavalier"],
    "Vierde Symfonie" : ["Vierde Symphonie"],
    "De Regenboog" : ["Regenboog"],
    "Falstaff" : ["Fallstaff"],
    "Het Kasteel van Blauwbaard" : ["Het kasteel van Blauwbaard"],
    "De Antikwaar" : ["L'Antiquaire"],
    "Uitnodiging tot de dans" : ["Uitnodiging tot de Dans"],
    "De Bacchanale" : ["Bacchanale"],
    "Roemeense Rhapsodie" : ["Roemeense Rapsodie"],
    "De Sylphiden" : ["Sylfiden", "De Sylfide", "De Sylfiden"],
    "Jenufa" : ["Jenoefa"],
}

In [13]:
# Function that iterates over the values in the dictionary and returns the key if the title is found in these values
# If the title is not in the dictionary, there is not spelling variation for this title in the dataset

def clean_titles (row):

    for key, values in VariatieC.items():
        if str(row) in str(values):
            #print('it is in the values')
            return key
        elif row == key:
            return key
    else:
        return row 

In [14]:
# Quick test on some frequent variations

a = "Hoffmans Vertellingen"
b = "Tannhäuser"
c = "Salome"

print(clean_titles(a))
print(clean_titles(b))
print(clean_titles(c))

Hoffmann's Vertellingen
Tannhauser
Salomé


In [15]:
# Now applying this function to the entire dataset, the normalized spelling is gathered in a new column called 'Normalized Title'

df['Normalized Title'] = df['Production'].apply(clean_titles)

In [16]:
# Removing duplicates, i.e. the production and date are the same

df = df.drop_duplicates(subset=['Production', 'Date'])
print(f"Dataset now has {len(df)} rows")
df.sample(10)

Dataset now has 4042 rows


Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language,Multiple Bill,Performance_ID,Normalized Title
3047,Scan 2024-03-25 1047-239.jpeg,1958-1959,La Norma,1959-03-17,galavoorstelling,ned,False,526629,Norma
2280,Scan 2024-03-11 1549-371.jpeg,1954-1955,Elektra,1955-06-30,,ned,False,950911,Elektra
744,Scan 2024-02-16 1136-30.jpeg,1948-1949,Paganini,1949-07-21,,ned,False,397056,Paganini
3802,Scan 2024-04-29 1403-328.jpeg,1961-1962,Carmina Burana,1962-02-01,,ned,True,174489,Carmina Burana
2230,Scan 2024-03-11 1549-269.jpeg,1954-1955,Paljas,1955-05-01,,ned,True,180200,Paljas
1326,Scan 2024-02-26 1519-79.jpeg,1951-1952,De Lustige Weduwe,1952-01-05,,ned,False,274171,De Lustige Weduwe
239,Scan 2024-02-14 1232-89.jpeg,1946-1947,Faust,1947-07-13,,ned,False,478135,Faust
815,Scan 2024-02-26 1104-22.jpeg,1949-1950,Peter Grimes,1949-12-15,,ned,False,586159,Peter Grimes
1262,Scan 2024-02-26 1448-25.jpeg,1951-1952,Othello,1951-10-21,,ned,False,350390,Othello
1244,Scan 2024-02-26 1420-37.jpeg,1951-1952,Concert,1951-10-02,,ned,False,742667,Concert


## Adding metadata

Loading a dictionary-formatted txt-file containing additional metadata. This file was created by Mona Allaert and supplemented with new metadata by Elisabeth Jansen and me.
- Key = normalized title
- Value [0] = original language
- Value [1] = composer
- Value [2] = year of first performance
- Value [3] = original title
- Value [4] = genre
- Value [5] = language of normalized title

In [17]:
# Loading the metadata

with open('META3.txt','r', encoding = 'UTF8') as inf:
    Metadata = eval(inf.read())

In [18]:
# Check if the file is formatted correctly

output = []
for key, values in Metadata.items():
    if len(values) != 6:
       output.append(values)
       print(output)

if len(output) == 0:
    print('Looking good!')

Looking good!


In [19]:
# Function that identifies the original language based on the metadata

def add_language (row):
    for key, values in Metadata.items():
        if row == key:
            return values[0]
    else:
        return 'N/A'

# Testing

a = "Madame Butterfly"
b = "Tannhauser"
c = "Quinten Massijs"

print(add_language(a))
print(add_language(b))
print(add_language(c))

ITA
DUI
NL


In [20]:
# Function that identifies the composer based on the metadata

def add_composer (row):
    for key, values in Metadata.items():
        if row == key:
            return values[1]
    else:
        return 'N/A'
    
# Testing

print(add_composer(a))
print(add_composer(b))
print(add_composer(c))

Giacomo Puccini
Richard Wagner
Emile Wambach


In [21]:
# Function that identifies the year of the first performance based on the metadata

def add_premiere (row):
    for key, values in Metadata.items():
        if row == key:
            return values[2]
    else:
        return 'N/A'

print(add_premiere(a))
print(add_premiere(b))
print(add_premiere(c))

1904
1845
1899


In [22]:
# Function that identifies the original title based on the metadata

def add_ortitel (row):
    for key, values in Metadata.items():
        if row == key:
            return values[3]
    else:
        return 'N/A'


print(add_ortitel(a))
print(add_ortitel(b))
print(add_ortitel(c))

Madama Butterfly
Tannhäuser
Quinten Massys


In [23]:
# Function that identifies the genre based on the metadata

def add_genre (row):
    for key, values in Metadata.items():
        if row == key:
            return values[4].lower()
    else:
        return 'N/A'


print(add_genre(a))
print(add_genre(b))
print(add_genre(c))

opera seria
romantische oper
lyrisch drama


In [24]:
# Function that identifies the language of the title on the leaflet based on the metadata

def add_title_lang (row):
    for key, values in Metadata.items():
        if row == key:
            return values[5]
    else:
        return 'N/A'


print(add_title_lang(a))
print(add_title_lang(b))
print(add_title_lang(c))

NL
Unk
Unk


In [25]:
# Add metadata as new columns in the dataframe

df['Original Language'] = df['Normalized Title'].apply(add_language)
df['Composer'] = df['Normalized Title'].apply(add_composer)
df['Original Premiere'] = df['Normalized Title'].apply(add_premiere)
df['Original Title'] = df['Normalized Title'].apply(add_ortitel)
df['Genre'] = df['Normalized Title'].apply(add_genre)
df['Title Language'] = df['Normalized Title'].apply(add_title_lang)

df.sample(10)

Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language,Multiple Bill,Performance_ID,Normalized Title,Original Language,Composer,Original Premiere,Original Title,Genre,Title Language
3810,Scan 2024-04-29 1515-15.jpeg,1961-1962,Cosi fan Tutte,1962-02-18,,ned,False,646685,Cosi fan Tutte,ITA,Wolfgang Amadeus Mozart,1794,"Cosi fan tutte, ossia La scuola degli amanti",opera buffa,ITA
3136,Scan 2024-03-25 1139-241.jpeg,1958-1959,Faust,1959-06-18,,ned,False,489145,Faust,FR,Charles Gounod,1859,Faust,grand opera,Unk
1302,Scan 2024-02-26 1519-11.jpeg,1951-1952,Faust,1951-12-02,,ned,False,287729,Faust,FR,Charles Gounod,1859,Faust,grand opera,Unk
2646,Scan 2024-03-18 1401-103.jpeg,1956-1957,Pygmalioon,1957-02-17,,ned,True,967201,Pygmalioon,NL,Johannes den Hertog,1957,Pygmalioon,opera,NL
1404,Scan 2024-02-28 1109-68.jpeg,1951-1952,Malafonte,1952-03-20,,ned,False,772632,Malafonte,?,Angelo Francesco Lavagnino,1952,Malafonte,lyrisch drama,?
3358,Scan 2024-03-25 1248-276.jpeg,1959-1960,De Troubadour,1960-03-24,,ned,False,458410,De Troubadour,ITA,Giuseppe Verdi,1853,Il trovatore,romantische opera,NL
3382,Scan 2024-03-25 1308-48.jpeg,1959-1960,Het Zwanenmeer,1960-04-10,,ned,False,326841,Het Zwanenmeer,RUS,Pjotr Tsjaikovski,1877,Лебединое озеро,ballet,NL
3327,Scan 2024-03-25 1248-13.jpeg,1959-1960,Boheme,1960-02-20,,ned,False,413038,Bohème,ITA,Giacomo Puccini,1897,La bohème,verismo opera,Unk
2538,Scan 2024-03-18 1335-75.jpeg,1956-1957,Lucia van Lammermoor,1956-10-14,,ned,True,973324,Lucia van Lammermoor,ITA,Gaetano Donizetti,1835,Lucia di Lammermoor,opera,NL
815,Scan 2024-02-26 1104-22.jpeg,1949-1950,Peter Grimes,1949-12-15,,ned,False,586159,Peter Grimes,ENG,Benjamin Britten,1945,Peter Grimes,operette,Unk


## More preprocessing...

In [26]:
# Obtaining the season, an operatic season runs (approximately) from September to May
def get_season(date):
    year = date.year
    if date.month >= 9:
        return f"{year}-{year+1}"
    else:
        return f"{year-1}-{year}"


df['Season'] = df['Date'].apply(get_season)

In [27]:
# Create a column with the directors per season

def get_director(season):
    season_directors = {
        '1946-1947' : 'August Baeyens',
        '1947-1948' : 'August Baeyens',
        '1948-1949' : 'Karel Bogaers',
        '1949-1950' : 'Karel Bogaers',
        '1950-1951' : 'Karel Bogaers',
        '1951-1952' : 'Robert Herberigs',
        '1952-1953' : 'Robert Herberigs',
        '1953-1954' : 'August Baeyens',
        '1954-1955' : 'August Baeyens',
        '1955-1956' : 'August Baeyens',
        '1956-1957' : 'August Baeyens',
        '1957-1958' : 'August Baeyens',
        '1958-1959' : 'Mina Bolotine',
        '1959-1960' : 'Mina Bolotine',
        '1960-1961' : 'Mina Bolotine',
        '1961-1962' : 'Renaat Verbruggen',
        '1962-1963' : 'Renaat Verbruggen',
    }
    return season_directors.get(season, 'unk')  # Default to 'unk' if not found

df['Directors'] = df['Season'].apply(get_director)

df.head()

Unnamed: 0,Image,Collection,Production,Date,Occasion,Leaflet Language,Multiple Bill,Performance_ID,Normalized Title,Original Language,Composer,Original Premiere,Original Title,Genre,Title Language,Season,Directors
0,Scan 2024-02-14 1120-13.jpeg,1946-1947,Othello,1946-09-28,openingsvoorstelling,ned,False,667941,Othello,TSJ,Antonín Dvořák,1891,Othello,opera seria,Unk,1946-1947,August Baeyens
2,Scan 2024-02-14 1120-23.jpeg,1946-1947,Traviata,1946-09-29,,ned,False,989553,Traviata,ITA,Giuseppe Verdi,1853,La traviata,opera semiseria,Unk,1946-1947,August Baeyens
3,Scan 2024-02-14 1120-25.jpeg,1946-1947,Faust,1946-09-29,,ned,False,170892,Faust,FR,Charles Gounod,1859,Faust,grand opera,Unk,1946-1947,August Baeyens
4,Scan 2024-02-14 1120-28.jpeg,1946-1947,De Vogelhandelaar,1946-10-01,galavoorstelling,ned,False,909508,De Vogelhandelaar,DUI,Carl Zeller,1891,Der Vogelhändler,operette,NL,1946-1947,August Baeyens
5,Scan 2024-02-14 1120-30.jpeg,1946-1947,Othello,1946-10-03,,ned,False,176997,Othello,TSJ,Antonín Dvořák,1891,Othello,opera seria,Unk,1946-1947,August Baeyens


In [29]:
# Saving the cleaned dataset

# df.to_csv('cleaned_dataset.csv', encoding = 'utf-8')

## Merging datasets

In [30]:
# Loading the cleaned dataset containing Mona's and Elisabeth's data

df_1893_1946 = pd.read_csv("Merged_Dataset.csv", encoding='utf-8', index_col=False)
df_1893_1946.head()

Unnamed: 0.1,Unnamed: 0,Image,Collection,Production,Date,Occasion,Multiple bill,Performances_ID,Normalised title,Original language,Composer,Original premiere,Original title,genre,Season,Directors,Title_Language,Leaflet_Language,Opmerking
0,0,K.V.O. programmas 1893-97_00005-20211201_11335...,1893-1897,De Vrijschutter,1893-10-03,Eerste vertoning,False,903641,De Vrijschutter,DUI,Carl Maria von Weber,1821,Der Freischütz,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,NL,,
1,1,K.V.O. programmas 1893-97_00009-20211201_11343...,1893-1897,Willem Tell,1893-10-05,Eerste vertoning,False,316913,Willem Tell,DUI,Carl Reinecke,1971,Musik zu Schiller’s 'Wilhelm Tell,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,Unk,,
2,2,K.V.O. programmas 1893-97_00011-20211201_11345...,1893-1897,De Vrijschutter,1893-10-10,,False,135474,De Vrijschutter,DUI,Carl Maria von Weber,1821,Der Freischütz,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,NL,,
3,3,K.V.O. programmas 1893-97_00013-20211201_11351...,1893-1897,Willem Tell,1893-10-12,,False,520507,Willem Tell,DUI,Carl Reinecke,1971,Musik zu Schiller’s 'Wilhelm Tell,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,Unk,,
4,4,K.V.O. programmas 1893-97_00015-20211201_11352...,1893-1897,Charlotte Corday,1893-11-16,,False,446314,Charlotte Corday,NL,Peter Benoit,?,Charlotte Corday,lyrisch drama,1893-1894,Edward Keurvels & Henry Fontaine,Unk,,


In [31]:
# Checking shape of their merged dataset

df_1893_1946.shape

(6767, 19)

In [32]:
# Making sure the columns have the same name (column 'opmerking' is removed because it is not relevant for my thesis)

df_1893_1946 = df_1893_1946.rename(columns={"Multiple bill" : "Multiple Bill", 
                                            "Performances_ID" : "Performance_ID", 
                                            "Normalised title" : "Normalized Title",
                                            "Original language" : "Original Language",
                                            "Original premiere" : "Original Premiere",
                                            "Original title" : "Original Title",
                                            "genre" : "Genre",
                                            "Title_Language" : "Title Language",
                                            "Leaflet_Language" : "Leaflet Language"
                                           })

df_1893_1946 = df_1893_1946.drop(columns=["Opmerking"])

df_1893_1946.head()

Unnamed: 0.1,Unnamed: 0,Image,Collection,Production,Date,Occasion,Multiple Bill,Performance_ID,Normalized Title,Original Language,Composer,Original Premiere,Original Title,Genre,Season,Directors,Title Language,Leaflet Language
0,0,K.V.O. programmas 1893-97_00005-20211201_11335...,1893-1897,De Vrijschutter,1893-10-03,Eerste vertoning,False,903641,De Vrijschutter,DUI,Carl Maria von Weber,1821,Der Freischütz,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,NL,
1,1,K.V.O. programmas 1893-97_00009-20211201_11343...,1893-1897,Willem Tell,1893-10-05,Eerste vertoning,False,316913,Willem Tell,DUI,Carl Reinecke,1971,Musik zu Schiller’s 'Wilhelm Tell,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,Unk,
2,2,K.V.O. programmas 1893-97_00011-20211201_11345...,1893-1897,De Vrijschutter,1893-10-10,,False,135474,De Vrijschutter,DUI,Carl Maria von Weber,1821,Der Freischütz,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,NL,
3,3,K.V.O. programmas 1893-97_00013-20211201_11351...,1893-1897,Willem Tell,1893-10-12,,False,520507,Willem Tell,DUI,Carl Reinecke,1971,Musik zu Schiller’s 'Wilhelm Tell,romantische oper,1893-1894,Edward Keurvels & Henry Fontaine,Unk,
4,4,K.V.O. programmas 1893-97_00015-20211201_11352...,1893-1897,Charlotte Corday,1893-11-16,,False,446314,Charlotte Corday,NL,Peter Benoit,?,Charlotte Corday,lyrisch drama,1893-1894,Edward Keurvels & Henry Fontaine,Unk,


In [33]:
# Checking shape of my dataset

df.shape

(4042, 17)

In [34]:
# Combining the dataset

df = pd.concat([df_1893_1946, df], ignore_index=True)
df.sample(10)

Unnamed: 0.1,Unnamed: 0,Image,Collection,Production,Date,Occasion,Multiple Bill,Performance_ID,Normalized Title,Original Language,Composer,Original Premiere,Original Title,Genre,Season,Directors,Title Language,Leaflet Language
8823,,Scan 2024-03-11 1427-150.jpeg,1954-1955,Tosca,1954-11-21 00:00:00,,True,924987,Tosca,ITA,Giacomo Puccini,1900,Tosca,verismo opera,1954-1955,August Baeyens,Unk,ned
4293,4293.0,0036_Untitled document_00037-20230218_103936.jpg,1933-1934,Hoffman’s Vertellingen,1933-10-17,GAKVO-gala,False,691802,Hoffman’s Vertellingen,FR,Jacques Offenbach,1881,Les contes d'Hoffmann,Opéra fantastique,1933-1934,Flor Bosmans,NL,Dutch
9400,,Scan 2024-03-18 1401-138.jpeg,1956-1957,Traviata,1957-03-09 00:00:00,,True,963843,Traviata,ITA,Giuseppe Verdi,1853,La traviata,opera semiseria,1956-1957,August Baeyens,Unk,ned
1789,1789.0,1913-1914 Vlaamse Opera_00037-20211216_164948.jpg,1913-1914,Het Minnebrugje,1913-11-02,,False,313119,Het Minnebrugje,NL,Arthur Van Oost,?,Het Minnebrugje,zangspel,1913-1914,Henry Fontaine,NL,
6253,6253.0,0276_1943-1944_00276-20230306_155447.jpg,1943/1944,Faust,1944-02-10,,False,541220,Faust,FR,Charles Gounod,1859,Faust,grand opera,1943-1944,Joris Diels,Unk,Dutch
6311,6311.0,0378_1943-1944_00378-20230306_162019.jpg,1943/1944,De Lustige Weduwe,1944-04-09,,False,339307,De Lustige Weduwe,DUI,Franz Lehár,1906,Die lustige Witwe,operette,1943-1944,Joris Diels,NL,Dutch
7133,,Scan 2024-02-14 1427-103.jpeg,1947-1948,Italiaans Capriccio,1948-02-14 00:00:00,,True,683284,Italiaans Capriccio,ITA,Peter Tsjaikovsky,1880,Capriccio italien,?,1947-1948,August Baeyens,NL,ned
9932,,Scan 2024-03-25 1453-327.jpeg,1959-1960,Vijfde Symfonie,1959-11-21 00:00:00,,True,634424,Vijfde Symfonie,?,Pjotr Iljitsj Tsjaikovski,1888,?,ballet,1959-1960,Mina Bolotine,NL,ned
8135,,Scan 2024-02-26 1537-91.jpeg,1951-1952,Fedora,1952-02-26 00:00:00,,False,340798,Fedora,ITA,Umberto Giordano,1898,Fedora,?,1951-1952,Robert Herberigs,Unk,ned
1913,1913.0,1918-1919_00020-20220207_094032.jpg,1918-1919,De Bruid der Zee,1918-11-21,,False,309573,De Bruid der Zee,NL,Jan Blockx,1901,De Bruid der Zee,romantische opera,1918-1919,Henry Fontaine,NL,


In [35]:
# Again checking the shape

df.shape

(10809, 18)

In [36]:
# Delete all trailing and leading whitespace

for column in df.columns:
    try:
        df[column] = df[column].str.strip()
    except AttributeError:
        continue

In [37]:
# Convert the date to datetime

df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

In [38]:
# Save merged dataframe (1893-1963)

# df.to_csv('complete_dataset.csv', encoding = 'utf-8')