# Link to the paper:

https://github.com/DanilSko/opera

### Roadmap, aka what happens below:

1. Metadata collection and enrichment
1. Visualisation through PCA/t-SNE, filtered by corpus
1. Correlation matrix to exclude redundant measures
1. Discussion of the most distinctive measures





## Initialisation
### Load libraries

### Standard libraries

In [None]:
import os

In [None]:
import time

In [None]:
import random

In [None]:
import json

In [None]:
import re

In [None]:
import math
from datetime import datetime
import requests

### Other PyPI libraries

In [None]:
# if libraries are not installed, remove the hash from the line starting with '!'
# if you want to reproduce an analysis you can add the version number like this:
# requests==2.25.1 pandas==1.2.3 matplotlib==3.3.4
#! pip install requests pandas matplotlib

In [None]:
!pip install pydracor

In [None]:
import pydracor

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import scipy

In [None]:
import numpy as np

In [None]:
import seaborn as sns

In [None]:
import plotly.express as px

In [None]:
import networkx as nx

In [None]:
from scipy import stats

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.decomposition import PCA

In [None]:
import plotly.io as plt_io
import plotly.graph_objects as go


In [None]:
!pip install kaleido==0.1.0 #static image generation, does not work with the newer kaleido 0.2.1

In [None]:
import kaleido

In [None]:
kaleido.__version__

# I. Data preparation

## 1. Downloading corpora metadata from DraCor

In [None]:
gerdata =  pydracor.Corpus('ger')

In [None]:
german_data = pd.DataFrame(gerdata.metadata())

In [None]:
german_data.head()

In [None]:
german_data.columns

In [None]:
german_data.info()

Initial number of explicitly marked libretti

In [None]:
german_data[german_data['libretto'] == True].shape[0]

Are there any libretti which also have genre marked? Look at the ambiguous German plays:

In [None]:
german_data[~(german_data['normalized_genre'].isna()) & (german_data['libretto'] == True)]

None..

Now same for the French

In [None]:
fre_corpus = pydracor.Corpus('fre')

In [None]:
french_data = pd.DataFrame(fre_corpus.metadata())

In [None]:
french_data = pd.DataFrame(french_data)

In [None]:
french_data.columns

Initial number of explicitly marked libretti

In [None]:
french_data[french_data['libretto'] == True].shape[0]

Are there any libretti which also have genre marked? Look at the ambiguous French plays:

In [None]:
french_data[~(french_data['normalized_genre'].isna()) & (french_data['libretto'] == True)]

In [None]:
noamb = french_data[~(french_data['normalized_genre'].isna()) & (french_data['libretto'] == True)].shape[0]

In [None]:
print(f'There are {noamb} ambigous french plays')

### Preliminary stats: Libretti vs traditional genres in the unenriched off-the-shelf corpora

In [None]:
sns.set(rc={'figure.figsize':(10, 5.8)})

In [None]:
# Count plays per category: plays flagged as libretti by DraCor are lumped into
# a single 'Libretto' bucket, all others keep their normalized genre.
# value_counts() drops missing values, so plays without a genre are not shown.
datatoplot = german_data.apply(lambda x: x['normalized_genre'] if x['libretto'] == False
                  else 'Libretto', axis=1).value_counts()

ax = sns.barplot(x=datatoplot.index, y=datatoplot, palette=['darkblue', 'crimson', 'orange', 'green'])
ax.set(title='German Libretto or Genre initial')

# Print the exact count slightly above each bar.
for step, index in enumerate(datatoplot.index):
    ax.text(step, datatoplot.loc[index]+1, str(datatoplot.loc[index]), color='black', ha='center')

In [None]:
# Count plays per category: plays flagged as libretti by DraCor are lumped into
# a single 'Libretto' bucket, all others keep their normalized genre.
# value_counts() drops missing values, so plays without a genre are not shown.
datatoplot = french_data.apply(lambda x: x['normalized_genre'] if x['libretto'] == False
                  else 'Libretto', axis=1).value_counts()

ax = sns.barplot(x=datatoplot.index, y=datatoplot, palette=['darkblue', 'crimson',  'green', 'orange'])
ax.set(title='French Libretto or Genre initial')
# Print the exact count slightly above each bar.
for step, index in enumerate(datatoplot.index):
    ax.text(step, datatoplot.loc[index]+2, str(datatoplot.loc[index]), color='black', ha='center')

## 2. Enrichment of libretti data

## Uploading libretto corpus from Github

Downloading curated libretti corpus from Github repository


In [None]:
corpus_libretti_curated = pd.read_csv('https://github.com/DanilSko/opera/raw/main/data/curated_libretti.csv')

In [None]:
corpus_libretti_curated.shape

In [None]:
corpus_libretti_curated.head()

In [None]:
corpus_libretti_curated['libretto'].value_counts()

In [None]:
corpus_libretti_curated.columns

## 3. Preprocessing

### Feature selection (new, on the whole data)

In [None]:
## all features available:
german_data.columns

In [None]:
len(german_data.columns)

In [None]:
## features we selected manually as the meaningful ones (dropping year and such stuff)
list_features_pyd = ['num_of_segments', 'num_of_speakers', 'num_of_person_groups',
                      'word_count_sp', 'word_count_stage', 'average_degree', 'density', 'average_clustering',
                      'max_degree', 'num_connected_components', 'diameter', 'average_path_length']

In [None]:
len(list_features_pyd)

## Adding Wikidata genre data

### German wikidata enrichment

 (data pre_scraped and published here: https://github.com/DanilSko/opera/tree/main/data/aux)



Querying Wikidata takes time, so here we load the Wikidata genre IDs from a previously saved JSON file instead:

In [None]:
path_to_json = 'https://raw.githubusercontent.com/DanilSko/opera/main/data/aux/genre_wikidata_german.json'

saved_wikidata_german = pd.read_json(path_to_json,
                                        typ='series')
german_data['genre_from_wikidata'] = saved_wikidata_german

In [None]:
german_data[german_data['genre_from_wikidata'].fillna('None').str.contains('Q')][['title', 'normalized_genre']]

In [None]:
german_data['genre_from_wikidata'].unique()

In [None]:
german_data['genre_from_wikidata'].value_counts()

In [None]:
wikidata_slice = german_data[(german_data['genre_from_wikidata']
                              .fillna('')
                              .str.contains('Q')) &
            (german_data['libretto'] == False)]

In [None]:
wikidata_slice

### French wikidata enrichment

Updating Wikidata genre codes from the saved JSON to spare 6 minutes of Wikidata querying every time

In [None]:
path_to_json = 'https://raw.githubusercontent.com/DanilSko/opera/main/data/aux/genre_wikidata_french.json'

saved_wikidata_french = pd.read_json(path_to_json,
                                        typ='series')
#saved_wikidata_french.columns = ['genre_from_wikidata']
french_data['genre_from_wikidata'] = saved_wikidata_french


In [None]:
french_data['genre_from_wikidata'].unique()

In [None]:
french_data['genre_from_wikidata'].value_counts()

In [None]:
french_data[french_data['genre_from_wikidata'] == 'Q781470'][['name', 'title', 'subtitle', 'genre_from_wikidata']]

In [None]:
french_data['genre_from_wikidata'].unique()

In [None]:
french_data['normalized_genre']

In [None]:
#french_data['wikidata descr'] = french_data['genre_from_wikidata'].apply(get_label_and_description_from_wikidata)

In [None]:
wikidata_slice = french_data[(french_data['genre_from_wikidata']
                              .fillna('')
                              .str.contains('Q')) &
            (french_data['libretto'] == False)]

In [None]:
wikidata_slice

In [None]:
wikigenres = wikidata_slice['genre_from_wikidata'].unique()

In [None]:
wikigenres

### Filling the normalized genre column

#### 1. Uploading the manually created mapper table (from Wikidata genres to generalized DraCor normalised genres):

In [None]:
!wget https://raw.githubusercontent.com/DanilSko/opera/main/data/aux/wikidata_genres_mapping.csv

In [None]:
wd_genre_mapping = pd.read_csv('wikidata_genres_mapping.csv')

In [None]:
wd_genre_mapping.columns

In [None]:
norm_genres = wd_genre_mapping["Normalized genre"]

In [None]:
wd_genre_mapping["Wikidata ID"] = wd_genre_mapping["Wikidata ID"].str.strip()

In [None]:
wd_genre_ids = wd_genre_mapping["Wikidata ID"]

In [None]:
genre_mapper = dict(zip(wd_genre_ids, norm_genres))

In [None]:
genre_mapper

In [None]:
def get_normalized_genre_for_wd(wiki_genre_id):
    """Translate a Wikidata genre ID into its normalized genre label.

    The ID is looked up in the notebook-level ``genre_mapper`` dict.

    Returns the mapped value, or ``None`` when the ID is not a key of
    ``genre_mapper``.
    """
    return genre_mapper.get(wiki_genre_id)

#### 2. Mapping the French ones:

In [None]:
french_data['genre_from_wikidata'].unique()

In [None]:
french_data['normalized_genre_from_wd'] = french_data['genre_from_wikidata'].apply(get_normalized_genre_for_wd)

In [None]:
french_data['normalized_genre_from_wd'].unique()

In [None]:
french_data.shape

In [None]:
french_data['normalized_genre'].value_counts()

In [None]:
french_data['normalized_genre'] = french_data['normalized_genre'].fillna(french_data['normalized_genre_from_wd'])

In [None]:
french_data['normalized_genre'].value_counts()

#### 3. Mapping the German ones:


In [None]:
german_data['normalized_genre_from_wd'] = german_data['genre_from_wikidata'].apply(get_normalized_genre_for_wd)

In [None]:
german_data['normalized_genre_from_wd'].unique()

In [None]:
german_data[german_data['normalized_genre_from_wd'] == 'Libretto (attributed)'][['name', 'genre_from_wikidata']]

In [None]:
german_data[~(german_data['genre_from_wikidata'].isna()) &
            (german_data['genre_from_wikidata'] != 'No genre on wikidata') &
            (german_data['normalized_genre'].isna())][['first_author',
                                                       'title',
                                                       'normalized_genre',
                                                       'genre_from_wikidata',
                                                       'normalized_genre_from_wd']]

In [None]:
german_data['normalized_genre'].value_counts()

In [None]:
german_data['normalized_genre'] = german_data['normalized_genre'].fillna(german_data['normalized_genre_from_wd'])

In [None]:
german_data['normalized_genre'].value_counts()

In [None]:
german_data[german_data['normalized_genre'] == 'Libretto (attributed)']

### Adding libretti information marked by Luca

In [None]:
libretti_ids = corpus_libretti_curated['id']

In [None]:
libretti_ids

In [None]:
#metadata_df['is_ger'] = metadata_df['id'].str.contains('ger')

In [None]:
#metadata_df[metadata_df['is_ger']].shape[0]

In [None]:
german_data['is_real_libretto'] = german_data['id'].isin(libretti_ids)

In [None]:
french_data['is_real_libretto'] = french_data['id'].isin(libretti_ids)

In [None]:
# how many libretti are there in the german part?
german_data['is_real_libretto'].sum()

In [None]:
# how many libretti are there in the french part?
french_data['is_real_libretto'].sum()

Adding to 'is_real_libretto' the ones we got from wikidata

In [None]:
indices = german_data[german_data['normalized_genre'] == 'Libretto (attributed)'].index

In [None]:
german_data.loc[indices,:]

In [None]:
german_data.loc[indices,'is_real_libretto'] = True

In [None]:
german_data['is_real_libretto'].sum()

same with the french

In [None]:
indices = french_data[french_data['normalized_genre'] == 'Libretto (attributed)'].index

In [None]:
indices

In [None]:
french_data.loc[indices,:]

In [None]:
french_data.loc[indices,'is_real_libretto'] = True

In [None]:
# how many libretti are there in the french part?
french_data['is_real_libretto'].sum()

### making a single column with 'Libretto or Genre' in it

#### 1. German:

In [None]:
german_data['libretto_or_genre'] = german_data.apply(lambda x:
                                                     x['normalized_genre']
                                                     if x['is_real_libretto'] == False
                                                     else 'Libretto', axis=1)

In [None]:
german_data['libretto_or_genre'].value_counts()

In [None]:
german_data['libretto_or_genre'] = german_data['libretto_or_genre'].fillna('Other')

In [None]:
german_data['libretto_or_genre'].value_counts()

In [None]:
german_data['libretto_or_genre'].value_counts().plot.bar()

In [None]:
german_data['libretto_or_genre'] = german_data['libretto_or_genre'].replace('Libretto (attributed)',
                                                                                                          'Libretto')

#### 2. French:

In [None]:
french_data['libretto_or_genre'] = french_data.apply(lambda x:
                                                     x['normalized_genre']
                                                     if x['is_real_libretto'] == False
                                                     else 'Libretto', axis=1)

In [None]:
french_data['libretto_or_genre'].value_counts()

In [None]:
french_data['libretto_or_genre'] = french_data['libretto_or_genre'].fillna('Other')

In [None]:
french_data['libretto_or_genre'].value_counts()

In [None]:
french_data['libretto_or_genre'].value_counts().plot.bar()

### Libretti split: separating the libretti Luca marked himself from the 'authority' DraCor & Wikidata libretti

#### German

In [None]:
# Plays counted as libretti after enrichment ('is_real_libretto') but not
# flagged as libretti by DraCor itself ('libretto') are labelled as attributed;
# every other play keeps its 'libretto_or_genre' value.
# Equality (not identity) is used, as elsewhere in this notebook: numpy.bool_
# values are never the Python singletons True/False, so `is True` can
# silently evaluate to False for them.
german_data['genre_with_putative_libretto'] = german_data.apply(lambda x: 'Libretto (attributed)' if
                  x['is_real_libretto'] == True and x['libretto'] == False
                  else x['libretto_or_genre'], axis=1)

In [None]:
german_data['genre_with_putative_libretto'].value_counts()

In [None]:
german_data['genre_with_putative_libretto'] = german_data['genre_with_putative_libretto'].replace('Libretto','Libretto (DraCor)')

In [None]:
german_data['genre_with_putative_libretto'].value_counts()

In [None]:
x = german_data[german_data['genre_with_putative_libretto'] !='Other'].shape[0]
y = german_data.shape[0]

Share of plays with marked genre / libretto

In [None]:
x/y

### French

In [None]:
# Plays counted as libretti after enrichment ('is_real_libretto') but not
# flagged as libretti by DraCor itself ('libretto') are labelled as attributed;
# every other play keeps its 'libretto_or_genre' value.
# Equality (not identity) is used, as elsewhere in this notebook: numpy.bool_
# values are never the Python singletons True/False, so `is True` can
# silently evaluate to False for them.
french_data['genre_with_putative_libretto'] = french_data.apply(lambda x: 'Libretto (attributed)' if
                  x['is_real_libretto'] == True and x['libretto'] == False
                  else x['libretto_or_genre'], axis=1)

In [None]:
french_data['genre_with_putative_libretto'].value_counts()

In [None]:
french_data['genre_with_putative_libretto'] = french_data['genre_with_putative_libretto'].replace('Libretto', 'Libretto (DraCor)')

In [None]:
french_data['genre_with_putative_libretto'].value_counts()

In [None]:
french_data[french_data['genre_with_putative_libretto'] !='Other'].shape

Share of plays with marked genre / libretto

In [None]:
x = french_data[french_data['genre_with_putative_libretto'] !='Other'].shape[0]
y = french_data.shape[0]

In [None]:
x/y

#### Adding colors according to genres

In [None]:
genres = list(german_data['genre_with_putative_libretto'].unique())

In [None]:
len(genres)

In [None]:
genres

In [None]:
colors = ['white', 'green', 'blue', 'orange', 'yellow', 'red']

In [None]:
genre_color_mapping = zip(genres, colors)

In [None]:
genre_color_mapping = dict(genre_color_mapping)

In [None]:
genre_color_mapping

In [None]:
german_data['color'] = german_data['genre_with_putative_libretto'].apply(lambda x:
                                                              genre_color_mapping[x])

In [None]:
french_data['color'] = french_data['genre_with_putative_libretto'].apply(lambda x:
                                                              genre_color_mapping[x])

### Removing 'Other'

In [None]:
# Keep only plays whose 'libretto_or_genre' is known (anything but 'Other').
# The explicit .copy() makes pandas treat the results as independent frames,
# so the column assignments performed on them further down do not trigger
# SettingWithCopyWarning.
filtered_german_data = german_data[german_data['libretto_or_genre'] != 'Other'].copy()
filtered_french_data = french_data[french_data['libretto_or_genre'] != 'Other'].copy()

In [None]:
## removing 3 french plays with NaNs
to_remove = filtered_french_data[(filtered_french_data['average_degree'].isna())
& (filtered_french_data['density'].isna())].index

In [None]:
filtered_french_data = filtered_french_data.drop(to_remove)

### Shorter names for dfs

In [None]:
gd = filtered_german_data

In [None]:
fd = filtered_french_data

### Analytics: how much more after enrichment

In [None]:
gd['genre_with_putative_libretto'].value_counts()

In [None]:
newly_added_sum = gd['genre_with_putative_libretto'].value_counts().loc['Libretto (attributed)']

In [None]:
datatoplot = gd['libretto_or_genre'].value_counts()

ax = sns.barplot(x=datatoplot.index, y=datatoplot, palette=['darkblue', 'crimson', 'orange', 'green'])
ax.set(title='German Libretto or Genre after Enrichment')

for step, index in enumerate(datatoplot.index):
    string = ''
    if index == 'Libretto':
        string = f' (+{newly_added_sum})'
    ax.text(step, datatoplot.loc[index]+1, str(datatoplot.loc[index]) + string, color='black', ha='center')

In [None]:
fd['genre_with_putative_libretto'].value_counts()

In [None]:
newly_added_sum_fr = fd['genre_with_putative_libretto'].value_counts().loc['Libretto (attributed)']

In [None]:
datatoplot = fd['libretto_or_genre'].value_counts()

ax = sns.barplot(x=datatoplot.index, y=datatoplot, palette=['darkblue', 'crimson', 'orange', 'green'])
ax.set(title='French Libretto or Genre after Enrichment')

for step, index in enumerate(datatoplot.index):
    string = ''
    if index == 'Libretto':
        string = f' (+{newly_added_sum_fr})'
    ax.text(step, datatoplot.loc[index]+1, str(datatoplot.loc[index]) + string, color='black', ha='center')

### Distinguish comic and non-comic libretti

In [None]:
gd['subtitle'] = gd['subtitle'].fillna('-')

In [None]:
fd['subtitle'] = fd['subtitle'].fillna('-')

In [None]:
gd[gd['subtitle'].str.contains('Oper')][['first_author', 'title', 'subtitle']]

In [None]:
fd[fd['subtitle'].str.contains('Opéra')][['first_author', 'title', 'subtitle']]

In [None]:
list(gd[gd['is_real_libretto']==True]['subtitle'])

In [None]:
list(fd[fd['is_real_libretto']==True]['subtitle'])

In [None]:
# Subtitle keywords that mark a libretto as comic. Patterns are applied to the
# lower-cased subtitle and compiled once instead of on every call.
# The French pattern accepts both 'operette' and the accented 'opérette'.
_COMIC_REGEX_FR = re.compile('comique|op[ée]rette|comédie|comedie|vaudevill|divertissement')
_COMIC_REGEX_DE = re.compile('komisch|operette|komödie|comedie|parodie|posse')


def mark_comic_opera(some_subtitle, lang='de'):
    """Classify a libretto as comic or non-comic from keywords in its subtitle.

    Parameters
    ----------
    some_subtitle : str
        Subtitle of the play; matching is case-insensitive because the text
        is lower-cased first.
    lang : str, default 'de'
        'de' selects the German keyword pattern; any other value selects the
        French one.

    Returns
    -------
    str
        'Comic libretto' if any keyword occurs in the subtitle, otherwise
        'Non-comic libretto'.
    """
    regex = _COMIC_REGEX_DE if lang == 'de' else _COMIC_REGEX_FR
    if regex.search(some_subtitle.lower()) is not None:
        return 'Comic libretto'
    return 'Non-comic libretto'

In [None]:
gd['libretto_subgenre'] = gd['subtitle'].apply(mark_comic_opera)

In [None]:
# Chained indexing (gd[mask][col] = ...) assigns into a temporary object and
# leaves gd unchanged; .loc writes the label into gd itself.
gd.loc[gd['is_real_libretto'] == False, 'libretto_subgenre'] = 'Not libretto'

In [None]:
gd['libretto_subgenre']

In [None]:
gd['genre_with_libretto_subgenres'] = gd.apply(lambda x: x['libretto_or_genre']
                                               if x['is_real_libretto'] == False
                                               else x['libretto_subgenre'],
                                               axis=1)

In [None]:
gd['genre_with_libretto_subgenres'].value_counts()

In [None]:
gd = gd[gd['genre_with_libretto_subgenres'] != 'Libretto']

In [None]:
gd['genre_with_libretto_subgenres'].value_counts()

In [None]:
fd['libretto_subgenre'] = fd['subtitle'].apply(mark_comic_opera, lang='fr')

In [None]:
fd['genre_with_libretto_subgenres'] = fd.apply(lambda x: x['libretto_or_genre']
                                               if x['is_real_libretto'] == False
                                               else x['libretto_subgenre'],
                                               axis=1)

In [None]:
fd['genre_with_libretto_subgenres'].value_counts()

### Color 2 for libretti subgenre

In [None]:
fd.columns

In [None]:
genres_with_subgenres = list(fd['genre_with_libretto_subgenres'].unique())

In [None]:
genres_with_subgenres

In [None]:
colors_sg = ['red', 'blue', 'orange', 'aquamarine', 'green']

In [None]:
genre_color_mapping_sg = dict(zip(genres_with_subgenres, colors_sg))

In [None]:
genre_color_mapping_sg

In [None]:
fd['color_subgenres'] = fd['genre_with_libretto_subgenres'].apply(lambda x:
                                                              genre_color_mapping_sg[x])

In [None]:
gd['color_subgenres'] = gd['genre_with_libretto_subgenres'].apply(lambda x:
                                                              genre_color_mapping_sg[x])

### Checking timeframes

#### German

In [None]:
gd[gd['genre_with_putative_libretto'].str.contains('Libretto')]['year_normalized'].describe()

In [None]:
gd[(gd['genre_with_putative_libretto'].str.contains(
    'Libretto')) & (gd['year_normalized'] == 1770)]

In [None]:
gd[(gd['genre_with_putative_libretto'].str.contains(
    'Libretto')) & (gd['year_normalized'] == 1920)]

#### French

In [None]:
fd[fd['genre_with_putative_libretto'].str.contains('Libretto')]['year_normalized'].describe()

In [None]:
libretto_years = fd[fd['genre_with_putative_libretto'].str.contains('Libretto')]['year_normalized']

In [None]:
libretto_years.max() - libretto_years.min()

In [None]:
fd[(fd['genre_with_putative_libretto'].str.contains(
    'Libretto')) & (fd['year_normalized'] == 1626)]

In [None]:
fd[(fd['genre_with_putative_libretto'].str.contains(
    'Libretto')) & (fd['year_normalized'] == 1889)]

In [None]:
libretto_years.hist(bins=30)

In [None]:
libretto_years.sort_values().to_csv('year_to_see.txt', index=False)

## Creating time slices

#### 1770 - 1819

In [None]:
german_data_1770_1819 = gd[(gd['year_normalized'] >= 1770) &
                                      (gd['year_normalized'] <= 1819)]

In [None]:
german_data_1770_1819['libretto'].value_counts()

In [None]:
german_data_1770_1819['genre_with_putative_libretto'].str.contains('Libretto').sum()

In [None]:
german_data_1770_1819['genre_with_putative_libretto'].value_counts()

#### 1820 - 1869

In [None]:
german_data_1820_1869 = gd[(gd['year_normalized'] >= 1820) &
                                      (gd['year_normalized'] <= 1869)]

In [None]:
german_data_1820_1869['genre_with_putative_libretto'].str.contains('Libretto').sum()

#### 1870 - 1920

In [None]:
german_data_1870_1920 = gd[(gd['year_normalized'] >= 1870) &
                                      (gd['year_normalized'] <= 1920)]

In [None]:
german_data_1870_1920['genre_with_putative_libretto'].str.contains('Libretto').sum()

In [None]:
german_data_1870_1920.columns

In [None]:
german_data_1870_1920[german_data_1870_1920["genre_with_libretto_subgenres"] == 'Non-comic libretto'][['first_author','title', 'word_count_sp']].sort_values('word_count_sp')

In [None]:
def slice_and_get_proportion(data, startyear, endyear):
    """Cut ``data`` down to a period and print its libretto / non-libretto counts.

    Parameters
    ----------
    data : DataFrame with the columns 'year_normalized' and
        'genre_with_putative_libretto'.
    startyear, endyear : bounds of the period; both are inclusive.

    Returns
    -------
    The rows of ``data`` whose 'year_normalized' lies within the period.
    A row counts as a libretto when its 'genre_with_putative_libretto'
    contains the substring 'Libretto'.
    """
    years = data['year_normalized']
    not_too_early = years >= startyear
    not_too_late = years <= endyear
    period_data = data[not_too_early & not_too_late]

    libretto_count = period_data['genre_with_putative_libretto'].str.contains('Libretto').sum()
    other_count = period_data.shape[0] - libretto_count

    print(f'Period from {startyear} to {endyear}:')
    print(f'Libretti: {libretto_count}')
    print(f'Other: {other_count}')
    return period_data

In [None]:
french_data_1620_1669 =  slice_and_get_proportion(fd, 1620, 1669)

In [None]:
french_data_1670_1719 = slice_and_get_proportion(fd, 1670, 1719)

In [None]:
french_data_1720_1769 = slice_and_get_proportion(fd, 1720, 1769)

In [None]:
french_data_1770_1819 = slice_and_get_proportion(fd, 1770, 1819)

In [None]:
french_data_1820_1889 = slice_and_get_proportion(fd, 1820, 1889)

## At this point we have our corpus ready for exploration (all genres are mapped)

# II. Data exploration

## 4. Plotting

Folder for pics

In [None]:
# Create the folder for exported figures. exist_ok=True keeps the cell safe to
# re-run and avoids the gap between checking for the folder and creating it.
os.makedirs("output-images", exist_ok=True)

## Trying multidimensional methods (PCA, t-SNE, LDA, UMAP)

#### Preprocessing: data standardization with standard scaler

In [None]:
def standardize(df, feature_list):
    """Scale the selected feature columns of ``df`` with a fresh StandardScaler.

    Parameters
    ----------
    df : DataFrame holding the feature columns.
    feature_list : column labels to extract and scale.

    Returns
    -------
    The array produced by ``StandardScaler().fit_transform`` on the values of
    the selected columns; ``df`` itself is not modified.
    """
    feature_matrix = df.loc[:, feature_list].values
    scaler = StandardScaler()
    return scaler.fit_transform(feature_matrix)

### Common visualization functions

In [None]:
def plot_2d_no_legend(df, component1, component2, output_filename, title):
    """Scatter-plot two components without a legend, show the figure and save it.

    Parameters
    ----------
    df : DataFrame supplying 'color_subgenres' (marker colors) and 'subtitle'
        (text attached to each point).
    component1, component2 : x and y coordinates of the points.
    output_filename : file stem; the image goes to
        ``output-images/<output_filename>.png``.
    title : figure title.
    """
    marker_style = dict(
        size=20,
        color=df['color_subgenres'],  # per-point color taken from the data
        colorscale='Rainbow',
        showscale=False,
        line_width=1,
    )
    points = go.Scatter(
        x=component1,
        y=component2,
        mode='markers',
        marker=marker_style,
        text=df['subtitle'],
    )

    fig = go.Figure(data=points)
    layout_options = dict(
        margin=dict(l=100, r=100, b=100, t=100),
        width=2000,
        height=1200,
        font=dict(size=18),
        title=title,
    )
    fig.update_layout(**layout_options)
    fig.layout.template = 'plotly'

    fig.show()
    image_path = "output-images/" + output_filename + ".png"
    fig.write_image(image_path, scale=2)

In [None]:
def plot_2d(df, component1, component2, output_filename, title):
    """Scatter-plot two components colored by genre/subgenre, show and save the figure.

    Parameters
    ----------
    df : DataFrame with the columns 'color_subgenres',
        'genre_with_libretto_subgenres' and 'subtitle'. The columns 'comp 1'
        and 'comp 2' are written onto ``df`` itself.
    component1, component2 : x and y coordinates, one value per row of ``df``.
    output_filename : file stem; the image goes to
        ``output-images/<output_filename>.png``.
    title : figure title.
    """
    # The coordinates are stored on the caller's frame so px.scatter can
    # address them by column name.
    df['comp 1'] = component1
    df['comp 2'] = component2

    # Distinct colors in order of first appearance in df.
    palette = list(df['color_subgenres'].unique())
    scatter = px.scatter(
        df,
        x='comp 1',
        y='comp 2',
        color_discrete_sequence=palette,
        color='genre_with_libretto_subgenres',
        hover_data=['subtitle'],
    )

    fig = go.Figure(data=scatter)
    marker_style = dict(size=20, line=dict(width=1))
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

    layout_options = dict(
        margin=dict(l=100, r=100, b=100, t=100),
        width=2000,
        height=1200,
        font=dict(size=18),
        title=title,
        legend=dict(title="Genre"),
    )
    fig.update_layout(**layout_options)
    fig.layout.template = 'plotly'

    fig.show()
    image_path = "output-images/" + output_filename + ".png"
    fig.write_image(image_path, scale=2)

In [None]:
def plot_2d_no_subgenres(df, component1, component2, output_filename, title):
    """Scatter-plot two components colored by 'libretto_or_genre', show and save the figure.

    Parameters
    ----------
    df : DataFrame with the columns 'libretto_or_genre' and 'subtitle'. The
        columns 'comp 1' and 'comp 2' are written onto ``df`` itself.
    component1, component2 : x and y coordinates, one value per row of ``df``.
    output_filename : file stem; the image goes to
        ``output-images/<output_filename>.png``.
    title : figure title.
    """
    # The coordinates are stored on the caller's frame so px.scatter can
    # address them by column name.
    df['comp 1'] = component1
    df['comp 2'] = component2

    fixed_palette = ['red', 'blue', 'orange', 'green']
    scatter = px.scatter(
        df,
        x='comp 1',
        y='comp 2',
        color_discrete_sequence=fixed_palette,
        color='libretto_or_genre',
        hover_data=['subtitle'],
    )

    fig = go.Figure(data=scatter)
    marker_style = dict(size=20, line=dict(width=1))
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

    layout_options = dict(
        margin=dict(l=100, r=100, b=100, t=100),
        width=2000,
        height=1200,
        font=dict(size=18),
        title=title,
        legend=dict(title="Genre"),
    )
    fig.update_layout(**layout_options)
    fig.layout.template = 'plotly'

    fig.show()
    image_path = "output-images/" + output_filename + ".png"
    fig.write_image(image_path, scale=2)

#### t-SNE

##### German

In [None]:
from sklearn.manifold import TSNE

#### German 1770-1819


In [None]:
def make_tsne(df, feature_list, perp, output_filename, title):
    """Run t-SNE on the standardized features of ``df`` and plot the result.

    Parameters
    ----------
    df : DataFrame with the feature columns and the columns plot_2d needs.
    feature_list : feature columns fed to t-SNE after standardization.
    perp : t-SNE perplexity.
    output_filename : file stem handed to plot_2d.
    title : figure title handed to plot_2d.

    The embedding is computed with three components (fixed random_state 42);
    only the first two are plotted. The fitting time is printed.
    """
    scaled_features = standardize(df, feature_list)
    started_at = time.time()
    # NOTE(review): recent scikit-learn releases rename n_iter to max_iter -
    # confirm against the installed version.
    reducer = TSNE(random_state=42,
                   n_components=3,
                   verbose=0,
                   perplexity=perp,
                   n_iter=400)
    embedding = reducer.fit_transform(scaled_features)
    elapsed = time.time() - started_at
    print('Duration: {} seconds'.format(elapsed))
    plot_2d(df, embedding[:, 0], embedding[:, 1], output_filename, title)


In [None]:
make_tsne(german_data_1770_1819, list_features_pyd, 10, "german_data_1770_1819", "german_data_1770_1819")

In [None]:
make_tsne(german_data_1770_1819, list_features_pyd, 50, "german_data_1770_1819-50", "german_data_1770_1819")

In [None]:
make_tsne(german_data_1820_1869, list_features_pyd, 10, "german_data_1820_1869", "german_data_1820_1869")

In [None]:
make_tsne(german_data_1820_1869, list_features_pyd, 50, "german_data_1820_1869-50", "german_data_1820_1869")

In [None]:
# The output stem must name this slice; with "german_data_1820_1869" the
# perplexity-10 image of the 1820-1869 slice would be overwritten.
make_tsne(german_data_1870_1920, list_features_pyd, 10, "german_data_1870_1920", "german_data_1870_1920")

In [None]:
make_tsne(german_data_1870_1920, list_features_pyd, 50, "german_data_1870_1920-50","german_data_1870_1920")

#### French

In [None]:
make_tsne(french_data_1620_1669, list_features_pyd, 10, "french_data_1620_1669", "french_data_1620_1669")

In [None]:
make_tsne(french_data_1620_1669, list_features_pyd, 50, "french_data_1620_1669-50", "french_data_1620_1669")

In [None]:
make_tsne(french_data_1670_1719, list_features_pyd, 10, "french_data_1670_1719", "french_data_1670_1719")

In [None]:
make_tsne(french_data_1670_1719, list_features_pyd, 50, "french_data_1670_1719-50", "french_data_1670_1719")

In [None]:
make_tsne(french_data_1720_1769, list_features_pyd, 10, "french_data_1720_1769", "french_data_1720_1769")

In [None]:
make_tsne(french_data_1720_1769, list_features_pyd, 50, "french_data_1720_1769-50", "french_data_1720_1769")

In [None]:
make_tsne(french_data_1770_1819, list_features_pyd, 10, "french_data_1770_1819", "french_data_1770_1819")

In [None]:
make_tsne(french_data_1770_1819, list_features_pyd, 50, "french_data_1770_1819-50", "french_data_1770_1819")

In [None]:
make_tsne(french_data_1820_1889, list_features_pyd, 10, "french_data_1820_1889", "french_data_1820_1889")

In [None]:
make_tsne(french_data_1820_1889, list_features_pyd, 50, "french_data_1820_1889-50", "french_data_1820_1889")

In [None]:
# download all pics as zip
#from google.colab import files
#!zip -r /content/output_images.zip /content/output-images
#files.download('/content/output_images.zip')

### UMAP

In [None]:
!pip install umap-learn

In [None]:
import umap

In [None]:
def make_umap(df, feature_list, title, filename):
    """Run UMAP on the standardized features of ``df`` and plot the result.

    Parameters
    ----------
    df : DataFrame with the feature columns and the columns plot_2d needs.
    feature_list : feature columns fed to UMAP after standardization.
    title : figure title.
    filename : file stem of the saved image.

    The embedding is computed with three components (fixed random_state 42);
    only the first two are plotted. The fitting time is printed.
    """
    standardized_data = standardize(df, feature_list)
    start = time.time()
    reducer = umap.UMAP(random_state=42, n_components=3)
    embedding = reducer.fit_transform(standardized_data)
    print('Duration: {} seconds'.format(time.time() - start))
    # plot_2d takes (df, component1, component2, output_filename, title):
    # the file stem comes before the title.
    plot_2d(df, embedding[:, 0], embedding[:, 1], filename, title)


### German

#### German timestamps

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(german_data_1770_1819, list_features_pyd, 'umap_german_data_1770_1819', 'umap_german_data_1770_1819')

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(german_data_1820_1869, list_features_pyd, 'umap_german_data_1820_1869', 'umap_german_data_1820_1869')

In [None]:
# 'umap_' prefix keeps the UMAP image apart from the t-SNE images, which use
# the bare slice names as file stems.
make_umap(german_data_1870_1920, list_features_pyd, 'umap_german_data_1870_1920', 'umap_german_data_1870_1920')

### French

In [None]:
make_umap(fd, list_features_pyd, 'french umap', 'french umap')

#### French timestamps

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(french_data_1620_1669, list_features_pyd, 'umap_french_data_1620_1669', 'umap_french_data_1620_1669')

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(french_data_1670_1719, list_features_pyd, 'umap_french_data_1670_1719', 'umap_french_data_1670_1719')

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(french_data_1720_1769, list_features_pyd, 'umap_french_data_1720_1769', 'umap_french_data_1720_1769')

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(french_data_1770_1819, list_features_pyd, 'umap_french_data_1770_1819', 'umap_french_data_1770_1819')

In [None]:
# 'umap_' prefix: the bare slice name is already the stem of the t-SNE image,
# which would otherwise be overwritten in output-images.
make_umap(french_data_1820_1889, list_features_pyd, 'umap_french_data_1820_1889', 'umap_french_data_1820_1889')

### PCA

### German

In [None]:
def make_pca(df, feature_list, output_filename, title):
    """Run a 3-component PCA on the standardized features of ``df`` and plot it.

    Parameters
    ----------
    df : DataFrame with the feature columns and the columns plot_2d needs.
    feature_list : feature columns fed to PCA after standardization.
    output_filename : file stem handed to plot_2d.
    title : figure title handed to plot_2d.

    Only the first two principal components are plotted. The fitting time is
    printed.
    """
    standardized_data = standardize(df, feature_list)
    start = time.time()
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(standardized_data)
    print('Duration: {} seconds'.format(time.time() - start))
    plot_2d(df, principalComponents[:, 0],
            principalComponents[:, 1],
            output_filename,
            title)

In [None]:
make_pca(gd, list_features_pyd, 'German PCA', 'German PCA')

#### German timestamps

In [None]:
make_pca(german_data_1770_1819, list_features_pyd, 'pcagerman_data_1770_1819', 'pcagerman_data_1770_1819')

In [None]:
make_pca(german_data_1770_1819, list_features_pyd, '1770_1819allfeatures', '1770_1819allfeatures')

In [None]:
make_pca(german_data_1820_1869, list_features_pyd, 'pcagerman_data_1820_1869', 'pcagerman_data_1820_1869')

In [None]:
make_pca(german_data_1870_1920, list_features_pyd, 'pcagerman_data_1870_1920', 'pcagerman_data_1870_1920')

### French

In [None]:
# The French overview must run on the French frame (fd), not the German one.
make_pca(fd, list_features_pyd, 'French PCA', 'French PCA')

#### French timestamps

In [None]:
make_pca(french_data_1620_1669, list_features_pyd, 'pcafrench_data_1620_1669', 'pcafrench_data_1620_1669')

In [None]:
make_pca(french_data_1670_1719, list_features_pyd, 'pcafrench_data_1670_1719', 'pcafrench_data_1670_1719')

In [None]:
make_pca(french_data_1720_1769, list_features_pyd, 'pcafrench_data_1720_1769', 'pcafrench_data_1720_1769')

In [None]:
make_pca(french_data_1770_1819, list_features_pyd, 'pcafrench_data_1770_1819', 'pcafrench_data_1770_1819')

In [None]:
make_pca(french_data_1820_1889, list_features_pyd, 'pcafrench_data_1820_1889', 'pcafrench_data_1820_1889')

### PCA for German and French stuff together!

In [None]:
df_both = pd.concat([gd, fd])

In [None]:
def attach_lang(row):
    """Return a language-prefixed genre label for a single row.

    The language code is the first three characters of ``row['id']``. It is
    joined to ``row['genre_with_libretto_subgenres']`` with a single space,
    the same format as the ``lang_genre_with_libretto_subgenres`` column
    built elsewhere in this notebook.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'id'`` and ``'genre_with_libretto_subgenres'``.

    Returns:
        The combined label, e.g. ``'ger Comedy'``.
    """
    lang = row['id'][:3]
    genre = row['genre_with_libretto_subgenres']
    return lang + ' ' + genre

In [None]:
df_both['lang'] = df_both['id'].apply(lambda x: x[:3])

In [None]:
df_both['lang_genre_with_libretto_subgenres'] = df_both['lang']+' '+df_both['genre_with_libretto_subgenres']

In [None]:
df_both['lang_genre_with_libretto_subgenres']

In [None]:
df_both['bicolor'] = df_both['lang'].apply(lambda x: 'blue' if x == 'fre' else 'orange')

In [None]:
def make_pca_bilingual(df, feature_list, output_filename, title):
    """Run a 3-component PCA and plot the first two, coloured by language.

    The columns named in ``feature_list`` are passed through ``standardize``
    and reduced with PCA; components 1 and 2 are handed to
    ``plot_2d_bilingual``.

    Args:
        df: DataFrame to analyse; forwarded to ``plot_2d_bilingual``.
        feature_list: Names of the columns used as PCA input.
        output_filename: Forwarded to ``plot_2d_bilingual``.
        title: Forwarded to ``plot_2d_bilingual``.
    """
    standardized_data = standardize(df, feature_list)
    start = time.perf_counter()
    # Three components are fitted, but only the first two are plotted.
    principal_components = PCA(n_components=3).fit_transform(standardized_data)
    print('Duration: {} seconds'.format(time.perf_counter() - start))
    plot_2d_bilingual(df,
                      principal_components[:, 0],
                      principal_components[:, 1],
                      output_filename,
                      title)

In [None]:
def plot_2d_bilingual(df, component1, component2, output_filename, title):
    """Scatter-plot two embedding components, coloured by language.

    Stores ``component1``/``component2`` in ``df`` as the columns
    ``'comp 1'`` and ``'comp 2'`` (the caller's frame is modified in place),
    shows the figure and saves it as ``output-images/<output_filename>.png``.

    Args:
        df: DataFrame with the columns ``'lang'``, ``'bicolor'`` and ``'title'``.
        component1: Values for the x axis, one per row of ``df``.
        component2: Values for the y axis, one per row of ``df``.
        output_filename: Name (without extension) of the PNG to write.
        title: Figure title.
    """
    df['comp 1'] = component1
    df['comp 2'] = component2
    pxscatter = px.scatter(df, x='comp 1', y='comp 2',
                           # One colour per language; 'bicolor' is derived from 'lang',
                           # so its unique values are in the same order of appearance.
                           color_discrete_sequence=list(df['bicolor'].unique()),
                           color='lang',
                           hover_data=['title'])

    fig = go.Figure(data=pxscatter)
    fig.update_traces(marker=dict(size=12, line=dict(width=1)),
                      selector=dict(mode='markers'))
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100),
                      width=2000, height=1200,
                      font=dict(size=18),
                      title=title,
                      # Points are coloured by language here, not by genre.
                      legend=dict(title="Language"))
    fig.layout.template = 'plotly'

    fig.show()
    # Make sure the target directory exists before exporting the image.
    os.makedirs("output-images", exist_ok=True)
    fig.write_image("output-images/" + output_filename + ".png", scale=2)

In [None]:
make_pca_bilingual(df_both, list_features_pyd, 'PCA both corpora', 'PCA both corpora')

In [None]:
df_both['color_subgenres'].unique()

In [None]:
german_recolorer = {'blue':'#6495ED',
                    'red':'#8B0000',
                    'aquamarine':'#00CED1',
                    'green':'#8FBC8F',
                    'orange':'#F0E68C'}

In [None]:
df_both['lang_color_subgenres'] = df_both.apply(lambda x: x['color_subgenres']
                                                if x['lang'] == 'fre'
                                                else german_recolorer[x['color_subgenres']],
                                                axis=1)

In [None]:
def make_pca_bilingual_genre_aware(df, feature_list, output_filename, title, centroids=False):
    """Run a 3-component PCA and plot the first two, coloured by language and genre.

    The columns named in ``feature_list`` are passed through ``standardize``
    and reduced with PCA. Components 1 and 2 go to
    ``plot_2d_bilingual_genre_aware_centroids`` when ``centroids`` is true,
    otherwise to ``plot_2d_bilingual_genre_aware``.

    Args:
        df: DataFrame to analyse; forwarded to the plotting function.
        feature_list: Names of the columns used as PCA input.
        output_filename: Forwarded to the plotting function.
        title: Forwarded to the plotting function.
        centroids: Selects the plotting variant that also draws centroids.
    """
    standardized_data = standardize(df, feature_list)
    start = time.perf_counter()
    # Three components are fitted, but only the first two are plotted.
    principal_components = PCA(n_components=3).fit_transform(standardized_data)
    print('Duration: {} seconds'.format(time.perf_counter() - start))

    if centroids:
        plotter = plot_2d_bilingual_genre_aware_centroids
    else:
        plotter = plot_2d_bilingual_genre_aware
    plotter(df,
            principal_components[:, 0],
            principal_components[:, 1],
            output_filename,
            title)




In [None]:
def plot_2d_bilingual_genre_aware(df, component1, component2, output_filename, title):
    """Scatter-plot two embedding components, coloured by language-specific genre.

    Stores ``component1``/``component2`` in ``df`` as the columns
    ``'comp 1'`` and ``'comp 2'`` (the caller's frame is modified in place),
    shows the figure and saves it as ``output-images/<output_filename>.png``.

    Args:
        df: DataFrame with the columns ``'lang_genre_with_libretto_subgenres'``,
            ``'lang_color_subgenres'`` and ``'title'``.
        component1: Values for the x axis, one per row of ``df``.
        component2: Values for the y axis, one per row of ``df``.
        output_filename: Name (without extension) of the PNG to write.
        title: Figure title.
    """
    category_col = 'lang_genre_with_libretto_subgenres'

    df['comp 1'] = component1
    df['comp 2'] = component2

    # Pair every category with the colour of its first row explicitly, so the
    # pairing does not depend on categories and colours appearing in the same order.
    color_map = (df.dropna(subset=[category_col])
                   .drop_duplicates(category_col)
                   .set_index(category_col)['lang_color_subgenres']
                   .to_dict())

    pxscatter = px.scatter(df, x='comp 1', y='comp 2',
                           color=category_col,
                           color_discrete_map=color_map,
                           hover_data=['title'])

    fig = go.Figure(data=pxscatter)
    fig.update_traces(marker=dict(size=12, line=dict(width=1)),
                      selector=dict(mode='markers'))
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100),
                      width=2000, height=1200,
                      font=dict(size=18),
                      title=title,
                      legend=dict(title="Genre"))
    fig.layout.template = 'plotly'

    fig.show()
    # Make sure the target directory exists before exporting the image.
    os.makedirs("output-images", exist_ok=True)
    fig.write_image("output-images/" + output_filename + ".png", scale=2)

In [None]:
make_pca_bilingual_genre_aware(df_both, list_features_pyd, 'PCA both corpora', 'PCA both corpora')

In [None]:
# Take an independent copy: columns are assigned to this frame later on, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
df_libretti_only = df_both[df_both['is_real_libretto'] == True].copy()

In [None]:
df_libretti_only['lang'].value_counts()

In [None]:
onlylibr_recolorer = {'aquamarine':'aquamarine',
                    '#F0E68C':'#FF8C00',
                    '#00CED1':'#FFD700',
                    'orange':'#00CED1'}

In [None]:
# Swap the colours for the libretti-only plots via onlylibr_recolorer.
# NOTE(review): this cell overwrites its own input column and is not idempotent.
# '#00CED1' is both a key and a value of the mapping, and '#FF8C00'/'#FFD700'
# are not keys, so a second run remaps or raises KeyError. Re-create
# df_libretti_only before re-running this cell.
df_libretti_only['lang_color_subgenres'] = df_libretti_only['lang_color_subgenres'].apply(lambda x: onlylibr_recolorer[x])

In [None]:
make_pca_bilingual_genre_aware(df_libretti_only, list_features_pyd, 'PCA both corpora libretti only', 'PCA both corpora libretti only')

In [None]:
def make_tsne_bilingual_genre_aware(df, feature_list, perp, output_filename, title):
    """Embed the selected features with t-SNE and plot two of the dimensions.

    The columns named in ``feature_list`` are passed through ``standardize``
    and embedded with ``TSNE`` (3 components, fixed random state, 400
    iterations). The first two embedding dimensions are handed to
    ``plot_2d_bilingual_genre_aware``.

    Args:
        df: DataFrame to analyse; forwarded to the plotting function.
        feature_list: Names of the columns used as input.
        perp: Perplexity passed to ``TSNE``.
        output_filename: Forwarded to the plotting function.
        title: Forwarded to the plotting function.
    """
    scaled = standardize(df, feature_list)
    t0 = time.time()
    reducer = TSNE(random_state = 42,
                   n_components=3,
                   verbose=0,
                   perplexity=perp,
                   n_iter=400)
    embedding = reducer.fit_transform(scaled)
    print('Duration: {} seconds'.format(time.time() - t0))
    # NOTE(review): three dimensions are computed, only the first two are drawn.
    plot_2d_bilingual_genre_aware(df,
                                  embedding[:, 0],
                                  embedding[:, 1],
                                  output_filename,
                                  title)

In [None]:
make_tsne_bilingual_genre_aware(df_libretti_only, list_features_pyd, 25, 't-SNE both corpora libretti only', 't-SNE both corpora libretti only')

In [None]:
def plot_2d_bilingual_genre_aware_centroids(df, component1, component2, output_filename, title):
    """Scatter-plot two embedding components with one centroid per category.

    Stores ``component1``/``component2`` in ``df`` as the columns
    ``'comp 1'`` and ``'comp 2'`` (the caller's frame is modified in place),
    colours the points by ``'lang_genre_with_libretto_subgenres'``, overlays
    an "x" marker at the mean position of every category, shows the figure
    and saves it as ``output-images/<output_filename>.png``.

    Args:
        df: DataFrame with the columns ``'lang_genre_with_libretto_subgenres'``,
            ``'lang_color_subgenres'`` and ``'title'``.
        component1: Values for the x axis, one per row of ``df``.
        component2: Values for the y axis, one per row of ``df``.
        output_filename: Name (without extension) of the PNG to write.
        title: Figure title.
    """
    category_col = 'lang_genre_with_libretto_subgenres'

    df['comp 1'] = component1
    df['comp 2'] = component2

    # Pair every category with the colour of its first row explicitly, so the
    # pairing does not depend on categories and colours appearing in the same order.
    color_map = (df.dropna(subset=[category_col])
                   .drop_duplicates(category_col)
                   .set_index(category_col)['lang_color_subgenres']
                   .to_dict())

    pxscatter = px.scatter(df, x='comp 1', y='comp 2',
                           color=category_col,
                           color_discrete_map=color_map,
                           hover_data=['title'])

    fig = go.Figure(data=pxscatter)
    fig.update_traces(marker=dict(size=20, line=dict(width=1)),
                      selector=dict(mode='markers'))
    fig.update_layout(margin=dict(l=100, r=100, b=100, t=100),
                      width=2000, height=1200,
                      font=dict(size=18),
                      title=title,
                      legend=dict(title="Genre"))
    fig.layout.template = 'plotly'

    # Average only the two component columns: taking the mean of every column
    # raises on non-numeric columns in recent pandas versions.
    by_category = df.groupby(category_col)
    centroids = by_category[['comp 1', 'comp 2']].mean()
    # Both results are sorted by category, so colours line up with centroids.
    centroid_colors = by_category['lang_color_subgenres'].first()

    fig.add_trace(
        go.Scatter(
            mode='markers',
            x=centroids['comp 1'],
            y=centroids['comp 2'],
            text=centroids.index,
            marker=dict(
                color=centroid_colors,
                size=30,
                symbol="x",
                line=dict(color='black', width=2)
            ),
            showlegend=False
        )
    )

    fig.show()
    # Make sure the target directory exists before exporting the image.
    os.makedirs("output-images", exist_ok=True)
    fig.write_image("output-images/" + output_filename + ".png", scale=2)

In [None]:
make_pca_bilingual_genre_aware(df_libretti_only, list_features_pyd,
                                'PCA both corpora libretti only',
                                'PCA both corpora libretti only',
                               centroids=True)

In [None]:
list(df_libretti_only.groupby('lang_genre_with_libretto_subgenres').first()['lang_color_subgenres'])

## Statistical Testing

### German

In [None]:
def test_significance(data, feature_name, binary_split_criteria):
    A = data[data[binary_split_criteria]==False][feature_name]
    B = data[data[binary_split_criteria]==True][feature_name]
    norm_counter = 0
    for sample in (A,B):
        w, pvalue = stats.shapiro(sample)
        if pvalue > 0.05:
            norm_counter+=1

    if norm_counter == 2:
        #print('Two normal distrs, using unpaired T-test')
        result = stats.ttest_ind(A, B)
    elif norm_counter == 1:
        #print('One distr is normal and one is not!')
        #print('Using Wilcoxon rank sum test')
        result = stats.mannwhitneyu(x=A, y=B, alternative = 'two-sided')
    else:
        #print('Both non-Normal, using Wilcoxon rank sum test')
        result = stats.mannwhitneyu(x=A, y=B, alternative = 'two-sided')

    #if result.pvalue < 0.05:
    #    print('Difference is significant! ✅ pvalue is', result.pvalue)

    #else:
    #    print('Not significant❌, pvalue is', result.pvalue)

    return result.pvalue

    #print(f"{feature_name} & {round(result.pvalue, 20)} & some number \\\\") #format(result.pvalue, '.8f')
    #print(format(result.pvalue, '.10f'))


In [None]:
for feature in list_features_pyd:
    print(feature, test_significance(gd, feature, 'is_real_libretto'))
    #print()

### French

In [None]:
for feature in list_features_pyd:
    print(feature, test_significance(fd, feature, 'is_real_libretto'))

### Automated LaTeX table creation for reporting the significance tests



In [None]:
def bold_tex(pval):
    """Format a p-value for the LaTeX results table.

    Values below 0.05 are returned as a string wrapped in a LaTeX bold
    (textbf) command; all other values are returned as a number rounded to
    two decimals. The return type therefore depends on the input.

    Args:
        pval: The p-value to format.

    Returns:
        A LaTeX string for significant values, otherwise the rounded number.
    """
    if not pval < 0.05:
        return round(pval, 2)
    return "\\textbf{" + str(round(pval, 20)) + "}"

In [None]:
def test_significance_both(data1, data2, features_list, binary_split_criteria):
    """Print one LaTeX table row per feature with the p-values of both corpora.

    For every feature, ``test_significance`` is run on ``data1`` and on
    ``data2``, both p-values are formatted with ``bold_tex``, and a row of
    the form ``feature & p1 & p2`` followed by a LaTeX line break is printed.
    Underscores in the feature name are backslash-escaped for LaTeX.

    Args:
        data1: First DataFrame (first p-value column).
        data2: Second DataFrame (second p-value column).
        features_list: Names of the features to test.
        binary_split_criteria: Boolean column used to split each frame.
    """
    for feature_name in features_list:
        sig_1 = bold_tex(test_significance(data1, feature_name, binary_split_criteria))
        sig_2 = bold_tex(test_significance(data2, feature_name, binary_split_criteria))
        # Written with an explicit double backslash: a lone backslash before
        # the underscore is an invalid escape sequence in a normal string.
        latex_fname = feature_name.replace('_', '\\_')
        print(f'{latex_fname} & {sig_1} & {sig_2}\\\\')

In [None]:
test_significance_both(gd,fd,list_features_pyd,'is_real_libretto')

## Scatterplots

Restrict each corpus to the period covered by its libretti (from the earliest to the latest libretto year) and exclude tragicomedies.

In [None]:
years = fd[fd['is_real_libretto'] == True]['year_normalized']
earliest_french_libr = years.min()
earliest_french_libr

In [None]:
latest_french_libr = years.max()
latest_french_libr

In [None]:
fd_sc = fd[(fd['libretto_or_genre'] != 'Tragicomedy') &
            (fd['year_normalized'] >= earliest_french_libr) &
            (fd['year_normalized'] <= latest_french_libr)]

In [None]:
fd_sc['year_normalized'].describe()

In [None]:
gyears = gd[gd['is_real_libretto'] == True]['year_normalized']
earliest_g_libr = gyears.min()
earliest_g_libr

In [None]:
latest_g_libr = gyears.max()
latest_g_libr

In [None]:
gd_sc = gd[(gd['libretto_or_genre'] != 'Tragicomedy') &
            (gd['year_normalized'] >= earliest_g_libr) &
            (gd['year_normalized'] <= latest_g_libr)]

In [None]:
gd_sc['year_normalized'].describe()

#### Basic scatterplot

In [None]:
plt.figure(figsize=(10,10))
sns.scatterplot(data=gd_sc,
                x='year_normalized',
                y='word_count_sp',
                hue='genre_with_libretto_subgenres')

### LOWESS smoothing (uses the lower-level statsmodels `lowess` instead of seaborn's built-in smoother: more code, but only the dots are made transparent, not the trend lines)

In [None]:
from statsmodels.nonparametric.smoothers_lowess import lowess

In [None]:
def make_lowess(df, column):
    """Return a LOWESS-smoothed curve of the yearly mean of ``column``.

    The values of ``column`` are averaged per ``year_normalized`` and the
    yearly means are smoothed with ``lowess``.

    Args:
        df: DataFrame with the columns ``'year_normalized'`` and ``column``.
        column: Name of the numeric column to smooth.

    Returns:
        A Series of smoothed values indexed by year.
    """
    # Average only the requested column: taking the mean of every column
    # raises on non-numeric columns in recent pandas versions.
    yearly_mean = df.groupby('year_normalized')[column].mean()
    endog = np.array(yearly_mean)
    exog = np.array(yearly_mean.index)

    smooth = lowess(endog, exog)
    # lowess returns (x, smoothed y) pairs; split them into index and values.
    index, data = np.transpose(smooth)

    return pd.Series(data, index=index)

In [None]:
def make_scatter_lowess_transp_dots(df, parameter):
    """Plot ``parameter`` over time as faint dots with one LOWESS line per genre.

    Rows whose ``parameter`` value lies three or more standard deviations
    from the mean are left out. The figure is saved as
    ``<parameter>_scatter_with_lowess.png``.

    Args:
        df: DataFrame with the columns ``'year_normalized'``,
            ``'genre_with_libretto_subgenres'``, ``'color_subgenres'`` and
            ``parameter``.
        parameter: Name of the numeric column shown on the y axis.
    """
    genre_col = 'genre_with_libretto_subgenres'

    # nan_policy='omit' stops a single missing value from turning every
    # z-score into NaN (which would filter out all rows). Rows whose own
    # value is missing still fail the comparison and are dropped.
    df_no_out = df[(np.abs(stats.zscore(df[parameter], nan_policy='omit')) < 3)]

    # Genres and colours are paired by order of first appearance.
    scatter_colors = list(df_no_out['color_subgenres'].unique())

    # Draw on a dedicated figure rather than on whatever figure is current.
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.scatterplot(data=df_no_out,
                    x='year_normalized',
                    y=parameter,
                    hue=genre_col,
                    palette=scatter_colors,
                    alpha=0.2,
                    ax=ax)

    for index, genre in enumerate(df_no_out[genre_col].unique()):
        lwss = make_lowess(df_no_out[df_no_out[genre_col] == genre], parameter)
        sns.lineplot(x=lwss.index, y=lwss, color=scatter_colors[index], linewidth=2, ax=ax)

    fig.savefig(f'{parameter}_scatter_with_lowess.png')

#### german transparent scatterplots

In [None]:
make_scatter_lowess_transp_dots(gd_sc, 'word_count_stage')

In [None]:
make_scatter_lowess_transp_dots(gd_sc, 'num_of_person_groups')

#### french

In [None]:
make_scatter_lowess_transp_dots(fd_sc, 'num_of_speakers')

In [None]:
make_scatter_lowess_transp_dots(fd_sc, 'density')

In [None]:
make_scatter_lowess_transp_dots(fd_sc, 'word_count_stage')

In [None]:
make_scatter_lowess_transp_dots(fd_sc, 'word_count_sp')

## Feature correlations

### Correlation matrices separate for languages

In [None]:
feature_correlations = german_data[list_features_pyd].corr()

In [None]:
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(feature_correlations, annot=True, cmap = 'coolwarm')
plt.savefig('correlation_matrix.png', dpi=300)

French:

In [None]:
feature_correlations = french_data[list_features_pyd].corr()

In [None]:
feature_correlations

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(feature_correlations, annot=True, cmap = 'coolwarm')
plt.savefig('correlation_matrix_fre.png', dpi=300)

Redraw the French heatmap without the empty `num_of_person_groups` feature:

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(feature_correlations.drop('num_of_person_groups').drop('num_of_person_groups', axis=1), annot=True, cmap = 'coolwarm')
plt.savefig('correlation_matrix_fre.png', dpi=300)

### Listing highly correlated feature pairs (correlation above 0.75 or below -0.75)

### German

In [None]:
def get_higlhy_correcated_feature_pairs(series):
    """Print every feature flagged as highly correlated with ``series.name``.

    Intended for ``DataFrame.apply`` over a flag matrix: ``series`` is one
    column, its index holds the feature names, and truthy entries mark
    high correlation. The feature's entry for itself is skipped.

    Args:
        series: One column of the flag matrix; ``series.name`` is the
            feature being described.
    """
    name = series.name
    flagged = series[series == True].index
    # Skip the self-correlation entry; nothing to do if it is not flagged.
    partners = [feature for feature in flagged if feature != name]
    for partner in partners:
        print(f'**{name}** is highly correlated with **{partner}**')


In [None]:
feature_correlations = german_data[list_features_pyd].corr()
colnames  = feature_correlations.columns
hiposcorr = feature_correlations > 0.75
hinegcorr = feature_correlations < -0.75
hicorr = pd.DataFrame(np.nansum([x.values for x in [hiposcorr, hinegcorr]], axis=0))
hicorr.columns = colnames
hicorr.index = colnames
hicorr.apply(get_higlhy_correcated_feature_pairs)

#### Plotting a network of highly correlated features

In [None]:
def plot_corr_graph(hicorrdf, seed=42):
    """Draw the network of highly correlated features.

    Args:
        hicorrdf: Square DataFrame used as adjacency matrix; index and
            columns are the feature names, non-zero entries become edges.
        seed: Seed for the spring layout, so the node positions are the
            same on every run. Pass ``None`` for a random layout.
    """
    graph = nx.from_pandas_adjacency(hicorrdf)
    # Every feature is trivially correlated with itself; drop those loops.
    graph.remove_edges_from(nx.selfloop_edges(graph))
    plt.subplots(figsize=(14,12))
    nx.draw(graph,
            with_labels=True,
            pos=nx.spring_layout(graph, seed=seed))

In [None]:
plot_corr_graph(hicorr)

#### French

In [None]:
feature_correlations = french_data[list_features_pyd].corr()

In [None]:
colnames  = feature_correlations.columns
hiposcorr = feature_correlations > 0.75
hinegcorr = feature_correlations < -0.75
hicorr = pd.DataFrame(np.nansum([x.values for x in [hiposcorr, hinegcorr]], axis=0))
hicorr.columns = colnames
hicorr.index = colnames

In [None]:
hicorr.apply(get_higlhy_correcated_feature_pairs)

#### Plotting a network of highly correlated features

In [None]:
plot_corr_graph(hicorr)

### Drop correlated features

In [None]:
german_features_to_drop = ['average_path_length',
                           'diameter',
                           'max_degree',
                           'num_connected_components']

In [None]:
french_features_to_drop = ['num_of_segments',
                           'average_path_length',
                           'max_degree']

In [None]:
german_features = [feature for feature in list_features_pyd if feature not in german_features_to_drop]

In [None]:
french_features = [feature for feature in list_features_pyd if feature not in french_features_to_drop]

##  Classifier, checking accuracy and getting feature importance

We expect the classifier's feature importances to align well with the statistical tests above.

In [None]:
rstate = 42

In [None]:
from sklearn.model_selection import KFold, cross_val_score

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
from typing import Tuple

In [None]:
import copy as cp

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
def analyze_model_classwise(trained_model, features_val, target_val):
    """Report accuracy and per-class precision, recall and F1 of a classifier.

    Predicts ``features_val`` with ``trained_model``, prints the metrics,
    appends them to ``new.csv`` and displays the confusion matrix.

    Args:
        trained_model: Fitted classifier exposing ``predict`` and ``classes_``.
        features_val: Validation features.
        target_val: True labels for ``features_val``.
    """
    with open('new.csv', 'a', encoding='utf-8') as report:
        report.write(str(trained_model))
        report.write('\n')

        predictions = trained_model.predict(features_val)
        accuracy = accuracy_score(target_val, predictions)
        print(f'accuracy : {accuracy}')
        report.write(f'accuracy \t{"{:.3f}".format(accuracy)} \n')

        class_names = trained_model.classes_
        matrix = confusion_matrix(target_val, predictions, labels=class_names)

        display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=(class_names))
        display.plot(xticks_rotation = 90)

        # Rows are true classes, columns are predicted classes.
        recalls = matrix.diagonal()/matrix.sum(axis=1)
        precisions = matrix.diagonal()/matrix.sum(axis=0)
        for this_class, p, r in zip(class_names, precisions, recalls):
            f1 = (2*p*r)/(p+r)
            print(f'precision for class {this_class}: {p}')
            print(f'recall for class {this_class}: {r}')
            print(f'f1 for class {this_class}: {f1}')
            report.write(f'precision for class {this_class}\t {"{:.3f}".format(p)} \n')
            report.write(f'recall_for_class {this_class}\t{"{:.3f}".format(r)} \n')
            report.write(f'f1 for class {this_class}\t{"{:.3f}".format(f1)} \n')

    plt.show()


In [None]:
def cross_val_predict(model, kfold: "KFold", X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Collect out-of-fold predictions of ``model`` over all folds of ``kfold``.

    A deep copy of ``model`` is refitted on every training split, so the
    model passed in is left untouched. Results are concatenated in fold order.

    NOTE(review): this shares its name with
    ``sklearn.model_selection.cross_val_predict``; importing that function
    into the same namespace would shadow one of the two.

    Args:
        model: Estimator with ``fit`` and ``predict`` (optionally ``predict_proba``).
        kfold: Splitter whose ``split(X)`` yields (train, test) index arrays.
        X: Feature matrix, indexable with an integer index array.
        y: Labels, indexable with an integer index array.

    Returns:
        A tuple ``(actual_classes, predicted_classes, predicted_proba)``.
        ``predicted_proba`` has one column per distinct label in ``y``; rows
        of folds without usable probabilities are filled with zeros.
    """
    model_ = cp.deepcopy(model)

    no_classes = len(np.unique(y))

    actual_classes = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes])

    for train_ndx, test_ndx in kfold.split(X):

        train_X, train_y, test_X, test_y = X[train_ndx], y[train_ndx], X[test_ndx], y[test_ndx]

        actual_classes = np.append(actual_classes, test_y)

        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))

        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
        except Exception:
            # Best effort: the model may lack predict_proba, or a fold may
            # yield fewer probability columns than there are classes.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)

    return actual_classes, predicted_classes, predicted_proba

In [None]:
def plot_confusion_matrix(actual_classes : np.array, predicted_classes : np.array, sorted_labels : list, lang='German'):
    """Plot, save and summarise the confusion matrix of pooled predictions.

    Draws the matrix as a heatmap, saves it as
    ``confusion_matrix_<lang>_balanced_kfold_combined.png`` and prints the
    accuracy plus per-class precision, recall and F1.

    NOTE(review): the title always says "5-Fold" and the file name always
    says "balanced", whatever the caller actually ran.

    Args:
        actual_classes: True labels.
        predicted_classes: Predicted labels, aligned with ``actual_classes``.
        sorted_labels: Class labels in the order used for rows and columns.
        lang: Corpus name used in the title and the file name.
    """
    cm = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)

    plt.figure(figsize=(12.8,6))
    sns.heatmap(cm, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap="Blues", fmt="g")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Combined Confusion Matrix for all folds of the 5-Fold Cross-Validation on {lang} corpus')

    plt.savefig(f'confusion_matrix_{lang}_balanced_kfold_combined.png',
                dpi=300)
    plt.show()

    print(f'accuracy : {accuracy_score(actual_classes, predicted_classes)}')

    # Rows are true classes, columns are predicted classes.
    recalls = cm.diagonal()/cm.sum(axis=1)
    precisions = cm.diagonal()/cm.sum(axis=0)
    for this_class, p, r in zip(sorted_labels, precisions, recalls):
        f1 = (2*p*r)/(p+r)
        print(f'precision for class {this_class}: {p}')
        print(f'recall for class {this_class}: {r}')
        print(f'f1 for class {this_class}: {f1}')

In [None]:
def classify(classifier, df, feature_list, classification_parameter):
    """Cross-validate ``classifier`` on the standardized features of ``df``.

    Args:
        classifier: Estimator passed to ``cross_val_score``.
        df: DataFrame holding the features and the target column.
        feature_list: Names of the feature columns.
        classification_parameter: Name of the target column.

    Returns:
        The scores returned by ``cross_val_score`` (default settings).
    """
    return cross_val_score(classifier,
                           standardize(df, feature_list),
                           df[classification_parameter])

In [None]:
def assess_best_rf(df, features_list, classification_parameter):
    """Find the random-forest size with the best mean cross-validation score.

    Tries ``n_estimators`` from 100 to 950 in steps of 50, printing the mean
    score of each candidate and finally the best one.

    Args:
        df: DataFrame holding the features and the target column.
        features_list: Names of the feature columns.
        classification_parameter: Name of the target column.

    Returns:
        The (unfitted) ``RandomForestClassifier`` that scored best.

    Raises:
        ValueError: If no candidate produced a comparable score (e.g. all NaN).
    """
    best = float('-inf')
    best_n_est = None
    best_rf_model = None
    for n_estimators in range(100, 1000, 50):
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=rstate)
        cross_val_scores = classify(model, df, features_list, classification_parameter)
        meanscore = np.mean(cross_val_scores)
        print(meanscore)
        if meanscore > best:
            best = meanscore
            best_n_est = n_estimators
            best_rf_model = model

    # Fail with a clear message instead of an UnboundLocalError.
    if best_rf_model is None:
        raise ValueError('no candidate model produced a valid cross-validation score')

    print(f'Best result: {best} Produced by model with {best_n_est} estimators')
    return best_rf_model



In [None]:
def assess_best_brf(df, features_list, classification_parameter):
    """Find the balanced-random-forest size with the best mean cross-validation score.

    Tries ``n_estimators`` from 100 to 950 in steps of 50, printing the mean
    score of each candidate and finally the best one.

    Args:
        df: DataFrame holding the features and the target column.
        features_list: Names of the feature columns.
        classification_parameter: Name of the target column.

    Returns:
        The (unfitted) ``BalancedRandomForestClassifier`` that scored best.

    Raises:
        ValueError: If no candidate produced a comparable score (e.g. all NaN).
    """
    best = float('-inf')
    best_n_est = None
    best_rf_model = None
    for n_estimators in range(100, 1000, 50):
        model = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=rstate)
        cross_val_scores = classify(model, df, features_list, classification_parameter)
        meanscore = np.mean(cross_val_scores)
        print(meanscore)
        if meanscore > best:
            best = meanscore
            best_n_est = n_estimators
            best_rf_model = model

    # Fail with a clear message instead of an UnboundLocalError.
    if best_rf_model is None:
        raise ValueError('no candidate model produced a valid cross-validation score')

    print(f'Best result: {best} Produced by model with {best_n_est} estimators')
    return best_rf_model



In [None]:
def assess_best(df, features_list, classification_parameter, balanced=True):
    """Find the forest size with the best mean cross-validation score.

    Tries ``n_estimators`` from 100 to 750 in steps of 50, printing the mean
    score of each candidate and finally the best one.

    Args:
        df: DataFrame holding the features and the target column.
        features_list: Names of the feature columns.
        classification_parameter: Name of the target column.
        balanced: Use ``BalancedRandomForestClassifier`` when true,
            ``RandomForestClassifier`` otherwise.

    Returns:
        The (unfitted) classifier that scored best.

    Raises:
        ValueError: If no candidate produced a comparable score (e.g. all NaN).
    """
    forest_cls = BalancedRandomForestClassifier if balanced else RandomForestClassifier

    best = float('-inf')
    best_n_est = None
    best_rf_model = None
    for n_estimators in range(100, 800, 50):
        model = forest_cls(n_estimators=n_estimators, random_state=rstate)
        cross_val_scores = classify(model, df, features_list, classification_parameter)
        meanscore = np.mean(cross_val_scores)
        print(meanscore)
        if meanscore > best:
            best = meanscore
            best_n_est = n_estimators
            best_rf_model = model

    # Fail with a clear message instead of an UnboundLocalError.
    if best_rf_model is None:
        raise ValueError('no candidate model produced a valid cross-validation score')

    print(f'Best result: {best} Produced by model with {best_n_est} estimators')
    return best_rf_model

In [None]:
def get_feature_importances(df, good_model, target_col, features_list, corpusname):
    """Plot permutation feature importances of a fitted model on a held-out split.

    The features are standardized and split 70/30 with the notebook-wide
    random state; importances are computed on the 30% test part, drawn as a
    bar chart and saved as ``latest_features_imp.png`` (same file name on
    every call).

    Args:
        df: DataFrame holding the features and the target column.
        good_model: Already fitted estimator to inspect.
        target_col: Name of the target column.
        features_list: Names of the feature columns.
        corpusname: Corpus name shown in the plot title.
    """
    scaled_features = standardize(df, features_list)
    labels = df[target_col]
    # Only the held-out part of the split is needed here.
    _, held_out_features, _, held_out_labels = train_test_split(scaled_features,
                                                                labels,
                                                                test_size = 0.3,
                                                                random_state=rstate)
    importances = permutation_importance(good_model,
                                         held_out_features,
                                         held_out_labels,
                                         n_repeats=10,
                                         random_state=rstate,
                                         n_jobs=2)
    mean_importances = pd.Series(importances.importances_mean, index=features_list)

    fig, ax = plt.subplots()
    mean_importances.plot.bar(yerr=importances.importances_std, ax=ax)
    ax.set_title(f"Feature importances using permutation on full model on {corpusname} corpus")
    ax.set_ylabel("Mean accuracy decrease")
    ax.set_facecolor('white')
    fig.set_size_inches(10, 6)
    fig.tight_layout()
    plt.savefig('latest_features_imp.png', dpi=300)
    plt.show()


In [None]:
def fully_automated_model_assessment(df,
                                     features_list,
                                     classification_parameter,
                                     balanced=True,
                                     lang='German'):
    """Select a forest size, evaluate it by 5-fold CV and plot feature importances.

    Steps: pick the best model with ``assess_best``, plot the pooled
    confusion matrix of a shuffled 5-fold cross-validation, then fit the
    model on a 70% training split and plot its permutation importances.

    Args:
        df: DataFrame holding the features and the target column.
        features_list: Names of the feature columns.
        classification_parameter: Name of the target column.
        balanced: Forwarded to ``assess_best``.
        lang: Corpus name used in plot titles and file names.
    """
    model = assess_best(df, features_list, classification_parameter, balanced=balanced)

    # A fresh 0..n-1 index lets the positional fold indices address the labels.
    df = df.reset_index(drop=True)
    X = standardize(df, features_list)
    y = df[classification_parameter]

    # Pooled confusion matrix over all cross-validation folds.
    folds = KFold(n_splits=5, random_state=rstate, shuffle=True)
    class_labels = df[classification_parameter].unique()
    class_labels.sort()
    actual_classes, predicted_classes, _ = cross_val_predict(model, folds, X, y)
    plot_confusion_matrix(actual_classes, predicted_classes, class_labels, lang=lang)

    # Permutation importances: fit on the training part of a 70/30 split.
    split = train_test_split(X,
                             y,
                             test_size = 0.3,
                             random_state=rstate)
    features_train, target_train = split[0], split[2]

    model.fit(features_train, target_train)
    get_feature_importances(df,
                            model,
                            classification_parameter,
                            features_list,
                            lang)


In [None]:
import warnings

## Assessing the classifier and producing feature importances

In [None]:
gd_lt = gd[(gd['year_normalized']>1769) & (gd['libretto_or_genre'] != 'Tragicomedy')]

In [None]:
fd_lt = fd[(fd['libretto_or_genre'] != 'Tragicomedy')]

### German binary balanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(gd_lt,
                                 german_features,
                                 'is_real_libretto')

### German binary IMbalanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(gd_lt,
                                 german_features,
                                 'is_real_libretto',
                                    balanced=False)

### French binary balanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(fd_lt,
                                 french_features,
                                 'is_real_libretto',
                                 lang='French')

### French binary IMbalanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(fd_lt,
                                 french_features,
                                 'is_real_libretto',
                                 balanced = False,
                                 lang='French')

### German 4 class balanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(gd_lt,
                                    german_features,
                                    'genre_with_libretto_subgenres')

### German 4 class imbalanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(gd_lt,
                                    german_features,
                                    'genre_with_libretto_subgenres',
                                    balanced=False)

#### French 4 class balanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(fd_lt,
                                 french_features,
                                 'genre_with_libretto_subgenres',
                                 lang='French')

#### French 4 class imbalanced

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    fully_automated_model_assessment(fd_lt,
                                 french_features,
                                 'genre_with_libretto_subgenres',
                                 lang='French',
                                 balanced=False)