In [None]:
#Load the needed libraries
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
import time

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
import collections
import re
from statsmodels.multivariate.manova import MANOVA


from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, LeaveOneOut,  GridSearchCV, RandomizedSearchCV, cross_val_predict, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, recall_score, classification_report, make_scorer, fbeta_score, confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score, fbeta_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

import xgboost as xgb
from xgboost import XGBClassifier

# General setup
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# chained-assignment warnings -- confirm blanket suppression is intended.
warnings.filterwarnings('ignore')

# Random seed constant. Presumably meant to be passed as `random_state` to
# splits/models for reproducibility; it is not used in the visible cells.
RSEED = 42

In [None]:
def largeformat(data, column):
    """
    Reshape a long-format count table into wide format, one row per play.

    Every unique value found in ``data[column]`` becomes a column of the
    result. For each play (unique 'Title') that column holds the matching
    'Count' value, or 0 when the play has no row for that value.

    Args:
    data (pandas.DataFrame): Long-format input with 'Author', 'Title', 'Count'
        and ``column`` columns, holding at most one row per (Title, value) pair.
    column (str): Name of the column whose unique values become new columns.

    Returns:
    pandas.DataFrame: One row per unique 'Title', in order of first
        appearance, with columns 'Author', 'Title' and one column per unique
        value of ``column``; dtypes are normalised with ``convert_dtypes()``.

    Raises:
    ValueError: If a play has more than one row for the same value, because
        the wide cell would be ambiguous.
    """
    variables = list(pd.unique(data[column]))
    rows = []
    for play in pd.unique(data['Title']):
        subset = data.loc[data['Title'] == play]
        # The author is read from the first row of the play.
        row = [subset['Author'].values[0], play]
        for variable in variables:
            counts = subset.loc[subset[column] == variable, 'Count'].values
            if len(counts) > 1:
                raise ValueError(
                    f"Expected at most one row for Title={play!r} and "
                    f"{column}={variable!r}, found {len(counts)}"
                )
            # A value that never occurs in this play is reported as 0.
            row.append(counts[0] if len(counts) else 0)
        rows.append(row)
    # Build the frame once instead of concatenating inside the loop, which
    # copied the whole accumulated frame on every play.
    dflarge = pd.DataFrame(rows, columns=['Author', 'Title'] + variables)
    return dflarge.convert_dtypes()

In [None]:
def recount(data, column):
    """
    Count, for every play, how often each unique value of ``column`` occurs.

    Rows are grouped by 'Title'. Within each play the number of rows sharing
    the same ``column`` value is counted, and that count is also expressed
    relative to the play's total number of rows (verses). Only the first row
    of each (Title, value) pair is kept.

    Args:
    - data (pandas DataFrame): Input with 'Author', 'Title' and ``column``
      columns, one row per verse. It is not modified.
    - column (str): The name of the column to be analyzed.

    Returns:
    - total (pandas DataFrame): One row per (Title, value) pair, keeping the
      original index label of the pair's first row, with the columns:
        - Author: The author taken from the pair's first row.
        - Title: The title of the play.
        - column: The unique value for which counts were computed.
        - Count: Number of rows of the play holding that value (NaN for
          missing values).
        - RelCount: Count / nVerses.
        - nVerses: Total number of rows of the play.
      Plays appear in order of first appearance in ``data``.
    """
    out_columns = ['Author', 'Title', column, 'Count', 'RelCount', 'nVerses']
    data = data[['Author', 'Title', column]]
    pieces = []
    # sort=False keeps the plays in order of first appearance.
    for _, play_rows in data.groupby('Title', sort=False):
        # Work on an explicit copy so the new columns never touch the input.
        subset = play_rows.copy()
        n_verses = len(subset)
        # Map every row to the number of rows sharing its value in this play.
        occurrences = subset[column].map(subset[column].value_counts())
        subset['Count'] = occurrences
        subset['RelCount'] = occurrences / n_verses
        subset['nVerses'] = n_verses
        pieces.append(subset)
    if not pieces:
        return pd.DataFrame(columns=out_columns)
    # Concatenate once; concatenating inside the loop re-copied the whole
    # accumulated frame on every play.
    total = pd.concat(pieces)[out_columns]
    return total.drop_duplicates(subset=['Title', column])

In [None]:
def nice_scores(y_train, y_predictions_dict):
    """
    Build a score table comparing several models' predictions to the truth.

    Args:
    y_train (array-like): The true labels the predictions are compared to.
    y_predictions_dict (dict): Maps a model name to that model's predicted
        labels.

    Returns:
    A Pandas DataFrame indexed by model name, one row per entry of
    ``y_predictions_dict``, with every score rounded to 3 decimals:
        - 'FBeta': The F-beta score with beta=0.5.
        - 'Accuracy': The accuracy score.
        - 'Recall': The recall score.
        - 'Precision': The precision score.
    """
    # Column name -> scorer; insertion order fixes the column order.
    scorers = {
        'FBeta': lambda y_pred: fbeta_score(y_train, y_pred, beta=0.5),
        'Accuracy': lambda y_pred: accuracy_score(y_train, y_pred),
        'Recall': lambda y_pred: recall_score(y_train, y_pred),
        'Precision': lambda y_pred: precision_score(y_train, y_pred),
    }
    scores = pd.DataFrame(columns=list(scorers))

    for name, y_pred in y_predictions_dict.items():
        # Each model becomes one row labelled with its name.
        scores.loc[name] = [round(score(y_pred), 3) for score in scorers.values()]

    return scores

In [None]:
# Load the input file (the corpus CSV) into the module-level DataFrame `df`,
# which the cells below read from.
entry = 'data/corpus.csv'
df = pd.read_csv(entry)

In [None]:
# Pie chart of the number of distinct titles per author.
# Author's observation: the dataframe holds many authors, but only two of them
# carry a similar weight (NOTE(review): data-dependent, verify on the plot).
import matplotlib as mpl

# Set the global matplotlib font size (applies to every later figure too).
mpl.rcParams['font.size'] = 12

plt.figure(figsize=(26,22))
# Count the unique 'Title' values for each 'Author' and draw them as a pie.
df.groupby('Author')['Title'].nunique().plot(kind='pie', legend =True)
# Anchor the legend's upper-left corner just beyond the axes' top-right corner
# so it does not overlap the pie.
plt.legend(bbox_to_anchor=(1.02, 1.02), loc='upper left')

In [None]:
# Bar chart of the number of rows in `df` per author (per the plot title,
# each row is presumably one verse -- confirm against the corpus schema).
plt.figure(figsize=(26,12))

# countplot draws one bar per distinct 'Author', sized by its row count.
ax = sns.countplot(data=df, x='Author')

# Rotate the author names so long labels do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

ax.set_title("Number of Verses per Author", fontsize=22)
ax.set_xlabel("Authors", fontsize=18)
ax.set_ylabel("Vers Count", fontsize=18)
plt.show()