# Poem: ETL & EDA



# 🎓 Library

In [2]:
# Misc
import glob
import os
import numpy as np
import pandas as pd

# Training
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_array
from sklearn.linear_model import LogisticRegression

DATA = "data/"

# Fail fast: every later cell reads from or writes to this directory.
if not os.path.exists(DATA):
    raise FileNotFoundError(
        f"Data directory {DATA} does not exist. "
        "Please create it and add the data files."
    )

# Data Retrieval, Processing, and Storage

Each file contains the list of poems written by a single author.

Each poem has these features:
- Titre: The title of the poem
- Texte: The text of the poem
- Auteur: The author of the poem
- Creation: The date of creation (stored as the string 'None' when unknown)

## Retrieval

In [None]:
# Load every author file. Each file is expected to unpickle to a list of poem
# dictionaries, so `poems_list` holds one list of dictionaries per author.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load .dill files that come from a trusted source.
# The paths are sorted because `glob.glob` returns them in arbitrary order and
# the row order of the final dataset follows the load order.
all_files = sorted(glob.glob(DATA + "de/*.dill"))
poems_list: list[list[dict]] = [pd.read_pickle(file) for file in all_files]
    

## Processing

In [55]:
def transform(list_poems: list) -> pd.DataFrame:
    """
    Build a cleaned DataFrame from the poems of a single author.

    The French keys are renamed to English column names (``title``, ``text``,
    ``author``, ``creation``) and missing creation dates are filled with the
    most frequent creation date among the given poems. The poem texts are not
    modified, so empty lines in the text are kept.

    Args:
        list_poems (list): List of poems; each poem is a dictionary with the
            keys "Titre", "Texte", "Auteur" and "Creation".

    Returns:
        pd.DataFrame: One row per poem. An empty DataFrame is returned when
            the input holds no poems or when none of them has a creation date.
    """

    # Create the dataframe
    df = pd.DataFrame(list_poems).rename(
        columns={"Titre": "title", "Texte": "text", "Auteur": "author", "Creation": "creation"}
    )

    # No poems at all: there is no "creation" column to clean
    if df.empty:
        return pd.DataFrame()

    # Unknown dates are stored as the string 'None'; replace them with NaN
    df["creation"] = df["creation"].replace('None', np.nan)

    # Mode of the date column only (computing it on the whole frame would also
    # scan every poem text). NaN is ignored, so the result is empty when every
    # date is missing.
    date_modes = df["creation"].mode()

    if date_modes.empty:
        print("This author: ", df["author"].iloc[0], "has no prevalent date.")
        # For now, we do not keep him
        return pd.DataFrame()

    # Creation can be NaN, fill it with the prevalent value
    df["creation"] = df["creation"].fillna(date_modes.iloc[0])

    return df

In [56]:
# Clean each author's poems and stack them into a single DataFrame.
# The frames are collected first and concatenated once: calling `pd.concat`
# inside the loop would copy the accumulated rows on every iteration.
author_frames = []
with pd.option_context("future.no_silent_downcasting", True):
    for poems in poems_list:
        df = transform(poems)
        # `transform` returns an empty frame for authors it discards
        if not df.empty:
            author_frames.append(df)

# `pd.concat` raises on an empty list, so fall back to an empty DataFrame
poems_df = pd.concat(author_frames, ignore_index=True) if author_frames else pd.DataFrame()


This author:  Suppius, Christoph Eusebius has no prevalent date.
This author:  Ebeling, Johann Justus has no prevalent date.
This author:  Candidus, Karl has no prevalent date.
This author:  Schmolck, Benjamin has no prevalent date.
This author:  Knorr, Christian has no prevalent date.
This author:  Plavius, Johannes has no prevalent date.
This author:  Neumark, Georg has no prevalent date.
This author:  Theokrit has no prevalent date.
This author:  Greflinger, Georg has no prevalent date.
This author:  Spindler, Christian Gotthold has no prevalent date.
This author:  Anakreon has no prevalent date.
This author:  Arent, Wilhelm (Hg.) has no prevalent date.
This author:  Nietzsche, Friedrich has no prevalent date.
This author:  Scheyb, Franz Christoph von has no prevalent date.
This author:  Schmeltzl, Wolfgang has no prevalent date.
This author:  Beer, Johann has no prevalent date.
This author:  [anonymous] has no prevalent date.
This author:  Sommer, Elise has no prevalent date.
This 

In [44]:
# Number of poems in the combined DataFrame (displayed as the cell's output)
len(poems_df)

73951

## Storage

In [57]:
# Persist the combined poems as a Parquet file inside the data directory,
# without writing the row index.
poems_df.to_parquet(f"{DATA}de_poems.parquet", index=False)

# EDA