## utils


In [1]:
import os
import pandas as pd
import csv


def csv_to_dataframes(output='ps'):
    ''' Returns 2 dataframes

    Extracts 1 dataframe with paragraphs and 1 dataframe with
    sentences from a csv file. The csv files names' are parsed
    assuming the following syntax:
    "author_name - title - publication_date.csv"
    '''
    ##########################b#####################
    ###y####  convert csv to df_paragraphs  ########
    ################################################

    # Get csv path ; the csv files are arrays of pre-selected* paragraphs
    # that were extracted from raw txt files by * (cf. Lilou)
    csv_path= "/Users/cyrielle/code/Cyr-dcx/author_style/author_style/data/comp_aut"


    # Create a list of book names
    books = [
        csv_file for csv_file in os.listdir(csv_path)
        if csv_file.endswith('.csv')]


    # Parsing csv file names to get author names, book titles and publishing date
    # and putting these elements in lists that have the same index as the list 'books'
    authors = [csv_file.split(' ')[0]+' '+csv_file.split(' ')[1] for csv_file in books]
    titles = [csv_file.split(' - ')[1] for csv_file in books]
    book_dates = [csv_file.split(' - ')[2].replace('.csv','') for csv_file in books]

    # Initializing a list of dataframes
    dfs = []

    # For each book (in the list 'books'),
    ## 1. create a dataframe with 1 paragraph per row
    ## 2. create columns with fixed values for other features than text
    ## 3. append the dataframe in the list 'dfs' of dataframes
    ## containing the paragraphs from all books

    for book in books:
        ## 1.
        df_temp = pd.read_csv(os.path.join(csv_path,book), header=None)
        ## 2.
        df_temp['author'] = authors[books.index(book)]
        df_temp['title'] = titles[books.index(book)]
        df_temp['book_date'] = book_dates[books.index(book)]
        ## 3.
        dfs.append(df_temp)

    ## Concatenate all dataframes in 'dfs' to get
    ## a single dataframe with paragraphs from all books
    df_paragraphs = pd.concat([df for df in dfs], ignore_index = True, axis=0)
    df_paragraphs.rename(mapper={0:"text"}, axis=1, inplace=True) # NB: The column name for the root_path text is explicitly called in a preprocessing function, it must be 'text'

    ###############y########################################
    ########  convert df_paragraphs to df_sentences  #######
    #######################################b################

    # Initializing a list of dataframes
    dfs = []

    # For each paragraph of our dataset (i.e. for each row in df_paragraph):
    for i in range(df_paragraphs['text'].count()):

        # Separate sentences with '. ' as a delimiter
        # (careful: "J. C.", "Mr.", [...]) ignore ?
        sentences = str(df_paragraphs.text[i]).split(". ")

        # Prepare columns with fixed values for Author_name, Title and Book_date,
        # to assign each sentence of a paragraph to the same Author_name, Title and Book_date.
        author_temp = [df_paragraphs.author[i] for k in range(len(sentences))]
        title_temp = [df_paragraphs.title[i] for k in range(len(sentences))]
        date_temp = [df_paragraphs.book_date[i] for k in range(len(sentences))]

        # Concatenate the 4 previous lists to build a single dataframe
        # containing all sentences of the i-th paragraph of df_paragraphs
        data = [sentences, author_temp, title_temp, date_temp]
        df_temp = pd.DataFrame(data).T

        # Build the list of dataframes containing all sentences of our dataset
        dfs.append(df_temp)

    # Assemble the dataframe containing all sentences of our dataset
    df_sentences = pd.concat(dfs, ignore_index = True, axis=0)
    df_sentences.rename(mapper={0:"text", 1: 'author', 2:'title', 3 : 'book_date'}, axis=1, inplace=True)

    if output == 'p':
        return df_paragraphs
    if output == 's':
        return df_sentences
    if output == 'ps':
        return df_paragraphs, df_sentences



In [2]:
df = csv_to_dataframes(output="p")

In [3]:
df

Unnamed: 0,text,author,title,book_date
0,Pendant que Nous franchissions la porte du Nor...,GUTH Paul,Si j_étais le Bon Dieu,1987
1,"« En 486 après Jésus-Christ, les troupes de Sy...",GUTH Paul,Si j_étais le Bon Dieu,1987
2,Van Eyck présenta La Vierge au chanoine Van de...,GUTH Paul,Si j_étais le Bon Dieu,1987
3,"Un an après l’insolence du soldat, Clovis rass...",GUTH Paul,Si j_étais le Bon Dieu,1987
4,Les hommes se font une idée grotesque du temps...,GUTH Paul,Si j_étais le Bon Dieu,1987
...,...,...,...,...
17774,C’est à bord d’un train de la Southern Pacific...,ECHENOZ Jean,Ravel,2006
17775,"Quelle que soit, pour signer, la solution adop...",ECHENOZ Jean,Ravel,2006
17776,Reste la possibilité d’aller faire un tour dan...,ECHENOZ Jean,Ravel,2006
17777,"Le lendemain matin, il se lève tard, traînant ...",ECHENOZ Jean,Ravel,2006


## preprocessing 

In [4]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import unidecode
#import spacy
import numpy as np
import pandas as pd

#nlp = spacy.load("fr_core_news_sm")


def preprocess(text,
               punctuation=False,
               lower_case=True,
               remove_stopwords=False,
               accents=True,
               numbers=True,
               lemmatize=False,
               language='french'):

    if numbers:
        text = ''.join(char for char in text if not char.isdigit())
    if punctuation:
        text = ''.join(char for char in text if not char in string.punctuation)
    if lower_case:
        text = text.lower()
    if accents:
        text = unidecode.unidecode(text)
    if remove_stopwords:
        stop_words = set(stopwords.words(language))
        word_tokens = word_tokenize(text)
        text = ' '.join(char for char in word_tokens if not char in stop_words)
    if lemmatize:
        text = word_tokenize(text)
        lemmatizer = WordNetLemmatizer()
        lemmatized = [lemmatizer.lemmatize(char) for char in text]
        text = ' '.join(lemmatized)
    return text


def add_cleaned_column(df):
    df["preprocess_data"] = df['text'].apply(lambda x: preprocess(x))
    return df


"""def return_token(sentence):
    # Tokeniser la phrase
    doc = nlp(sentence)
    # Retourner le texte de chaque token
    return [X.text for X in doc]"""


"""def return_word_embedding(sentence):
    # Vectoriser la phrase
    doc = nlp(sentence)
    # Retourner le vecteur lié à chaque token
    return [(X.vector) for X in doc]"""


def stopword_count(text):
    stop_words = set(stopwords.words('french'))
    word_tokens = word_tokenize(text)
    stopword_count = len([w for w in word_tokens if w in stop_words])
    return stopword_count


def vocab_richness(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    if total_length > 0:
        return unique_word_length / total_length
    else:
        return 0

def sentence_count(x):
    if len(x.split()) >0:
        return x.count('.') / len(x.split())
    else:
        return 0


def features(df, output='p'):
    if output=='p':

        df['preprocess_data'] = df['text'].apply(lambda x: preprocess(x))
        df['word_ratio'] = df['text'].apply(lambda x: len(x.split()))
        df['unique_word'] = df['text'].apply(
        lambda x: 0 if len(x.split())==0 else (len(np.unique(x.split()))/ len(x.split())))

        df['sentences_ratio'] = df['text'].apply(lambda x: 0 if len(x.split())==0 else x.count('.') / len(x.split()))
        df['stopwords_ratio'] = df['text'].apply(lambda x: 0 if len(x.split(
    )) == 0 else (stopword_count(x) / len(x.split())))
        df['vocab richness'] = df['text'].apply(vocab_richness)
        return df

    elif output=='s':
        df['preprocess_data'] = df['text'].apply(lambda x: preprocess(x))
        df['word_ratio'] = df['text'].apply(lambda x: len(x.split()))
        df['unique_word_ratio'] = df['text'].apply(
        lambda x: 0 if len(x.split())==0 else (len(np.unique(x.split()))/ len(x.split())))

        df['stopwords_ratio'] = df['text'].apply(lambda x: 0 if len(x.split(
    )) == 0 else (stopword_count(x) / len(x.split())))
        df['vocab richness'] = df['text'].apply(vocab_richness)
        return df


In [5]:
df = features(df, output='p')

In [6]:
df

Unnamed: 0,text,author,title,book_date,preprocess_data,word_ratio,unique_word,sentences_ratio,stopwords_ratio,vocab richness
0,Pendant que Nous franchissions la porte du Nor...,GUTH Paul,Si j_étais le Bon Dieu,1987,pendant que nous franchissions la porte du nor...,235,0.672340,0.051064,0.463830,0.533557
1,"« En 486 après Jésus-Christ, les troupes de Sy...",GUTH Paul,Si j_étais le Bon Dieu,1987,"<< en apres jesus-christ, les troupes de syag...",261,0.685824,0.030651,0.467433,0.546584
2,Van Eyck présenta La Vierge au chanoine Van de...,GUTH Paul,Si j_étais le Bon Dieu,1987,van eyck presenta la vierge au chanoine van de...,244,0.688525,0.049180,0.422131,0.537975
3,"Un an après l’insolence du soldat, Clovis rass...",GUTH Paul,Si j_étais le Bon Dieu,1987,"un an apres l'insolence du soldat, clovis rass...",218,0.848624,0.045872,0.284404,0.658451
4,Les hommes se font une idée grotesque du temps...,GUTH Paul,Si j_étais le Bon Dieu,1987,les hommes se font une idee grotesque du temps...,214,0.649533,0.051402,0.453271,0.533835
...,...,...,...,...,...,...,...,...,...,...
17774,C’est à bord d’un train de la Southern Pacific...,ECHENOZ Jean,Ravel,2006,c'est a bord d'un train de la southern pacific...,70,0.871429,0.028571,0.428571,0.703297
17775,"Quelle que soit, pour signer, la solution adop...",ECHENOZ Jean,Ravel,2006,"quelle que soit, pour signer, la solution adop...",63,0.857143,0.031746,0.412698,0.780822
17776,Reste la possibilité d’aller faire un tour dan...,ECHENOZ Jean,Ravel,2006,reste la possibilite d'aller faire un tour dan...,64,0.828125,0.046875,0.562500,0.619565
17777,"Le lendemain matin, il se lève tard, traînant ...",ECHENOZ Jean,Ravel,2006,"le lendemain matin, il se leve tard, trainant ...",60,0.866667,0.016667,0.383333,0.785714


## pipeline


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df = csv_to_dataframes(output='p')
df = features(df, output='p')

#selection de X et y dans le dataframe df
X = df[['preprocess_data','unique_word',
        'word_ratio','sentences_ratio',
        'stopwords_ratio','vocab richness']]
y = df["author"]

# Encode categorical variables
cat_transformer = LabelEncoder()
y = cat_transformer.fit_transform(y)


#split date
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.30, random_state=42)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2)),
    'nb__alpha': (0.1,1),}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score

cv_result = cross_val_score(MultinomialNB(),X_combined,y, cv=10, groups=y)


In [None]:
cv_result.mean()

## predict

In [None]:
def csv_to_dataframes_ajar(output='ps'):
    ''' Returns 2 dataframes

    Extracts 1 dataframe with paragraphs and 1 dataframe with
    sentences from a csv file. The csv files names' are parsed
    assuming the following syntax:
    "author_name - title - publication_date.csv"
    '''
    ################################################
    ########  convert csv to df_paragraphs  ########
    ################################################

    # Get csv path ; the csv files are arrays of pre-selected* paragraphs
    # that were extracted from raw txt files by * (cf. Lilou)
    csv_path= "/Users/cyrielle/code/Cyr-dcx/author_style/author_style/data/txt_ajar/"


    # Create a list of book names
    books = [
        csv_file for csv_file in os.listdir(csv_path)
        if csv_file.endswith('.csv')]


    # Parsing csv file names to get author names, book titles and publishing date
    # and putting these elements in lists that have the same index as the list 'books'
    authors = [csv_file.split(' ')[0]+' '+csv_file.split(' ')[1] for csv_file in books]
    titles = [csv_file.split(' - ')[1] for csv_file in books]
    book_dates = [csv_file.split(' - ')[2].replace('.csv','') for csv_file in books]

    # Initializing a list of dataframes
    dfs = []

    # For each book (in the list 'books'),
    ## 1. create a dataframe with 1 paragraph per row
    ## 2. create columns with fixed values for other features than text
    ## 3. append the dataframe in the list 'dfs' of dataframes
    ## containing the paragraphs from all books

    for book in books:
        ## 1.
        df_temp = pd.read_csv(os.path.join(csv_path,book), header=None)
        ## 2.
        df_temp['author'] = authors[books.index(book)]
        df_temp['title'] = titles[books.index(book)]
        df_temp['book_date'] = book_dates[books.index(book)]
        ## 3.
        dfs.append(df_temp)

    ## Concatenate all dataframes in 'dfs' to get
    ## a single dataframe with paragraphs from all books
    df_paragraphs = pd.concat([df for df in dfs], ignore_index = True, axis=0)
    df_paragraphs.rename(mapper={0:"text"}, axis=1, inplace=True) # NB: The column name for the root_path text is explicitly called in a preprocessing function, it must be 'text'

    ########################################################
    ########  convert df_paragraphs to df_sentences  #######
    #######################################b################

    # Initializing a list of dataframes
    dfs = []

    # For each paragraph of our dataset (i.e. for each row in df_paragraph):
    for i in range(df_paragraphs['text'].count()):

        # Separate sentences with '. ' as a delimiter
        # (careful: "J. C.", "Mr.", [...]) ignore ?
        sentences = str(df_paragraphs.text[i]).split(". ")

        # Prepare columns with fixed values for Author_name, Title and Book_date,
        # to assign each sentence of a paragraph to the same Author_name, Title and Book_date.
        author_temp = [df_paragraphs.author[i] for k in range(len(sentences))]
        title_temp = [df_paragraphs.title[i] for k in range(len(sentences))]
        date_temp = [df_paragraphs.book_date[i] for k in range(len(sentences))]

        # Concatenate the 4 previous lists to build a single dataframe
        # containing all sentences of the i-th paragraph of df_paragraphs
        data = [sentences, author_temp, title_temp, date_temp]
        df_temp = pd.DataFrame(data).T

        # Build the list of dataframes containing all sentences of our dataset
        dfs.append(df_temp)

    # Assemble the dataframe containing all sentences of our dataset
    df_sentences = pd.concat(dfs, ignore_index = True, axis=0)
    df_sentences.rename(mapper={0:"text", 1: 'author', 2:'title', 3 : 'book_date'}, axis=1, inplace=True)

    if output == 'p':
        return df_paragraphs
    if output == 's':
        return df_sentences
    if output == 'ps':
        return df_paragraphs, df_sentences


In [None]:
data_ajar = csv_to_dataframes_ajar(output='p')

In [None]:
data_ajar

In [None]:
data_ajar = features(data_ajar, output='p')

In [None]:
X = df[['preprocess_data','unique_word',
        'word_ratio','sentences_ratio',
        'stopwords_ratio','vocab richness']]

# transform X features
column_trans = ColumnTransformer(
    [('vec', TfidfVectorizer(), 'preprocess_data')], remainder='passthrough')

X_combined = column_trans.fit_transform(X[[
    'preprocess_data','unique_word',
    'word_ratio','sentences_ratio',
    'stopwords_ratio','vocab richness'
]])

In [5]:
df.author.value_counts()/pd.DataFrame(y).shape[0]

NameError: name 'y' is not defined

In [None]:
sol = pd.Series(nb_model.predict(X_combined))

In [None]:
sol.value_counts()

In [None]:
cat_transformer.inverse_transform(np.array((10,)))

In [None]:
cat_transformer.inverse_transform(np.array((22,)))

In [None]:
X_test = ("Ils s’arrachaient à leurs armoires à glace où ils étaient en train de scruter leurs visages. Se soulevaient sur leurs lits : « C’est servi, c’est servi », disait-elle. Elle rassemblait à table la famille, chacun caché dans son antre, solitaire, hargneux, épuisé. « Mais qu’ont-ils donc pour avoir l’air toujours vannés ? » disait-elle quand elle parlait à la cuisinière. Elle parlait à la cuisinière pendant des heures, s’agitant autour de la table, s’agitant toujours, préparant des potions pour eux ou des plats, elle parlait, critiquant les gens qui venaient à la maison, les amis : « et les cheveux d’une telle qui vont foncer, ils seront comme ceux de sa mère, et droits ; ils ont de la chance, ceux qui n’ont pas besoin de permanente ». – « Mademoiselle a de beaux cheveux », disait la cuisinière, « ils sont épais, ils sont beaux malgré qu’ils ne bouclent pas ». – « Et un tel, je suis sûre qu’il ne vous a pas laissé quelque chose. Ils sont avares, avares tous, et ils ont de l’argent, ils ont de l’argent, c’est dégoûtant. Et ils se privent de tout. Moi, je ne comprends pas ça. » – « Ah ! non, disait la cuisinière, non, ils ne l’emporteront pas avec eux. Et leur fille, elle n’est toujours pas mariée, et elle n’est pas mal, elle a de beaux cheveux, un petit nez, de jolis pieds aussi. » – « Oui, de beaux cheveux, c’est vrai, disait-elle, mais personne ne l’aime, vous savez, elle ne plaît pas. Ah ! C’est drôle vraiment ».Et il sentait filtrer de la cuisine la pensée humble et crasseuse, piétinante, piétinant toujours sur place, ...")