In [1]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import unidecode
#import spacy
import numpy as np
import pandas as pd

#nlp = spacy.load("fr_core_news_sm")


def preprocess(text,
               punctuation=False,
               lower_case=True,
               remove_stopwords=False,
               accents=True,
               numbers=True,
               lemmatize=False,
               language='french'):
    """Clean a text string by applying the enabled normalisation steps.

    Every flag switches ONE removal/normalisation step on. The steps always
    run in this order: digits, punctuation, lower-casing, accents,
    stop words, lemmatization.

    Args:
        text: The string to clean.
        punctuation: If True, drop every character found in
            ``string.punctuation``.
        lower_case: If True, lower-case the text.
        remove_stopwords: If True, tokenize the text and drop the NLTK stop
            words of ``language``; the surviving tokens are re-joined with
            single spaces.
        accents: If True, transliterate the text with ``unidecode``.
        numbers: If True, drop every character for which ``str.isdigit``
            is true.
        lemmatize: If True, tokenize the text and lemmatize each token with
            ``WordNetLemmatizer``; tokens are re-joined with single spaces.
        language: Name of the NLTK stop-word list to use.

    Returns:
        The cleaned string.
    """
    if numbers:
        text = ''.join(char for char in text if not char.isdigit())
    if punctuation:
        text = ''.join(char for char in text if char not in string.punctuation)
    if lower_case:
        text = text.lower()
    if accents:
        text = unidecode.unidecode(text)
    if remove_stopwords:
        stop_words = set(stopwords.words(language))
        if accents:
            # The text was transliterated just above, so accented stop words
            # (e.g. "été", "même") would never match it unless the stop list
            # goes through the same transliteration.
            stop_words |= {unidecode.unidecode(word) for word in stop_words}
        tokens = word_tokenize(text)
        text = ' '.join(token for token in tokens if token not in stop_words)
    if lemmatize:
        # NOTE(review): WordNetLemmatizer is built on WordNet, which is an
        # English resource; this step presumably has little effect on French
        # text - confirm before relying on it.
        lemmatizer = WordNetLemmatizer()
        text = ' '.join(
            lemmatizer.lemmatize(token) for token in word_tokenize(text))
    return text


def add_cleaned_column(df):
    """Store the cleaned version of df['text'] in a 'preprocess_data' column.

    Each value of the 'text' column is passed to ``preprocess`` with its
    default flags. The DataFrame is modified in place and also returned.
    """
    cleaned = df['text'].apply(preprocess)
    df["preprocess_data"] = cleaned
    return df


"""def return_token(sentence):
    # Tokeniser la phrase
    doc = nlp(sentence)
    # Retourner le texte de chaque token
    return [X.text for X in doc]"""


"""def return_word_embedding(sentence):
    # Vectoriser la phrase
    doc = nlp(sentence)
    # Retourner le vecteur lié à chaque token
    return [(X.vector) for X in doc]"""


def stopword_count(text):
    """Return how many tokens of *text* are NLTK French stop words.

    The text is split with ``word_tokenize`` and each token is compared
    as-is (no case folding) against ``stopwords.words('french')``.
    """
    french_stops = set(stopwords.words('french'))
    return sum(1 for token in word_tokenize(text) if token in french_stops)


def vocab_richness(text):
    """Return the ratio of distinct tokens to total tokens in *text*.

    Tokens come from ``word_tokenize``. A text that is empty, contains only
    whitespace, or yields no tokens has a richness of 0.0 (instead of
    raising ZeroDivisionError).

    Args:
        text: The string to measure.

    Returns:
        A float: number of distinct tokens divided by number of tokens, or
        0.0 when there are no tokens.
    """
    # Nothing to tokenize: answer directly rather than dividing by zero below.
    if not text or text.isspace():
        return 0.0
    tokens = word_tokenize(text)
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


def features(df):
    """Add the cleaned text and per-word style ratios to *df*.

    Columns written, all computed from ``df['text']``:
        preprocess_data: ``preprocess(text)`` with its default flags.
        unique_word_count: distinct whitespace-separated words divided by
            the number of whitespace-separated words.
        sentences_count: number of '.' characters divided by the number of
            whitespace-separated words.
        stopwords_count: ``stopword_count(text)`` divided by the number of
            whitespace-separated words.
        vocab richness: ``vocab_richness(text)``.

    A text with no whitespace-separated words gets 0.0 in every ratio column
    instead of raising ZeroDivisionError.

    Args:
        df: DataFrame with a 'text' column of strings.

    Returns:
        The same DataFrame object, modified in place.
    """
    def unique_ratio(text):
        words = text.split()
        return len(set(words)) / len(words) if words else 0.0

    def sentence_ratio(text):
        words = text.split()
        return text.count('.') / len(words) if words else 0.0

    def stopword_ratio(text):
        words = text.split()
        return stopword_count(text) / len(words) if words else 0.0

    def richness(text):
        # Guarded here as well so that a wordless text never reaches the
        # division inside vocab_richness.
        return vocab_richness(text) if text.split() else 0.0

    texts = df['text']
    df['preprocess_data'] = texts.apply(preprocess)
    # NOTE(review): the raw word count below was already disabled; it is kept
    # for reference only and no 'word_count' column is produced.
    #df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['unique_word_count'] = texts.apply(unique_ratio)
    df['sentences_count'] = texts.apply(sentence_ratio)
    df['stopwords_count'] = texts.apply(stopword_ratio)
    df['vocab richness'] = texts.apply(richness)
    return df


NameError: name '__file__' is not defined

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from author_style.preprocessing import features
from sklearn.preprocessing import LabelEncoder
from author_style.utils import csv_to_dataframes
from sklearn.model_selection import train_test_split


# Load the corpus and attach the engineered features.
df = csv_to_dataframes(output='p')
df = features(df)

# Columns fed to the model: the text column plus the numeric style features.
# NOTE(review): the `features` shown earlier in this notebook writes
# 'preprocess_data' and 'vocab richness' and no 'word_count'; confirm these
# names against the installed author_style.preprocessing.features.
feature_columns = ['preprocess_text', 'vocab_richness',
                   "word_count", "unique_word_count",
                   "sentences_count", "stopwords_count"]

# Select X and y from the dataframe. Several columns must be requested with
# a list: df['a', 'b'] looks up the single tuple key ('a', 'b') and raises
# KeyError.
X = df[feature_columns]
y = df["author"]

# Encode the author labels as integers.
cat_transformer = LabelEncoder()
y = cat_transformer.fit_transform(y)

# TF-IDF vectorise the text column; the remaining columns pass through as is.
column_trans = ColumnTransformer(
    [('vec', TfidfVectorizer(), 'preprocess_text')], remainder='passthrough')

X_combined = column_trans.fit_transform(X)

# Split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.33, random_state=42)

# Model

nb_model = MultinomialNB()
model_trained = nb_model.fit(X_train, y_train)

print(model_trained.score(X_test, y_test))
