In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

import re

In [9]:
import nltk

# Fetch the NLTK resources this notebook relies on. nltk.download() is a
# no-op (apart from a log line) when the package is already up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# word_tokenize() needs the Punkt sentence tokenizer, which was never
# downloaded here, so a fresh environment failed with LookupError on the
# first tokenization. Both resource names are requested because the name
# NLTK looks up depends on the installed release ('punkt_tab' on recent
# versions, 'punkt' on older ones).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eliana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eliana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
def lemma_tokenize(doc):
    """Split ``doc`` with NLTK's ``word_tokenize`` and lemmatize every token.

    Each token is handed to ``WordNetLemmatizer.lemmatize`` on its own (no
    part-of-speech argument is supplied), so the result has exactly one entry
    per token produced by ``word_tokenize``.

    Args:
        doc: Text to tokenize.

    Returns:
        list: The lemmatized tokens, in their original order.
    """
    # A new lemmatizer is built on every call; only its bound method is needed.
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, word_tokenize(doc)))

def preprocess(text):
  """Lowercase ``text`` and remove the tokens that appear in ``en_stop``.

  The input is coerced with ``str()``, lowercased and stripped, then split on
  whitespace. Tokens are compared to ``en_stop`` exactly as split, so a token
  with punctuation attached is only dropped if that exact string is in the set.

  Args:
      text: Value to clean; anything accepted by ``str()``.

  Returns:
      str: The surviving tokens joined by single spaces.
  """
  # TODO: also try without lowercasing to see how much it changes the results.
  normalized = str(text).lower().strip()
  kept = [word for word in normalized.split() if word not in en_stop]
  return " ".join(kept)

def byte_pair_tokenize(doc):
    """Encode ``doc`` with the tiktoken encoding associated with ``"gpt-4"``.

    Args:
        doc: Text to encode.

    Returns:
        list[str]: The token ids returned by ``encode``, each converted to its
        string form, in encoding order.
    """
    encoder = tiktoken.encoding_for_model("gpt-4")
    # Ids are returned as strings rather than ints.
    return list(map(str, encoder.encode(doc)))

In [11]:
# Load the "yaful/DeepfakeTextDetect" dataset and expose its "train" split
# as a pandas DataFrame for the rest of the notebook.
dataset = load_dataset("yaful/DeepfakeTextDetect")
train_split = dataset["train"]
df = train_split.to_pandas()
# Preview the first five rows.
df.head(5)

Unnamed: 0,text,label,src
0,White girls very rarely date Asian men. Even i...,1,cmv_human
1,I am a 23 year old male Indian American male. ...,1,cmv_human
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human
3,(A) Work part-time in high school; Then go to ...,1,cmv_human
4,When police introduce a new form of speed prev...,1,cmv_human


In [12]:
# One list entry per row of the "text" column.
text_list = df["text"].tolist()
print("nombre de phrases:", len(text_list))

nombre de phrases: 319071


In [13]:
# Join every text with a single space so the whole corpus is tokenized by
# one word_tokenize call, then report total and distinct token counts.
text_strings = " ".join(text_list)
text_tokens = word_tokenize(text_strings)
total_count, unique_count = len(text_tokens), len(set(text_tokens))
print("nombre de token word_tokenize", total_count)
print("nombre de token word_tokenize unique", unique_count)

nombre de token word_tokenize 77635060
nombre de token word_tokenize unique 561754


In [14]:
# Same corpus, tokenized and lemmatized: total and distinct token counts.
# NOTE: this overwrites the `text_tokens` produced by the previous cell.
text_tokens = lemma_tokenize(text_strings)
total_count, unique_count = len(text_tokens), len(set(text_tokens))
print("nombre de token lemma_tokenize", total_count)
print("nombre de token lemma_tokenize unqiues", unique_count)

nombre de token lemma_tokenize 77635060
nombre de token lemma_tokenize unqiues 546609


In [15]:
# Same corpus, byte-pair encoded: total and distinct token-id counts.
# NOTE: this overwrites the `text_tokens` produced by the previous cell.
text_tokens = byte_pair_tokenize(text_strings)
total_count, unique_count = len(text_tokens), len(set(text_tokens))
print("nombre de token byte_pair", total_count)
print("nombre de token byte_pair uniques", unique_count)

nombre de token byte_pair 82438389
nombre de token byte_pair uniques 64150


In [16]:
# Byte-pair encoding after preprocess() has lowercased the corpus and removed
# stop words. The cleaned string is passed straight through rather than stored,
# so no second corpus-sized global is kept alive.
text_tokens = byte_pair_tokenize(preprocess(text_strings))
total_count, unique_count = len(text_tokens), len(set(text_tokens))
print("nombre de token byte_pair without stop_words", total_count)
print("nombre de token byte_pair without stop_words uniques", unique_count)

nombre de token byte_pair without stop_words 50537766
nombre de token byte_pair without stop_words uniques 42906


In [17]:
# Lemmatized tokens after preprocess() has lowercased the corpus and removed
# stop words. The cleaned string is passed straight through rather than stored,
# so no second corpus-sized global is kept alive.
text_tokens = lemma_tokenize(preprocess(text_strings))
total_count, unique_count = len(text_tokens), len(set(text_tokens))
print("nombre de token lemma_tokenize without stop_words", total_count)
print("nombre de token lemma_tokenize without stop_words uniques", unique_count)

nombre de token lemma_tokenize without stop_words 44907900
nombre de token lemma_tokenize without stop_words uniques 469820
