In [None]:
!pip install pandas seaborn matplotlib scikit-learn gensim eli5 tiktoken spacy gensim wordcloud datasets

In [4]:
from datasets import load_dataset

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tiktoken

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, r2_score
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the "yaful/DeepfakeTextDetect" dataset through the `datasets` library
# (presumably downloaded from the Hugging Face Hub on first use, then cached).
dataset = load_dataset("yaful/DeepfakeTextDetect")
# Convert the "train" split to a pandas DataFrame; later cells read the
# "text" and "label" columns from it.
df = dataset["train"].to_pandas()
# Bare last expression so the notebook renders a preview of the frame.
# NOTE(review): in the rows displayed below, label 1 goes with "*_human"
# sources, so 1 appears to mean human-written -- confirm on the dataset card.
df

Unnamed: 0,text,label,src
0,White girls very rarely date Asian men. Even i...,1,cmv_human
1,I am a 23 year old male Indian American male. ...,1,cmv_human
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human
3,(A) Work part-time in high school; Then go to ...,1,cmv_human
4,When police introduce a new form of speed prev...,1,cmv_human
...,...,...,...
319066,Noisy Intermediate-Scale Quantum (NISQ) machin...,1,sci_gen_human
319067,Recent years have seen rising needs for locati...,1,sci_gen_human
319068,The ongoing neural revolution in machine trans...,1,sci_gen_human
319069,Let D be a set of n pairwise disjoint unit dis...,1,sci_gen_human


In [7]:
def byte_pair_tokenize(doc):
    """Tokenize ``doc`` with the byte-pair encoding tiktoken associates with GPT-4.

    Used as the ``tokenizer`` callable of the ``CountVectorizer`` in this
    notebook, which expects a list of string tokens for each document.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of strings, one per BPE token, each being the decimal
        representation of the token id (e.g. ``"9906"``).
    """
    enc = tiktoken.encoding_for_model("gpt-4")
    # By default tiktoken raises ValueError when the text contains a
    # special-token marker such as "<|endoftext|>". Such markers can appear
    # literally in scraped or machine-generated text, and a single occurrence
    # would abort the whole vectorizer fit. `disallowed_special=()` makes them
    # be encoded as ordinary text instead.
    tokens = enc.encode(doc, disallowed_special=())
    return [str(token) for token in tokens]

In [None]:
# Split the dataset:
# - X_train => the texts the model is trained on
# - y_train => the expected output for each entry of X_train
# - X_test => the held-out texts used to measure the model's performance
# - y_test => the expected output for each entry of X_test
RANDOM_STATE = 69  # single seed shared by the split and the classifier
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=RANDOM_STATE
)

# Build the machine-learning model: counts of BPE-token trigrams fed to an
# L2-regularised logistic regression.
model = make_pipeline(
    CountVectorizer(
        tokenizer=byte_pair_tokenize,
        # With a custom tokenizer the default token_pattern is never used and
        # scikit-learn warns about it on fit; None states that explicitly.
        token_pattern=None,
        # Lowercasing is applied to the raw text before the tokenizer runs.
        lowercase=True,
        ngram_range=(3, 3),
    ),
    LogisticRegression(random_state=RANDOM_STATE, solver="liblinear", penalty="l2"),
)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on data it has never seen
y_pred_test = model.predict(X_test)
# classification_report orders classes by sorted label value, so 'IA' is
# attached to label 0 and 'Humain' to label 1.
print("Test Set Classification Report:\n", classification_report(y_test, y_pred_test, target_names=['IA', 'Humain']))
# NOTE(review): R2 is a regression metric; on 0/1 class labels it is hard to
# interpret. Kept for continuity with earlier runs -- prefer the report above.
print("R2 Score: ", r2_score(y_test, y_pred_test))