In [1]:
import os
import pandas as pd
import s3fs
import zipfile
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Construct the S3 endpoint URL
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

In [3]:
# List files from challenge
fs.ls("civel/diffusion/hackathon-minarm-2024/AIVSAI/hack_train.csv")

['civel/diffusion/hackathon-minarm-2024/AIVSAI/hack_train.csv']

In [4]:
# Download datas from the service
PATH_IN = 'civel/diffusion/hackathon-minarm-2024/AIVSAI/hack_train.csv'
fs.download(PATH_IN, 'data/hack_train.csv')

[None]

In [5]:
# Read the csv
df = pd.read_csv('data/hack_train.csv')
df

Unnamed: 0,text,label,src
0,Little disclaimer: this deals with US laws and...,1,cmv_human
1,"Read: Mentally Retarded Downs. See, we've got ...",1,cmv_human
2,"If any of you frequent rbadhistory, there is a...",1,cmv_human
3,"I believe in a flat tax system, where everyone...",1,cmv_human
4,"Edit: Ok guy's, my views have been changed on ...",1,cmv_human
...,...,...,...
56814,We consider the recovery of a source term f (x...,1,sci_gen_human
56815,"Self-supervised learning (SlfSL), aiming at le...",1,sci_gen_human
56816,Recurrent neural networks (RNNs) have achieved...,1,sci_gen_human
56817,Deep reinforcement learning (DRL) is a booming...,1,sci_gen_human


In [6]:
# Create a Dataset from the data
data_dict = {
    "texts": df['text'],
    "labels": df['label']
}
hf_dataset = Dataset.from_dict(data_dict)

In [7]:
# Retrieve the texts and labels from the dataset.
texts = hf_dataset["texts"]
labels = hf_dataset["labels"]

# Vectorize the texts, which converts the text data into numerical feature vectors.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, np.array(labels), test_size=0.2, random_state=42)

# Create and train the Naive Bayes model
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Model Evaluation
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Précision du modèle Naïf Bayésien : {:.2f}%".format(accuracy * 100))


Précision du modèle Naïf Bayésien : 73.94%


In [8]:
# Predict on set test
y_pred = clf.predict(X_test)

# Print the confusion matrix and the classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Matrice de confusion:\n", conf_matrix)
print("\nRapport de classification:\n", class_report)


Matrice de confusion:
 [[4714  838]
 [2124 3688]]

Rapport de classification:
               precision    recall  f1-score   support

           0       0.69      0.85      0.76      5552
           1       0.81      0.63      0.71      5812

    accuracy                           0.74     11364
   macro avg       0.75      0.74      0.74     11364
weighted avg       0.75      0.74      0.74     11364

