In [None]:
!pip uninstall -y pyarrow datasets
!pip install --no-use-pep517 pyarrow
!pip install datasets
!pip install torch transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install scikit-learn

In [90]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import s3fs
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [91]:
# Remote object holding the challenge training set, and the local file it is
# downloaded to. Both are defined once here so every step below agrees on them.
PATH_IN = 'civel/diffusion/hackathon-minarm-2024/AIVSAI/hack_train.csv'
LOCAL_CSV = 'data/hack_train.csv'

# Construct the S3 endpoint URL. AWS_S3_ENDPOINT must be set in the
# environment; a missing variable raises KeyError on this line.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# List the challenge file on the remote store. The return value is discarded:
# this is not the last expression of the cell, so nothing is displayed.
fs.ls(PATH_IN)

# Make sure the local target directory exists, then download the data.
os.makedirs(os.path.dirname(LOCAL_CSV), exist_ok=True)
fs.download(PATH_IN, LOCAL_CSV)

# Read the csv
df = pd.read_csv(LOCAL_CSV)

# Create a Dataset from the data: the 'text' and 'label' columns of the CSV
# are exposed as the "texts" and "labels" columns of the Dataset.
data_dict = {
    "texts": df['text'],
    "labels": df['label']
}
hf_dataset = Dataset.from_dict(data_dict)

In [92]:
# Retrieve the texts and labels from the dataset.
texts = hf_dataset["texts"]
labels = hf_dataset["labels"]
labels_arr = np.array(labels)

# Split row indices BEFORE fitting the vectorizer, so the vocabulary is learned
# from the training rows only and the held-out rows stay unseen during
# preprocessing. test_size and random_state are the same as before.
row_ids = np.arange(len(labels_arr))
train_idx, test_idx = train_test_split(row_ids, test_size=0.2, random_state=42)

# Fit the bag-of-words vocabulary on the training texts only.
vectorizer = CountVectorizer()
vectorizer.fit([texts[i] for i in train_idx.tolist()])

# Convert every text into a numerical count vector using that vocabulary, then
# select the rows belonging to each side of the split.
X = vectorizer.transform(texts)
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = labels_arr[train_idx], labels_arr[test_idx]

In [93]:
# Build a logistic regression with default settings and train it on the
# training split; `fit` returns the estimator, so both steps chain.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred = model.predict(X_test)


In [None]:
# Evaluate the predictions against the held-out labels.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print each heading followed by its report (headings are emitted in French).
report_sections = (
    ("Matrice de confusion:\n", conf_matrix),
    ("\nRapport de classification:\n", class_report),
)
for heading, payload in report_sections:
    print(heading, payload)

Matrice de confusion:
 [[4713  839]
 [ 858 4954]]

Rapport de classification:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      5552
           1       0.86      0.85      0.85      5812

    accuracy                           0.85     11364
   macro avg       0.85      0.85      0.85     11364
weighted avg       0.85      0.85      0.85     11364

