## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch 
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler, Dataset

# Hugging Face Transformers 
from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import TrainerCallback
from transformers import Trainer
from transformers import TrainingArguments


# Scikit-learn packages for modeling and evaluation
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


# Optional GPU housekeeping (intentionally disabled).
# A previous revision kept a `free_gpu_cache()` helper here inside a bare triple-quoted
# string. As the last expression of this cell the string was echoed as cell output, and
# the helper depended on extra packages (GPUtil, numba) that nothing else in the notebook
# uses. If cached GPU memory must be released between runs, call
# `torch.cuda.empty_cache()` explicitly in the cell that needs it.

### Data Loading

In [None]:
# Root of the Kaggle input dataset. File names are appended to this string directly
# by the loading loop (f"{base_path}{file_name}"), so the trailing slash is required.
base_path = '/kaggle/input/projet-ter/'  # Update this path if it's different

In [None]:
# CSV exports to load, one per media source. Names are relative to `base_path`.
# Both website and YouTube-channel exports are listed; the loading loop decides
# which ones are kept based on the content of their 'source' column.
file_names = [
    "articles_24 Heures au BCnin.www.24haubenin.info_.csv",
    "articles_Africa 24.www.youtube.com_channel_UCmEcEP_oCZJ6Mr1uxhUFyRg.csv",
    "articles_Agence de Presse Sngalaise Youtube.www.youtube.com_channel_UC8uoOv4RSzdZKlmPTguYjtg.csv",
    "articles_Agence de Presse Sngalaise.www.aps.sn_.csv",
    "articles_Banouto Media.www.youtube.com_channel_UCwsXkG4LatsF7u_b8dU1p-w.csv",
    "articles_Banouto.www.banouto.bj_.csv",
    "articles_Burkina24 Youtube.www.youtube.com_channel_UCJtaDORHQO20XA-tFwpJysQ.csv",
    "articles_Burkina24.burkina24.com_.csv",
    "articles_Dakaractu TV HD.www.youtube.com_channel_UCG0t6XiAHui-ziz7SwFTN0g.csv",
    "articles_Dakaractu.www.dakaractu.com_.csv",
    "articles_Fraternit.www.fraternitebj.info_.csv",
    "articles_JeuneAfrique Youtube.www.youtube.com_channel_UCWkbzzrku8lwKK6DoBl4yTg.csv",
    "articles_JeuneAfrique.www.jeuneafrique.com_.csv",
    "articles_LObs.www.lobs.sn_.csv",
    "articles_La Nation.lanation.bj_.csv",
    "articles_La Nouvelle Tribune.lanouvelletribune.info_.csv",
    "articles_Le Matinal.groupelematinal.com_category_actualites_.csv",
    "articles_Le Quotidien.lequotidien.sn_.csv",
    "articles_ORTB.www.youtube.com_channel_UCmPXzeJaO7nrA87GIz3N6wQ.csv",
    "articles_RTB - Radiodiffusion Tlvision du Burkina.www.youtube.com_channel_UCZl9utbYlPMssMhgrGUqXZA.csv",
    "articles_SIKKA TELEVISION.www.youtube.com_channel_UCplwKOWLV8s2XZBMsimOjvg.csv",
    "articles_Senegal7.www.youtube.com_channel_UC5eVGjO4ITJA1KM_tva_OSQ.csv",
    "articles_Sud Quotidien TV.www.youtube.com_channel_UCD-YykHgK3BOvnqFlcOFXIA.csv",
    "articles_Sud Quotidien.www.sudquotidien.sn_.csv",
    "articles_TFM (Tl Futurs Medias).www.youtube.com_channel_UC5NQ49FVRIAuWE1el6L2gkg.csv",
    "articles_aCotonou Youtube.www.youtube.com_channel_UCdNz-U4WJwFvbMDMdgYNMrg.csv",
    "articles_aCotonou.news.acotonou.com_.csv",
    "articles_le faso.lefaso.net_.csv",
    "articles_le soleil.lesoleil.sn_.csv",
]

# Collect one DataFrame per source file that does not originate from YouTube.
dfs = []

for file_name in file_names:
    # Only CSV exports are expected; anything else is skipped defensively.
    if not file_name.endswith('.csv'):
        continue

    file_path = f"{base_path}{file_name}"
    source_df = pd.read_csv(file_path)

    # Files without a 'source' column cannot be checked and are left out.
    if 'source' not in source_df.columns:
        continue

    # regex=False matches the literal substring (a regex '.' would match any character);
    # na=False treats missing sources as "not YouTube" instead of propagating NaN.
    is_youtube = source_df['source'].str.contains('youtube.com', regex=False, na=False)
    if not is_youtube.any():
        dfs.append(source_df)

# Fail with an actionable message instead of pandas' generic "No objects to concatenate".
if not dfs:
    raise ValueError(
        "No non-YouTube source file with a 'source' column was loaded; "
        "check base_path and file_names."
    )

# Concatenate everything into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

print(f"\nTotal number of records in the combined dataframe: {len(df)}")

In [None]:
# Preview the first rows of the combined frame built by the loading cell.
df.head()

## Data Cleaning / Preprocessing

In [None]:
# Annotation columns inspected during cleaning.
cols_of_interest = [
    'CLS_Lexique Sécurité Alimentaire',
    'CLS_Relevance',
    'CLS_Usefulness',
    'CLS_Relevance Yes/No',
]

# Show the distinct values present in each annotation column.
for column_name in cols_of_interest:
    print(f"Unique values in '{column_name}': {df[column_name].unique()}")

In [None]:
# Annotation columns inspected during cleaning.
cols_of_interest = [
    'CLS_Lexique Sécurité Alimentaire',
    'CLS_Relevance',
    'CLS_Usefulness',
    'CLS_Relevance Yes/No',
]

# Frequency table per annotation column; NaN is counted as its own category.
for column_name in cols_of_interest:
    counts = df[column_name].value_counts(dropna=False)
    # Header line, the table, then one blank line as a visual separator.
    print(f"Counts of unique values in '{column_name}':", counts, "", sep="\n")


In [None]:
# Keep only the rows whose binary relevance annotation is 'No (u)' or 'Yes (u)'.
filtered_df = df.loc[
    lambda frame: frame['CLS_Relevance Yes/No'].isin(['No (u)', 'Yes (u)'])
]

filtered_df.head()

In [None]:
# Load Ivan's manual relevance annotations (semicolon-separated export).
# The path is derived from `base_path` so relocating the dataset needs a single change.
# The whole preparation is one method chain that returns a fresh frame, which avoids
# writing into a column-sliced frame (SettingWithCopyWarning).
ivan_df = (
    pd.read_csv(f"{base_path}pertinence_Ivan.csv", sep=';')
    .dropna(subset=['label'])            # rows without a label cannot be used for training
    .loc[:, ['id', 'label', 'text']]     # keep only the columns shared by all label sources
    .astype({'label': int})
)

ivan_df.head()

In [None]:
# Load Guilhem's manual relevance annotations (semicolon-separated export).
# The path is derived from `base_path` so relocating the dataset needs a single change.
# The annotation column is renamed to 'label' to match the other label sources.
# Labels are left as read here; missing values and type conversion are handled later.
guilhem_df = (
    pd.read_csv(f"{base_path}PERTINENCE MAIN GUILHEM.csv", sep=';')
    .loc[:, ['PERTINENCE MAIN GUILHEM', 'id', 'text']]
    .rename(columns={'PERTINENCE MAIN GUILHEM': 'label'})
)

guilhem_df.head()

In [None]:
# Reduce the filtered frame to (label, id, text) and encode the annotation as 0/1.
filtered_df = (
    filtered_df[['CLS_Relevance Yes/No', 'id', 'text']]
    .rename(columns={'CLS_Relevance Yes/No': 'label'})
    .assign(label=lambda frame: frame['label'].map({'No (u)': 0, 'Yes (u)': 1}))
)

In [None]:
# Preview the reduced frame: columns are now label (0/1), id and text.
filtered_df.head()

In [None]:
# Stack the three label sources; the order (Guilhem, Ivan, CLS) matters for any later
# "keep first" de-duplication.
labeled_df = pd.concat([guilhem_df, ivan_df, filtered_df], ignore_index=True)

# Rows belonging to an id that carries more than one distinct label value.
duplicates = labeled_df.groupby('id').filter(
    lambda group: len(group['label'].unique()) > 1
)

if duplicates.empty:
    print("No duplicate ids with conflicting labels found.")
else:
    print("There are duplicate ids with different labels:")
    print(duplicates)

In [None]:
# Drop unlabeled rows BEFORE de-duplicating. With keep='first', an id whose first
# occurrence has a missing label would otherwise win over a later, labeled occurrence,
# and the id would be lost entirely once missing labels are removed afterwards.
# Among the remaining rows the first occurrence per id is kept, i.e. the concatenation
# order of the label sources decides which label wins on conflict.
labeled_df = (
    labeled_df
    .dropna(subset=['label'])
    .drop_duplicates(subset='id', keep='first')
)

# Sanity check: after de-duplication no id may carry more than one label.
duplicates_check = labeled_df.groupby('id').filter(lambda x: len(x['label'].unique()) > 1)

if not duplicates_check.empty:
    print("There are still duplicate ids with different labels:")
    print(duplicates_check[['id', 'label']])
else:
    print("No duplicate ids with conflicting labels found after cleanup.")

In [None]:
# Preview the merged, de-duplicated label table.
labeled_df.head()

In [None]:
# Discard rows that have no label, then list the label values that remain.
labeled_df = labeled_df[labeled_df['label'].notna()]
labeled_df['label'].unique()

In [None]:
# The unused illustrative `data` dict that stood here listed the label values
# '0', '1', 'Pertinence main', 0 and 1 — presumably the values observed at this point
# (TODO confirm). The non-numeric 'Pertinence main' rows are removed and the remaining
# labels are cast to int.
# `.assign` builds a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopyWarning.
labeled_df = (
    labeled_df
    .loc[labeled_df['label'] != 'Pertinence main']
    .assign(label=lambda frame: frame['label'].astype(int))
)

labeled_df['label'].unique()

In [None]:
# Make sure 'label' is an integer column (a no-op when it already is).
# `.assign` returns a new frame rather than writing into a possibly sliced one,
# which avoids pandas' SettingWithCopyWarning.
labeled_df = labeled_df.assign(label=labeled_df['label'].astype(int))

# Class balance of the labeled set.
labeled_df['label'].value_counts()

In [None]:
# Project the full article frame onto (label, id, text), using the same column names
# as the labeled table.
combined_df = (
    df.loc[:, ['CLS_Relevance Yes/No', 'id', 'text']]
    .rename(columns={'CLS_Relevance Yes/No': 'label'})
)

combined_df.head()

In [None]:
# Blank out every label in this frame; the 'label' column is kept but holds only None.
combined_df = combined_df.assign(label=None)
combined_df.head()

In [None]:
# Anti-join on (id, text): drop every row whose pair already appears in the labeled table.
labeled_pairs = labeled_df[['id', 'text']].apply(tuple, axis=1)
candidate_pairs = combined_df[['id', 'text']].apply(tuple, axis=1)
combined_df = combined_df[~candidate_pairs.isin(labeled_pairs)]

combined_df.head()

In [None]:
# (rows, columns) of the frame left after removing the already-labeled (id, text) pairs.
combined_df.shape

## Model Initialization

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

### CamemBERT Tokenizer

In [None]:
# Hold out 20% of the labeled articles as the test set. The split is stratified on the
# label so both parts keep the same class ratio, and seeded for reproducibility.
train_val_df, test_df = train_test_split(
    labeled_df,
    test_size=0.2,
    random_state=42,
    stratify=labeled_df['label'],
)

In [None]:
# Tokenizer for the 'camembert-base' checkpoint. `tokenize_dataframe` below reads this
# module-level name, so it must be defined before that function is called.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to an indexable
            container holding one entry per sample. Entries may be tensors
            (tokenizer called with return_tensors="pt") or plain lists.
        labels: Indexable sequence with one class id per sample.

    Each item is a dict with one tensor per encoding field plus a 'labels'
    tensor of dtype torch.long.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value, dtype=None):
        """Return an independent tensor copy of `value`.

        Tensors are copied with clone().detach(); calling torch.tensor() on an
        existing tensor emits a copy-construct UserWarning for every item.
        """
        if torch.is_tensor(value):
            copied = value.clone().detach()
            return copied if dtype is None else copied.to(dtype)
        return torch.tensor(value, dtype=dtype)

    def __getitem__(self, idx):
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        # The number of samples is defined by the labels, not by the encodings.
        return len(self.labels)

def tokenize_dataframe(df):
    """Tokenize the 'text' column of `df` with the module-level `tokenizer`.

    All texts are encoded in one call, padded to a common length and truncated
    to at most 512 tokens. The tokenizer output is returned as PyTorch tensors.
    """
    texts = df['text'].tolist()
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,       # upper bound on the encoded sequence length
        'return_tensors': "pt",  # PyTorch tensors rather than Python lists
    }
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Labels as plain Python lists, one entry per row, in DataFrame order.
train_val_labels = train_val_df['label'].tolist()
test_labels = test_df['label'].tolist()

# Encode the texts of both partitions.
train_val_encodings = tokenize_dataframe(train_val_df)
test_encodings = tokenize_dataframe(test_df)

# Wrap encodings and labels so PyTorch / Trainer can index them sample by sample.
train_val_dataset = TextDataset(train_val_encodings, train_val_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# 90% of the train+validation pool goes to training; validation gets the remainder,
# so the two sizes always add up to the pool size.
num_train_samples = int(0.9 * len(train_val_dataset))
num_val_samples = len(train_val_dataset) - num_train_samples

# Seeded generator so the random train/validation assignment is reproducible.
train_dataset, val_dataset = random_split(
    train_val_dataset,
    [num_train_samples, num_val_samples],
    generator=torch.Generator().manual_seed(42),
)

### Initializing Parameters

In [None]:
def compute_metrics(eval_pred):
    """Compute binary classification metrics for the Trainer.

    Args:
        eval_pred: Pair (logits, labels). The predicted class is the arg-max of
            the logits over the last axis.

    Returns:
        Dict with 'accuracy', 'f1', 'precision', 'recall' (for the positive
        class, label 1) and 'confusion_matrix' as a nested list
        [[TN, FP], [FN, TP]].
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 returns 0.0 (the value the default would also report) without
    # emitting an undefined-metric warning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predictions)

    # Pin the class order so the matrix is always 2x2, even when a batch of
    # predictions or labels contains a single class only.
    conf_mat = confusion_matrix(labels, predictions, labels=[0, 1])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': conf_mat.tolist()  # Convert to list for JSON serialization
    }

class MetricsCallback(TrainerCallback):
    """Print the confusion matrix after every evaluation run.

    The Trainer prefixes every metric returned by `compute_metrics` with the
    evaluation prefix (e.g. 'eval_confusion_matrix'), so the key is matched by
    suffix instead of by exact name.
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # `metrics` defaults to None; there is nothing to report in that case.
        if not metrics:
            return

        for key, value in metrics.items():
            if key == 'confusion_matrix' or key.endswith('_confusion_matrix'):
                print("Confusion Matrix:")
                print(value)

In [None]:
# Two-class sequence classifier loaded from the pretrained 'camembert-base' checkpoint.
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with the
    # best accuracy when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# A single Trainer instance drives training, validation and the later test evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    callbacks=[MetricsCallback()],
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


## WandB and Training 

In [None]:
# Start a Weights & Biases run in project "TER" before training begins.
# NOTE(review): this import sits outside the notebook's imports cell and the entity is
# hard-coded to one account — consider moving the import up and making the entity configurable.
import wandb
wandb.init(project="TER", entity="arisoy10")


In [None]:
trainer.train()  # Fine-tune the model; val_dataset is evaluated after each epoch (evaluation_strategy='epoch')

In [None]:
# Run inference on the validation split. `predictions` is consumed by the
# false-positive inspection in the evaluation section.
predictions = trainer.predict(val_dataset)


### Evaluating

In [None]:
# Predicted class per validation sample = arg-max over the model logits.
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Ground-truth labels, read back from the validation dataset in the same order.
actual_labels = [
    val_dataset[position]['labels'].item()
    for position in range(len(val_dataset))
]

# Positions where the model predicted 1 although the true label is 0.
false_positives_indices = [
    position
    for position, (predicted, actual) in enumerate(zip(predicted_labels, actual_labels))
    if predicted == 1 and actual == 0
]

# Decode each false positive back to text for manual review.
# Special tokens (padding etc.) are stripped.
for idx in false_positives_indices:
    token_ids = val_dataset[idx]['input_ids'].numpy()
    print(f"False Positive Text at index {idx}: {tokenizer.decode(token_ids, skip_special_tokens=True)}")

In [None]:
# Final check on the held-out test split, using the same metrics as validation.
test_results = trainer.evaluate(eval_dataset=test_dataset)

# Report the resulting metric dictionary.
print("Test Performance:", test_results)