# Approach 1: Zero-shot NLI with XLM-RoBERTa

In [None]:

import pandas as pd
from transformers import pipeline
import torch
from typing import List, Dict, Tuple
import logging
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class NLIActivityEvaluator:
    """Zero-shot NLI screen for artisan-card / farmer-card eligibility.

    Each activity text is scored against two fixed lists of hypotheses with a
    multilingual XNLI model. An activity is REJECTED as soon as one hypothesis
    of either list scores above the threshold; otherwise it is APPROVED.
    """

    def __init__(self, device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
        """Load the zero-shot pipeline and define the screening hypotheses.

        Args:
            device: Device string handed to the transformers pipeline. The
                default is resolved once, when the class is defined.
        """
        self.device = device
        self.classifier = pipeline(
            "zero-shot-classification",
            model="joeddav/xlm-roberta-large-xnli",
            device=self.device
        )

        # Hypotheses are complete sentences; they are passed to the model
        # verbatim (see evaluate_against_criteria).
        self.artisan_hypotheses = [
            "This activity involves manual craftsmanship or artistic creation.",
            "This activity involves traditional material transformation.",
            "This activity is small-scale and quality-focused.",
            "This activity contributes to cultural or heritage preservation.",
            "This activity follows environmental sustainability principles."
        ]

        self.farmer_hypotheses = [
            "This activity is related to agricultural sectors.",
            "This activity involves crop cultivation or livestock farming.",
            "This activity has a structured production cycle.",
            "This activity is integrated into agricultural supply chains.",
            "This activity is suitable for local farming conditions."
        ]

    @staticmethod
    def _clean_field(value) -> str:
        """Return ``value`` as text, mapping None/NaN/blank values to ''."""
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        text = str(value)
        return text if text.strip() else ""

    def _build_text(self, activity, description) -> str:
        """Join activity and description, skipping whichever one is missing.

        Raises:
            ValueError: If both fields are missing or blank.
        """
        parts = [p for p in (self._clean_field(activity), self._clean_field(description)) if p]
        if not parts:
            raise ValueError("Activity and description are both empty")
        return ". ".join(parts)

    def evaluate_against_criteria(
        self,
        text: str,
        hypotheses: List[str],
        threshold: float = 0.7
    ) -> Tuple[bool, Dict[str, float]]:
        """Score ``text`` against ``hypotheses`` with the NLI model.

        Args:
            text: Premise to classify.
            hypotheses: Full-sentence hypotheses to test independently.
            threshold: A hypothesis counts as met when its score is strictly
                greater than this value.

        Returns:
            ``(meets_criteria, scores)`` where ``meets_criteria`` is True when
            at least one hypothesis exceeds the threshold and ``scores`` maps
            each hypothesis to its score.
        """
        if not hypotheses:
            # Nothing to test against; the pipeline rejects an empty label list.
            return False, {}

        results = self.classifier(
            text,
            hypotheses,
            # The hypotheses are already complete sentences. Without this, the
            # pipeline wraps each one in its default "This example is {}."
            # template and the model scores a garbled sentence.
            hypothesis_template="{}",
            multi_label=True
        )

        scores = dict(zip(results['labels'], results['scores']))
        meets_criteria = any(score > threshold for score in results['scores'])

        return meets_criteria, scores

    def evaluate_activity(
        self,
        activity: str,
        description: str,
        threshold: float = 0.7
    ) -> Dict:
        """Evaluate a single activity against the artisan and farmer criteria.

        Args:
            activity: Activity name; may be None/NaN.
            description: Activity description; may be None/NaN.
            threshold: Score above which a hypothesis counts as met.

        Returns:
            A dict with keys ``activity``, ``decision`` ("APPROVED",
            "REJECTED" or "ERROR"), ``reason``, ``artisan_scores``,
            ``farmer_scores`` and ``status``. The score entries are None when
            the evaluation failed, so every result has the same keys.
        """
        try:
            # Missing fields are dropped instead of being rendered as "nan".
            full_text = self._build_text(activity, description)

            is_artisan, artisan_scores = self.evaluate_against_criteria(
                full_text,
                self.artisan_hypotheses,
                threshold
            )
            is_farmer, farmer_scores = self.evaluate_against_criteria(
                full_text,
                self.farmer_hypotheses,
                threshold
            )

            # Qualifying for either card means the activity is rejected here.
            reasons = []
            if is_artisan:
                reasons.append("Qualifies for artisan card")
            if is_farmer:
                reasons.append("Qualifies for farmer card")

            if reasons:
                decision = "REJECTED"
                reason = " and ".join(reasons)
            else:
                decision = "APPROVED"
                reason = None

            return {
                'activity': activity,
                'decision': decision,
                'reason': reason,
                'artisan_scores': artisan_scores,
                'farmer_scores': farmer_scores,
                'status': 'success'
            }

        except Exception as e:
            # One bad row must not abort a whole batch: log it with its
            # traceback and report it as an ERROR result.
            logger.exception("Error evaluating activity '%s': %s", activity, e)
            return {
                'activity': activity,
                'decision': 'ERROR',
                'reason': str(e),
                'artisan_scores': None,
                'farmer_scores': None,
                'status': 'error'
            }

    def evaluate_activities(self, df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
        """Evaluate every row of ``df`` and return one result row per activity.

        Args:
            df: DataFrame with ``activity`` and ``description`` columns.
            threshold: Score above which a hypothesis counts as met.

        Returns:
            DataFrame built from the per-activity result dicts.
        """
        results = []

        for _, row in tqdm(df.iterrows(), total=len(df)):
            results.append(
                self.evaluate_activity(row['activity'], row['description'], threshold)
            )

        return pd.DataFrame(results)

def main(
    input_path: str = '/kaggle/input/activities-hackathon/activities-hackathon/3000-proposition mars 2024.xlsx',
    output_path: str = 'nli_activity_evaluation_results.csv',
):
    """Run the NLI evaluator over an Excel sheet of activities.

    Args:
        input_path: Excel file to read; passed unchanged to the evaluator,
            which reads its ``activity`` and ``description`` columns.
        output_path: CSV file that receives the per-activity results.

    Raises:
        Exception: Any failure while loading, evaluating or saving is logged
            and re-raised.
    """
    try:
        # Loading happens inside the try so a bad path is logged as well.
        sample_data = pd.read_excel(input_path)

        evaluator = NLIActivityEvaluator()
        results_df = evaluator.evaluate_activities(sample_data)

        # Save results with detailed scores
        results_df.to_csv(output_path, index=False)

        # An empty input yields a frame without a 'decision' column; count
        # through an empty Series in that case instead of raising KeyError.
        decisions = results_df.get('decision', pd.Series(dtype=object))

        print("\nEvaluation Summary:")
        print(f"Total activities evaluated: {len(results_df)}")
        print(f"Approved activities: {int((decisions == 'APPROVED').sum())}")
        print(f"Rejected activities: {int((decisions == 'REJECTED').sum())}")
        print(f"Errors: {int((decisions == 'ERROR').sum())}")

    except Exception as e:
        logger.exception("Application error: %s", e)
        raise

if __name__ == "__main__":
    main()


# Approach 2 : Gemini

In [1]:
!pip install langchain_google_genai
!pip install dotenv

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.10-py3-none-any.whl.metadata (3.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting langchain-core<0.4.0,>=0.3.37 (from langchain_google_genai)
  Downloading langchain_core-0.3.37-py3-none-any.whl.metadata (5.9 kB)
Downloading langchain_google_genai-2.0.10-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading langchain_core-0.3.37-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.7/413.7 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: filetype, langchain-core, langchain_google_genai
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.25
    U

In [3]:
import json
import os
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import time

# Set the path to your .env file in Kaggle
env_path = '/kaggle/input/nv-folder-nv-folder/env_folder/.env'
load_dotenv(env_path)

# Verify the API key is loaded
if not os.getenv('GOOGLE_API_KEY'):
    raise ValueError("Google API Key not found in .env file")

# Confirm the key is present without echoing it: notebook output is saved and
# shared together with the code, so printing the secret would leak it.
print("Google API key loaded.")



def is_valid_activity(activity, description):
    """
    Classify one activity and package the outcome as a dictionary.

    Delegates to check_valid_logic and stores its two return values under
    the keys "is_valid" and "ai_explanation", in that order.
    """
    outcome = {}
    outcome["is_valid"], outcome["ai_explanation"] = check_valid_logic(activity, description)
    return outcome

def _extract_json_object(content) -> dict:
    """Return the first JSON object found in a model reply.

    Handles a bare JSON document, a fenced code block (with or without the
    ``json`` tag) and JSON embedded in surrounding prose.

    Raises:
        ValueError: If no JSON object can be decoded from ``content``.
    """
    text = content if isinstance(content, str) else str(content)

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
        decoder = json.JSONDecoder()
        # Try each '{' in turn until one starts a decodable object; this skips
        # over fences and any explanatory text around the JSON.
        start = text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
                break
            except json.JSONDecodeError:
                start = text.find("{", start + 1)

    if not isinstance(parsed, dict):
        raise ValueError("Could not extract valid JSON from the response")
    return parsed


def check_valid_logic(activity, description) -> tuple[str, str]:
    """
    Ask the Gemini model whether an activity passes the regulatory criteria.

    Args:
        activity: Activity name as submitted (Arabic, French or English).
        description: Free-text description of the activity.

    Returns:
        A tuple ``(decision, explanation)``. ``decision`` is "APPROVED",
        "REJECTED" or "LACK OF INFORMATION" as returned by the model, or
        "ERROR" when the call or the parsing of the reply failed.
        ``explanation`` is "Activity approved" for approvals, the model's
        reason otherwise, or the error message on failure.
    """
    load_dotenv()
    
    # Initialize the model
    model = ChatGoogleGenerativeAI(
        model="gemini-pro",
        timeout=60,
        max_retries=5,
    )

    criteria = '''
        ---
        
        **Rejection Conditions (The activity will be rejected if it meets ANY of the following):**
            
        **1. It is eligible for an artisan card** based on the following:  
        - Listed in the official nomenclature of artisanal activities.  
        - Requires manual craftsmanship, artistic creation, or traditional material transformation.  
        - Small-scale, quality-focused, and involves human intervention.  
        - Contributes to cultural or heritage preservation.  
        - Follows environmental sustainability principles.  
        
        **2. It is eligible for a farmer card** based on the following:  
        - Related to recognized agricultural sectors (crop cultivation, livestock farming, horticulture, agroforestry, etc.).  
        - Structured production cycle (planting, maintenance, harvesting, etc.).  
        - Integrated into local or national supply chains.  
        - Suitable for local climate/ecological conditions.

        **3. It requires a Commercial Registry (سجل تجاري)** if it falls under any of these categories:
        
        A. Regulated Professions (المهن المنظمة):
        - Basic professional services (accountant, notary, bailiff, etc.)
        - Liberal professions (doctor, surgeon, veterinarian, real estate agent)
        - Service centers (call centers, internet cafes, party halls)
        - Tourism services (travel agencies, hotels, camping sites)
        - Activities involving hazardous chemicals and materials
        
        B. List of Regulated Activities (قائمة الأنشطة المنظمة):
        - Industrial production (metal pitch, gas, pharmaceuticals, fertilizers, flags/emblems, weapons, tobacco)
        - Wholesale markets (fruits/vegetables, phytosanitary products, veterinary pharmaceuticals, lubricants)
        - Retail markets (beverages, fuel, street vendors, phytosanitary products)
        - Import/Export (tobacco, fuel, phytosanitary pharmaceutical products)
        - Services (hotels, restaurants, cafes, tea rooms, ambulance, public transport, air transport, railways, gyms, nurseries, stamp/seal making, currency exchange)
        ---
    '''

    prompt_template_messages = [
        ("human", """
        As an expert evaluator of activities based on regulatory criteria:
        
        You will receive an activity suggestion formatted as follows:
        ['Activity':{{}}, 'Description':{{}}]
        
        Possible languages: Arabic, French, and English.
        
        Your task is to evaluate the suggested activity based on the following regulatory criteria:
        {criteria}
        
        **IMPORTANT: Your decision MUST be EXACTLY one of these three values:**
        - "APPROVED"
        - "REJECTED"
        - "LACK OF INFORMATION"
        Any other response format or value is not acceptable.
        
        **Decision Rules:** 
        1. If the activity does not qualify for any of: artisan card, farmer card, or commercial registry → "APPROVED"
        2. If the activity qualifies for any of the above → "REJECTED"
        3. If the activity name and description are not clear enough to make a decision → "LACK OF INFORMATION"
          
        **Output Format:**
        You MUST return a JSON object with exactly this structure:
        {{
            "decision": "APPROVED" | "REJECTED" | "LACK OF INFORMATION",
            "reason": "Detailed explanation (only required for REJECTED decision)"
        }}
        
        **Now, evaluate the following activity suggestion:**  
        {activity_suggestion}
        """)
    ]


    prompt_template = ChatPromptTemplate.from_messages(prompt_template_messages)
    # Passed as a template variable, so braces in user text are not re-interpreted.
    activity_suggestion = f"['Activity':{activity}, 'Description':{description}]"
    
    allowed_decisions = ("APPROVED", "REJECTED", "LACK OF INFORMATION")

    try:
        prompt = prompt_template.invoke({
            "activity_suggestion": activity_suggestion,
            "criteria": criteria,
        })

        result = model.invoke(prompt)

        result_dict = _extract_json_object(result.content)

        # A missing decision defaults to REJECTED (conservative); case and
        # surrounding whitespace are normalised before validation.
        raw_decision = result_dict.get("decision", "REJECTED")
        decision = str(raw_decision).strip().upper()
        if decision not in allowed_decisions:
            raise ValueError(f"Unexpected decision value: {raw_decision!r}")

        # `or` also covers an explicit JSON null, which would otherwise hand
        # None to callers that expect a string explanation.
        reason = str(result_dict.get("reason") or "No reason provided")

        is_valid = decision
        ai_explanation = reason if is_valid != "APPROVED" else "Activity approved"
        
        return is_valid, ai_explanation

    except Exception as e:
        print(f"Error processing activity: {str(e)}")
        # Return a string decision so the result matches the annotated type.
        return "ERROR", f"Error processing activity: {str(e)}"
    

from tqdm import tqdm  # For progress tracking

def load_checkpoint(checkpoint_file, decision_column='gemini_decision'):
    """
    Load a saved checkpoint and locate the last row that was actually processed.

    The checkpoint CSV holds every row of the dataset, including rows not yet
    processed, whose decision cell is empty. The resume point is therefore the
    last row with a non-null decision, not the last row of the file.

    Args:
        checkpoint_file (str): Path of the checkpoint CSV.
        decision_column (str): Column that holds the decisions.

    Returns:
        tuple: ``(checkpoint_df, last_processed_index)``. ``checkpoint_df`` is
        None when no usable checkpoint exists; ``last_processed_index`` is -1
        when nothing has been processed yet.
    """
    try:
        if not os.path.exists(checkpoint_file):
            return None, -1

        checkpoint_df = pd.read_csv(checkpoint_file)
        if decision_column not in checkpoint_df.columns:
            return checkpoint_df, -1

        # An all-empty column is read back as float; keep it as object so
        # string decisions can be written into it later.
        checkpoint_df[decision_column] = checkpoint_df[decision_column].astype(object)

        last_done = checkpoint_df[decision_column].last_valid_index()
        last_processed_index = -1 if last_done is None else int(last_done)
        return checkpoint_df, last_processed_index
    except Exception as e:
        print(f"Error loading checkpoint: {str(e)}")
        return None, -1

def process_dataframe(df, save_interval=5, sleep_interval=1.5, checkpoint_file='validation_checkpoint.csv'):
    """
    Process all rows in the DataFrame and add Gemini's decision with periodic saving.

    Rows are addressed with ``.loc`` by integer label, so the frame is
    expected to carry a default 0..n-1 index (as produced by ``pd.read_csv``).

    Args:
        df (pandas.DataFrame): DataFrame containing activities to validate;
            must have 'activity' and 'description' columns
        save_interval (int): Number of rows to process before saving; a falsy
            value disables the periodic checkpoint
        sleep_interval (float): Seconds to wait after each row
        checkpoint_file (str): File to save checkpoints

    Returns:
        pandas.DataFrame: DataFrame with added 'gemini_decision' column

    Raises:
        Exception: Any error raised while processing is re-raised after the
            current progress has been written to ``checkpoint_file``.
    """
    # Load checkpoint if exists
    checkpoint_df, last_processed_index = load_checkpoint(checkpoint_file)
    
    if checkpoint_df is not None:
        print(f"Resuming from checkpoint at index {last_processed_index}")
        result_df = checkpoint_df
        start_index = last_processed_index + 1
    else:
        result_df = df.copy()
        result_df['gemini_decision'] = None
        start_index = 0
    
    # Process each row with progress bar
    print("Processing activities...")
    # Pre-bind idx so the error message below is valid even when the failure
    # happens before the loop has processed a single row.
    idx = start_index - 1
    try:
        for idx in tqdm(range(start_index, len(result_df))):
            activity = result_df.loc[idx, 'activity']
            description = result_df.loc[idx, 'description']
            
            # Get validation result
            validation_result = is_valid_activity(activity, description)
            
            # The decision is the 'is_valid' field. It must not be derived
            # from the explanation: a reason containing ':' would otherwise be
            # truncated and stored as if it were the decision.
            decision = validation_result['is_valid']
            if not isinstance(decision, str) or not decision:
                # Failed calls may report a non-string value (e.g. False).
                decision = 'ERROR'
            result_df.loc[idx, 'gemini_decision'] = decision
            
            # Save checkpoint every save_interval rows
            if save_interval and (idx + 1) % save_interval == 0:
                print(f"\nSaving checkpoint at index {idx}")
                result_df.to_csv(checkpoint_file, index=False)
            time.sleep(sleep_interval)
                
        # Save final results
        result_df.to_csv('validated_results.csv', index=False)
        # Clean up checkpoint file after successful completion
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
            
    except Exception as e:
        print(f"\nError during processing at index {idx}: {str(e)}")
        print("Saving current progress...")
        result_df.to_csv(checkpoint_file, index=False)
        raise
    
    return result_df

try:
    # Read the balanced validation set that Gemini will label
    df1 = pd.read_csv('/kaggle/input/balanced-validate-data-csv/balanced_validate_data.csv')

    # Label every row, writing a checkpoint after each batch of 5 rows
    processed_df = process_dataframe(df=df1, save_interval=5)

    # Tally how often each decision was produced and show the counts
    decision_counts = processed_df['gemini_decision'].value_counts()
    print("\nDecision Summary:", decision_counts, sep="\n")

except Exception as e:
    print(f"Error processing file: {e}")



AAPPP
[REDACTED — a Google API key was printed here; revoke and rotate this key]
Processing activities...


 80%|████████  | 4/5 [00:10<00:02,  2.50s/it]


Saving checkpoint at index 4


100%|██████████| 5/5 [00:12<00:00,  2.55s/it]


Decision Summary:
gemini_decision
LACK OF INFORMATION    2
REJECTED               2
APPROVED               1
Name: count, dtype: int64





In [1]:
#!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# Approach 3: Fine-tuning xlm-roberta-base

In [2]:

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
import torch
import evaluate
import transformers
from datasets import ClassLabel
from sklearn.metrics import classification_report

# Set the transformers library's log level to INFO for the cells below.
transformers.logging.set_verbosity_info()

def prepare_data(df):
    """Build the labelled text dataset used for fine-tuning.

    The text is ``activity + ' [SEP] ' + description`` with missing values
    replaced by empty strings. The label comes from the
    'validation de la proposition' column and is encoded as
    0 = "N'est pas conforme", 1 = "validée", 2 = "manque de precision".
    Surrounding whitespace in the column name and in the label values is
    ignored. The input DataFrame is not modified.

    Args:
        df: DataFrame with 'activity', 'description' and the validation column.

    Returns:
        A ``datasets.Dataset`` with a 'text' column and a 'label' ClassLabel
        column (names: NonConforme, Validee, ManquePrecision).

    Raises:
        KeyError: If a required column is missing.
        ValueError: If the validation column holds a value outside the three
            known labels (including missing values).
    """
    label_map = {
        "N'est pas conforme": 0,
        'validée': 1,
        'manque de precision': 2
    }

    # The exported sheet names the column with a trailing space; match on the
    # stripped name so either spelling works.
    label_column = next(
        (col for col in df.columns if str(col).strip() == 'validation de la proposition'),
        None
    )
    if label_column is None:
        raise KeyError('validation de la proposition ')

    raw_labels = df[label_column].map(lambda v: v.strip() if isinstance(v, str) else v)

    # Work on a fresh frame so the caller's DataFrame is left untouched.
    frame = pd.DataFrame({
        'text': df['activity'].fillna('') + ' [SEP] ' + df['description'].fillna(''),
        'label': raw_labels.map(label_map),
    })

    unmapped = frame['label'].isna()
    if unmapped.any():
        # Fail here with the offending values instead of letting null labels
        # reach the trainer.
        bad_values = df.loc[unmapped, label_column].unique().tolist()
        raise ValueError(f"Unrecognized label value(s) in '{label_column}': {bad_values}")
    frame['label'] = frame['label'].astype('int64')

    dataset = Dataset.from_pandas(frame[['text', 'label']])
    class_label = ClassLabel(names=['NonConforme', 'Validee', 'ManquePrecision'])
    dataset = dataset.cast_column('label', class_label)

    return dataset

# Metric objects already loaded, keyed by metric name.
_LOADED_METRICS = {}


def _load_metric(name):
    """Return the `evaluate` metric ``name``, loading it at most once."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores; the predicted class is the argmax over axis 1.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # This function runs on every evaluation pass; reuse the metric objects
    # instead of reloading all four each time.
    accuracy = _load_metric("accuracy")
    precision = _load_metric("precision")
    recall = _load_metric("recall")
    f1 = _load_metric("f1")

    return {
        'accuracy': accuracy.compute(predictions=predictions, references=labels)['accuracy'],
        # zero_division=0 yields the same value as the default for a class
        # that is never predicted, without the repeated warning.
        'precision': precision.compute(
            predictions=predictions, references=labels, average='weighted', zero_division=0
        )['precision'],
        'recall': recall.compute(predictions=predictions, references=labels, average='weighted')['recall'],
        'f1': f1.compute(predictions=predictions, references=labels, average='weighted')['f1']
    }

def train_classifier(data_path, model_name="xlm-roberta-base", output_dir="./results"):
    """Fine-tune a multilingual sequence classifier and report test metrics.

    The data is split 80/10/10 into train/validation/test, stratified by
    label. The model is trained for 3 epochs, evaluated on the validation
    split after every epoch, and the best checkpoint is restored at the end.

    Args:
        data_path: CSV file with the columns expected by ``prepare_data``.
        model_name: Hugging Face checkpoint to fine-tune.
        output_dir: Directory for checkpoints.

    Returns:
        Tuple ``(trainer, test_results, class_report)``: the fitted Trainer,
        the metrics dict from evaluating the test split, and the text
        classification report for the test split.
    """
    df = pd.read_csv(data_path)
    dataset = prepare_data(df)

    # Take the class names from the dataset so the model head and the report
    # always agree with the label encoding.
    class_names = list(dataset.features['label'].names)
    
    train_testvalid = dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="label")
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42, stratify_by_column="label")
    
    train_dataset = train_testvalid['train']
    valid_dataset = test_valid['train']
    test_dataset = test_valid['test']
    
    print("\nDataset splits:")
    print(f"Training set size: {len(train_dataset)}")
    print(f"Validation set size: {len(valid_dataset)}")
    print(f"Test set size: {len(test_dataset)}")

    # Print class distributions in each split
    print("\nClass distributions:")
    print("Validation set:", np.bincount(valid_dataset['label']))
    print("Test set:", np.bincount(test_dataset['label']))
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(class_names),
        id2label=dict(enumerate(class_names)),
        label2id={name: idx for idx, name in enumerate(class_names)},
    )
    
    def tokenize_function(examples):
        # Truncate only. Padding is left to DataCollatorWithPadding, which
        # pads each batch to its longest sequence rather than to 512 tokens.
        return tokenizer(examples['text'], truncation=True, max_length=512)
    
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_strategy="steps",
        logging_steps=10,
        report_to="none"
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    
    trainer.train()
    
    # Evaluate on test set
    test_results = trainer.evaluate(tokenized_test)
    
    # Get predictions on test set
    predictions = trainer.predict(tokenized_test)
    predicted_labels = np.argmax(predictions.predictions, axis=1)
    true_labels = predictions.label_ids

    # Explicit `labels` keeps the report valid even if a class is absent from
    # both the test labels and the predictions.
    class_report = classification_report(
        true_labels,
        predicted_labels,
        labels=list(range(len(class_names))),
        target_names=class_names,
        digits=4,
        zero_division=0,
    )
    
    print("\nClassification Report:\n", class_report)
    
    return trainer, test_results, class_report

# NOTE: train_classifier returns the Trainer as its first value, so `model`
# here is the Trainer object (used below via .predict), not the bare network.
model, results, class_report = train_classifier("/kaggle/input/validate-datacsv/validate_data.csv")


Casting the dataset:   0%|          | 0/2253 [00:00<?, ? examples/s]


Dataset splits:
Training set size: 1802
Validation set size: 225
Test set size: 226

Class distributions:
Validation set: [179  32  14]
Test set: [180  32  14]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

loading file sentencepiece.bpe.model from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/sentencepiece.bpe.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_pr

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/model.safetensors
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

Map:   0%|          | 0/1802 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

PyTorch: setting up devices
  trainer = Trainer(
The following columns in the training set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,802
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 339
  Number of trainable parameters = 278,045,955


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6608,0.506201,0.826667,0.780686,0.826667,0.768992


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 225
  Batch size = 16


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/checkpoint-113
Configuration saved in ./results/checkpoint-113/config.json
Model weights saved in ./results/checkpoint-113/model.safetensors
tokenizer config file saved in ./results/checkpoint-113/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-113/special_tokens_map.json


KeyboardInterrupt: 

Casting the dataset:   0%|          | 0/2253 [00:00<?, ? examples/s]


Dataset splits:
Validation set size: 225
Test set size: 226

Class distributions:
Validation set: [179  32  14]
Test set: [180  32  14]


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading file sentencepiece.bpe.model from cache at /root/.cache/huggingface/hub/models--xlm-roberta-base/snapshots/e7363

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

In [20]:

from sklearn.metrics import classification_report

# Re-run inference on the test split and print a per-class report.
# `model` is the Trainer returned by train_classifier, hence .predict().
# NOTE(review): in the code shown, `tokenized_test` is only assigned inside
# train_classifier; this cell presumably relies on a notebook global created
# elsewhere — confirm before running it on a fresh kernel.
predictions = model.predict(tokenized_test)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = tokenized_test['label']

print(classification_report(true_labels, predicted_labels, target_names=['NonConforme', 'Validee', 'ManquePrecision']))


The following columns in the test set don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 226
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

    NonConforme       0.89      0.95      0.92       180
        Validee       0.71      0.75      0.73        32
ManquePrecision       0.00      0.00      0.00        14

       accuracy                           0.86       226
      macro avg       0.53      0.57      0.55       226
   weighted avg       0.81      0.86      0.84       226



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
