<a href="https://colab.research.google.com/github/BAHAJ-UH1/DetectionContours/blob/master/JKSUCIS__1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers torch nltk



In [2]:
import torch
from torch import nn
from transformers import XLMRobertaModel, XLMRobertaTokenizer
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn

# Download the NLTK resources required below.
# Resource identifiers changed between NLTK releases (newer releases look up
# 'punkt_tab' and 'averaged_perceptron_tagger_eng'), so both the legacy and the
# current names are requested. With its default arguments nltk.download()
# reports an unknown identifier instead of raising, so the extra names are
# harmless on releases that do not know them.
for resource in (
    'averaged_perceptron_tagger',      # POS tagger, legacy identifier
    'averaged_perceptron_tagger_eng',  # POS tagger, current identifier
    'punkt',                           # tokenizer models, legacy identifier
    'punkt_tab',                       # tokenizer models, current identifier
    'wordnet',                         # dictionary used by the lemmatizer
):
    nltk.download(resource)

# Lemmatizer instance used by the tokenization/lemmatization step
lemmatizer = nltk.WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """
    Map a Treebank POS tag to the WordNet POS constant expected by the lemmatizer.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.
    """
    # (leading letter, WordNet constant) pairs, checked in order
    prefix_table = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for leading_letter, wordnet_pos in prefix_table:
        if treebank_tag.startswith(leading_letter):
            return wordnet_pos
    # Unrecognised tag: default to noun
    return wn.NOUN

# Example sentence (French)
sentence = "Le service du restaurant était excellent mais la nourriture était médiocre."

# Tokenization with NLTK. The sentence is French, so the French Punkt model is
# selected for sentence splitting instead of the default English one.
tokens = word_tokenize(sentence, language='french')

# NOTE(review): pos_tag() is called with its default tagger and the lemmatizer
# is WordNet-based; both target English. On this French sentence the recorded
# output shows almost every token tagged NN/NNP and every lemma identical to
# its token, so these two results should not be relied on. A French-capable
# tagger/lemmatizer is needed for meaningful values (not added here to avoid
# introducing a new dependency).
pos_tags = pos_tag(tokens)
lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]

# Show tokens, POS tags and lemmas
print(f"Tokens: {tokens}")
print(f"POS Tags: {pos_tags}")
print(f"Lemmas: {lemmas}")

# Load the XLM-RoBERTa tokenizer and model (downloads the weights on first use)
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base')

# Encode the raw sentence for XLM-RoBERTa. This uses the model's own subword
# tokenizer, independent of the NLTK tokens above.
inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)

# Extract contextual embeddings; no_grad() because XLM-RoBERTa is only used
# as a fixed feature extractor here.
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state

# BiLSTM layer definition
class BiLSTMModel(nn.Module):
    """Bidirectional LSTM followed by a linear projection applied at every step.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: hidden size of the LSTM, per direction.
        output_dim: size of the projected output vectors.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # The LSTM output concatenates both directions, hence 2 * hidden_dim.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Run `x` through the LSTM and project each step's output with `fc`."""
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is used.
        sequence_states = self.lstm(x)[0]
        return self.fc(sequence_states)

# Embedding and model dimensions
input_dim = embeddings.shape[-1]  # size of the last axis of the XLM-RoBERTa output
hidden_dim = 128
output_dim = 64  # for example, to reduce the dimensionality

# The BiLSTM weights are randomly initialised; fix the seed first so the
# projected embeddings are the same on every Restart & Run All.
torch.manual_seed(42)

# Instantiate the BiLSTM model
bilstm_model = BiLSTMModel(input_dim, hidden_dim, output_dim)

# Pass the embeddings through the BiLSTM model
bilstm_embeddings = bilstm_model(embeddings)

# Show the shape of the embeddings after the BiLSTM
print(bilstm_embeddings.shape)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Tokens: ['Le', 'service', 'du', 'restaurant', 'était', 'excellent', 'mais', 'la', 'nourriture', 'était', 'médiocre', '.']
POS Tags: [('Le', 'NNP'), ('service', 'NN'), ('du', 'NN'), ('restaurant', 'NN'), ('était', 'NNP'), ('excellent', 'NN'), ('mais', 'NN'), ('la', 'NN'), ('nourriture', 'NN'), ('était', 'NNP'), ('médiocre', 'NN'), ('.', '.')]
Lemmas: ['Le', 'service', 'du', 'restaurant', 'était', 'excellent', 'mais', 'la', 'nourriture', 'était', 'médiocre', '.']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

torch.Size([1, 15, 64])
