In [218]:
from datasets import load_dataset
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
import nltk

In [219]:
#Toutes les fontions

#tokeniser
def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

def normalize(token):
  #minuscule
  token = token.lower()
  #ponctuation
  token = re.sub(r'[^\w\s]', '', token)
  #retrait espaces
  token = re.sub(r'\s+', ' ', token).strip()
  return token

#enlever les stopword
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords.words("english")]
    return filtered_tokens

#lemmatisation
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

#faire un prompt
def make_prompt(text, few_shot=False):
    if few_shot:
        return f"""
Tu es un expert en des film et tu en a regardé plusieurs. Détermine si l'avis sur les film est positif ou négatif.
positif = 1 et négatif = 0
Les commentaire sont en anglais
Exemples :
- "This movie is a great" → 1
- "George P. Cosmatos\' "Rambo: First Blood Part II" is pure wish-fulfillment. The United States clearly didn\'t win the war in Vietnam. They caused damage to this country beyond the imaginable and this movie continues the fairy story of the oh-so innocent soldiers. The only bad guys were the leaders of the nation, who made this war happen. The character of Rambo is perfect to notice this. He is extremely patriotic, bemoans that US-Americans didn\'t appreciate and celebrate the achievements of the single soldier, but has nothing but distrust for leading officers and politicians. Like every film that defends the war (e.g. "We Were Soldiers") also this one avoids the need to give a comprehensible reason for the engagement in South Asia. And for that matter also the reason for every single US-American soldier that was there. Instead, Rambo gets to take revenge for the wounds of a whole nation. It would have been better to work on how to deal with the memories, rather than suppressing them." → 0
Texte : {text}
Avis :

"""
    else:
        return f"Détermine si le commentaire suivant est positif ou négatif : {text}\nGenre :"




In [220]:
# Chargement et mélange du dataset IMDb
ds_train = load_dataset("imdb", split="train")
ds_train = ds_train.shuffle(seed=42)

# Sélection de 2000 exemples mélangés
dataset = ds_train.select(range(2000))


# Vérification rapide
print(dataset)
print(dataset[0])

Dataset({
    features: ['text', 'label'],
    num_rows: 2000
})
{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}


Description du dataset:

Nous avons 2000 exemple et 2 features : text, label

In [221]:
#nombre de ligne
nb_ligne = dataset.num_rows

#les colonnes
col = dataset.features

print(f"Nombre de ligne: {nb_ligne}")
print(f"les colonnes: {col}")

Nombre de ligne: 2000
les colonnes: {'text': Value('string'), 'label': ClassLabel(names=['neg', 'pos'])}


In [222]:
#tranformation du dataset en dataframe
dataframe = pd.DataFrame(dataset)
dataframe.head(5)
dataframe.loc[2, "text"]

'George P. Cosmatos\' "Rambo: First Blood Part II" is pure wish-fulfillment. The United States clearly didn\'t win the war in Vietnam. They caused damage to this country beyond the imaginable and this movie continues the fairy story of the oh-so innocent soldiers. The only bad guys were the leaders of the nation, who made this war happen. The character of Rambo is perfect to notice this. He is extremely patriotic, bemoans that US-Americans didn\'t appreciate and celebrate the achievements of the single soldier, but has nothing but distrust for leading officers and politicians. Like every film that defends the war (e.g. "We Were Soldiers") also this one avoids the need to give a comprehensible reason for the engagement in South Asia. And for that matter also the reason for every single US-American soldier that was there. Instead, Rambo gets to take revenge for the wounds of a whole nation. It would have been better to work on how to deal with the memories, rather than suppressing them. 

In [223]:
#description des colonnes
dataframe.info()

# "label" sont des entier
# 0=négatif 1= positif
# "text" sont des textes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2000 non-null   object
 1   label   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.4+ KB


Prétraitement  des données

In [224]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


#tokeniser
dataframe['text'] = dataframe['text'].apply(tokenize)
dataframe['text'].head(1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text
0,"[There, is, no, relation, at, all, between, Fo..."


In [225]:
#normalize
dataframe['text'] = dataframe['text'].apply(lambda tokens: [normalize(token) for token in tokens])
dataframe['text'].head(1)

Unnamed: 0,text
0,"[there, is, no, relation, at, all, between, fo..."


In [226]:
#stopword
dataframe['text'] = dataframe['text'].apply(remove_stopwords)
dataframe['text'].head(1)

Unnamed: 0,text
0,"[relation, fortier, profiler, fact, police, se..."


In [227]:
#lemmatisation
dataframe['text'] = dataframe['text'].apply(lemmatize)
dataframe['text'].head(1)

Unnamed: 0,text
0,"[relation, fortier, profiler, fact, police, se..."


In [228]:
#Représentation TD-IDF
vectorizer = TfidfVectorizer(max_features=5000) #création d'un vecteur
corpus = dataframe["text"].astype(str) #application sur les datas
dataframe_tfidf = vectorizer.fit_transform(corpus)
print(dataframe_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [229]:
#division train/test
X_train, X_test, y_train, y_test = train_test_split(dataframe_tfidf, dataframe['label'], test_size=0.2, random_state=42)

In [230]:
#entrainement
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

Evalution

In [231]:
dataframe_evaluation = pd.DataFrame(columns=["nom_eval", "valeur"])
dataframe_evaluation

Unnamed: 0,nom_eval,valeur


In [232]:
#accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
dataframe_evaluation.loc[0] = {"nom_eval": "accuracy", "valeur": accuracy}
print(f"Accuracy: {accuracy}")
dataframe_evaluation

Accuracy: 0.845


Unnamed: 0,nom_eval,valeur
0,accuracy,0.845


In [233]:
#Recall
recall = recall_score(y_test, y_pred)
dataframe_evaluation.loc[1] = {"nom_eval": "recall", "valeur": recall}
print(f"Recall: {recall}")
dataframe_evaluation

Recall: 0.84


Unnamed: 0,nom_eval,valeur
0,accuracy,0.845
1,recall,0.84


In [234]:
#precision
precision = precision_score(y_test, y_pred)
dataframe_evaluation.loc[2] = {"nom_eval": "precision", "valeur": precision}
print(f"Precision: {precision}")
dataframe_evaluation

Precision: 0.8484848484848485


Unnamed: 0,nom_eval,valeur
0,accuracy,0.845
1,recall,0.84
2,precision,0.848485


In [235]:
#F1-score
f1_score = f1_score(y_test, y_pred)
dataframe_evaluation.loc[3] = {"nom_eval": "f1_score", "valeur": f1_score}
print(f"F1-score: {f1_score}")
dataframe_evaluation

F1-score: 0.8442211055276382


Unnamed: 0,nom_eval,valeur
0,accuracy,0.845
1,recall,0.84
2,precision,0.848485
3,f1_score,0.844221


In [236]:
dataframe_evaluation

Unnamed: 0,nom_eval,valeur
0,accuracy,0.845
1,recall,0.84
2,precision,0.848485
3,f1_score,0.844221


Quelques soit la métrique choisi, il nous montre à peu près le même précision sur les résultats

Le modèle a prédit juste 84,50% de l'ensemble du data (accuracy)
Le modèle a prédit 84,84% des valeur vrai et qui sont exacte(precision)
Le modèle a trouvé 84% de film apprécié dans le dataset et qui sont exacte(recall)


LLM

In [237]:
!pip install transformers accelerate datasets scikit-learn --quiet

In [238]:
from transformers import pipeline


In [240]:
#prompt
generate = pipeline("text-generation",
                    model="HuggingFaceH4/zephyr-7b-beta",
                    device_map="auto",
                    max_new_tokens=1000,
                    temperature=0.1
                    )

#classifier
def classify(text, few_shot=True):
    prompt = make_prompt(text)
    out = generate(prompt)[0]["generated_text"]
    avis = out.split("Avis:")[-1].strip().split("\n")[0] #extraction de la prédiction
    return avis

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Générer les prédictions
y_true, y_pred = [], []

for example in dataframe:
    y_true.append(example[1])
    pred = classify(example[1], few_shot=True)
    print(f" {pred}")
    y_pred.append(pred)

In [None]:
#evaluation
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1-score:", f1_score(y_true, y_pred, average="weighted"))
print("Recall:", recall_score(y_true, y_pred, average="weighted"))
print("Precision:", precision_score(y_true, y_pred, average="weighted"))