In [1]:
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Tokenization
# word_tokenize relies on the Punkt sentence models. NLTK 3.9 and later load
# them from the 'punkt_tab' package, while older releases read the 'punkt'
# package, so both are fetched to keep this cell working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
text = "This is a sample sentence."
tokens = word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Stop-word removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Tokens are lower-cased only for the membership check, so the surviving
# tokens keep their original casing.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)

['sample', 'sentence', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Lemmatization resources: download the WordNet corpus.
# NOTE(review): no stemming is performed anywhere in the visible cells; this
# cell only fetches data (presumably for WordNetLemmatizer, used later).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Text preprocessing function
@lru_cache(maxsize=1)
def _english_stop_words():
    """Load the NLTK English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Build the WordNetLemmatizer once so repeated calls reuse the same instance."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a raw text into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in the NLTK English stop-word
    list are dropped, and each remaining token is passed through
    ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    # The stop-word set and the lemmatizer do not depend on `text`, so they
    # are cached instead of being rebuilt on every call.
    stop_words = _english_stop_words()
    lemmatizer = _shared_lemmatizer()
    tokens = word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    )

In [7]:
# Example dataset: each entry pairs a raw text with its category label,
# two examples per category.
labelled_examples = [
    ("Energy efficiency tips for home", "Energy Efficiency Tips"),
    ("Best renewable energy options available", "Renewable Energy Options"),
    ("I need technical support for my solar panels", "Technical Support"),
    ("Details about upcoming energy conference", "Event Information"),
    ("Information on energy saving programs and rebates", "Energy Saving Programs"),
    ("How to reduce energy consumption in winter", "Energy Efficiency Tips"),
    ("Benefits of using wind energy", "Renewable Energy Options"),
    ("Problems with my energy meter, need support", "Technical Support"),
    ("Register for the next energy workshop", "Event Information"),
    ("Apply for energy saving discounts", "Energy Saving Programs"),
]

# Split the pairs into the two parallel lists used by the later cells.
texts = [example_text for example_text, _ in labelled_examples]
labels = [example_label for _, example_label in labelled_examples]

In [8]:
# Preprocess every text.
# NOTE: `texts` is rebound to the preprocessed strings, so re-running this
# cell applies preprocess_text to already-preprocessed input.
texts = list(map(preprocess_text, texts))

In [9]:
# TF-IDF + SVM pipeline
# probability=True makes SVC fit an additional, internally cross-validated
# probability model whose data shuffling is random; fixing random_state keeps
# the probability estimates reproducible across kernel restarts.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(probability=True, random_state=42)),
])

In [10]:
# Train the model on the preprocessed texts and their labels
pipeline.fit(X=texts, y=labels)

In [11]:
# Evaluation
# NOTE: predictions are made on the same texts the pipeline was fitted on, so
# the figures below are training-set metrics; they say nothing about how the
# model performs on unseen text.
y_pred = pipeline.predict(texts)
accuracy = accuracy_score(y_true=labels, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_true=labels, y_pred=y_pred))

Accuracy: 100.00%
                          precision    recall  f1-score   support

  Energy Efficiency Tips       1.00      1.00      1.00         2
  Energy Saving Programs       1.00      1.00      1.00         2
       Event Information       1.00      1.00      1.00         2
Renewable Energy Options       1.00      1.00      1.00         2
       Technical Support       1.00      1.00      1.00         2

                accuracy                           1.00        10
               macro avg       1.00      1.00      1.00        10
            weighted avg       1.00      1.00      1.00        10

