In [40]:
from PIL import Image
import pytesseract
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK stop-word corpus, then load the French stop-word list
# that is later passed to TfidfVectorizer(stop_words=...).
nltk.download('stopwords')
french_stop_words = stopwords.words('french')

def extract_text_from_image(image_path):
    """Extract text from an image using Tesseract OCR.

    The image is opened inside a ``with`` block so its file handle is released
    as soon as OCR finishes, instead of staying open while a whole folder of
    images is processed.

    Args:
        image_path: Path to the image file to read.

    Returns:
        The text recognised by Tesseract using the French and Arabic language
        data (``lang='fra+ara'``), or an empty string when the image could not
        be opened or OCR failed.
    """
    try:
        # The context manager closes the file even if image_to_string raises.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang='fra+ara')
    except Exception as e:
        # Deliberate best-effort: one unreadable image is reported and mapped
        # to "" so that a folder scan can continue with the remaining files.
        print(f"Error processing {image_path}: {e}")
        return ""

def extract_text_from_folder(folder_path):
    """Run OCR on every PNG/JPEG image directly inside ``folder_path``.

    Args:
        folder_path: Directory whose image files should be OCR'd.
            Sub-directories are not traversed.

    Returns:
        A list with one OCR text per image file, in sorted filename order.
        Files whose extension is not .png/.jpg/.jpeg are skipped.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    texts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dataset order identical from one run to the next.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as "SCAN.JPG" or
        # "page.PNG" are not silently left out of the dataset.
        if filename.lower().endswith(image_extensions):
            image_path = os.path.join(folder_path, filename)
            texts.append(extract_text_from_image(image_path))
    return texts

# Extract text from both folders
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
bulletin_texts = extract_text_from_folder("C:/Users/Eya/Documents/Esprit/PI/dataset/bulletin/")
other_texts = extract_text_from_folder("C:/Users/Eya/Documents/Esprit/PI/dataset/ordonnance/")

# Create a DataFrame
data = {
    "text": bulletin_texts + other_texts,
    "label": [1] * len(bulletin_texts) + [0] * len(other_texts)  # 1 = bulletin de soin, 0 = other
}
df = pd.DataFrame(data)

# Shuffle the dataset with a fixed seed: an unseeded shuffle changes the row
# order on every run, which makes the seeded split below non-reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the dataset into training and testing sets.
# stratify keeps the class ratio the same in both splits, which matters on a
# dataset this small where a random split can leave one class under-represented.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Convert text to TF-IDF features
# NOTE(review): the vectorizer is fitted on the whole training split before
# cross-validation, so the CV folds share vocabulary/IDF statistics. Wrapping
# both steps in a Pipeline would avoid that but changes what grid_search.predict expects.
vectorizer = TfidfVectorizer(max_features=1000, stop_words=french_stop_words)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model with hyperparameter tuning
model = LogisticRegression(class_weight='balanced')  # class_weight compensates for class imbalance
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Evaluate the model on the held-out split
y_pred = grid_search.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def classify_document(image_path):
    """OCR an image and label it with the trained classifier.

    Uses the notebook-level ``vectorizer`` and ``grid_search`` objects.

    Args:
        image_path: Path to the image file to classify.

    Returns:
        "bulletin de soin" when the model predicts label 1, "Ordonnance" otherwise.
    """
    ocr_text = extract_text_from_image(image_path)
    features = vectorizer.transform([ocr_text])
    predicted_label = grid_search.predict(features)[0]
    if predicted_label == 1:
        return "bulletin de soin"
    return "Ordonnance"

# Classify a single image and print the predicted label.
# NOTE(review): this file sits in the bulletin folder that was OCR'd to build
# the dataset above, so it is not an unseen document; the path is also
# absolute and machine-specific.
new_document_path = "C:/Users/Eya/Documents/Esprit/PI/dataset/bulletin/star.jpeg"
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         6

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

The document is classified as: bulletin de soin


In [1]:
from PIL import Image
import pytesseract
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK stop-word corpus, then load the French stop-word list
# that is later passed to TfidfVectorizer(stop_words=...).
nltk.download('stopwords')
french_stop_words = stopwords.words('french')

def extract_text_from_image(image_path):
    """Extract text from an image using Tesseract OCR.

    The image is opened inside a ``with`` block so its file handle is released
    as soon as OCR finishes, instead of staying open while a whole folder of
    images is processed.

    Args:
        image_path: Path to the image file to read.

    Returns:
        The text recognised by Tesseract using the French and Arabic language
        data (``lang='fra+ara'``), or an empty string when the image could not
        be opened or OCR failed.
    """
    try:
        # The context manager closes the file even if image_to_string raises.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang='fra+ara')
    except Exception as e:
        # Deliberate best-effort: one unreadable image is reported and mapped
        # to "" so that a folder scan can continue with the remaining files.
        print(f"Error processing {image_path}: {e}")
        return ""

def extract_text_from_folder(folder_path):
    """Run OCR on every PNG/JPEG image directly inside ``folder_path``.

    Args:
        folder_path: Directory whose image files should be OCR'd.
            Sub-directories are not traversed.

    Returns:
        A list with one OCR text per image file, in sorted filename order.
        Files whose extension is not .png/.jpg/.jpeg are skipped.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    texts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dataset order identical from one run to the next.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as "SCAN.JPG" or
        # "page.PNG" are not silently left out of the dataset.
        if filename.lower().endswith(image_extensions):
            image_path = os.path.join(folder_path, filename)
            texts.append(extract_text_from_image(image_path))
    return texts

# Extract text from both folders
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
bulletin_texts = extract_text_from_folder("C:/Users/Eya/Documents/Esprit/PI/dataset/bulletin/")
other_texts = extract_text_from_folder("C:/Users/Eya/Documents/Esprit/PI/dataset/ordonnance/")

# Create a DataFrame
data = {
    "text": bulletin_texts + other_texts,
    "label": [1] * len(bulletin_texts) + [0] * len(other_texts)  # 1 = bulletin de soin, 0 = other
}
df = pd.DataFrame(data)

# Shuffle the dataset with a fixed seed: an unseeded shuffle changes the row
# order on every run, which makes the seeded split below non-reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the dataset into training and testing sets.
# stratify keeps the class ratio the same in both splits, which matters on a
# dataset this small where a random split can leave one class under-represented.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Convert text to TF-IDF features
# NOTE(review): the vectorizer is fitted on the whole training split before
# cross-validation, so the CV folds share vocabulary/IDF statistics. Wrapping
# both steps in a Pipeline would avoid that but changes what grid_search.predict expects.
vectorizer = TfidfVectorizer(max_features=1000, stop_words=french_stop_words)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model with hyperparameter tuning
model = LogisticRegression(class_weight='balanced')  # class_weight compensates for class imbalance
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Evaluate the model on the held-out split
y_pred = grid_search.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def classify_document(image_path, confidence_threshold=0.55):
    """Classify a document image as 'bulletin de soin', 'ordonnance', or 'unrecognized type'.

    Uses the notebook-level ``vectorizer`` and ``grid_search`` objects.

    Args:
        image_path (str): Path to the image file.
        confidence_threshold (float): Minimum probability the winning class
            must reach for the prediction to be reported. With two classes the
            winning probability is never below 0.5, so values at or below 0.5
            disable the check.

    Returns:
        str: "bulletin de soin" or "ordonnance" when the model is confident
        enough, otherwise "unrecognized type". "unrecognized type" is also
        returned when OCR yields no text at all.

    Note:
        When cells are run top to bottom, this definition replaces the
        two-label ``classify_document`` defined in the earlier cell.
    """
    text = extract_text_from_image(image_path)

    # extract_text_from_image returns "" when the image cannot be read. An
    # empty or whitespace-only text becomes an all-zero TF-IDF vector, so the
    # model would answer from its intercept alone; report it as unrecognized.
    if not text.strip():
        return "unrecognized type"

    text_tfidf = vectorizer.transform([text])

    # A single predict_proba call gives both the confidence and the label,
    # so the reported class always matches the probability that was checked.
    probabilities = grid_search.predict_proba(text_tfidf)[0]
    best_index = probabilities.argmax()

    # Reject predictions the model is not confident about
    if probabilities[best_index] < confidence_threshold:
        return "unrecognized type"

    # classes_ maps probability columns back to the training labels (1 / 0)
    predicted_label = grid_search.classes_[best_index]
    return "bulletin de soin" if predicted_label == 1 else "ordonnance"

# Classify a single image from outside the dataset folders and print the label.
# NOTE(review): absolute, machine-specific path.
new_document_path = "C:/Users/Eya/Downloads/5b9584a0-affa-4142-9b5c-7c310de48358 (1).png"  # Replace with the path to your image
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

  from pandas.core import (
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         6

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

The document is classified as: ordonnance


In [32]:
# Classify a single image from outside the dataset folders and print the label.
# NOTE(review): absolute, machine-specific path; relies on classify_document
# already being defined in the kernel.
new_document_path = "C:/Users/Eya/Downloads/btest.jpg"
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

The document is classified as: bulletin de soin


In [33]:
# Classify a single image from outside the dataset folders and print the label.
# NOTE(review): absolute, machine-specific path; relies on classify_document
# already being defined in the kernel.
new_document_path = "C:/Users/Eya/Downloads/bb3.png"
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

The document is classified as: bulletin de soin


In [34]:
# Classify a single image from outside the dataset folders and print the label.
# NOTE(review): absolute, machine-specific path; relies on classify_document
# already being defined in the kernel.
new_document_path = "C:/Users/Eya/Downloads/OIP.jpg"
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

The document is classified as: ordonnance


In [36]:
# Classify a single image from outside the dataset folders and print the label.
# NOTE(review): absolute, machine-specific path; relies on classify_document
# already being defined in the kernel.
new_document_path = "C:/Users/Eya/Downloads/IO.jpg"
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

The document is classified as: ordonnance


In [37]:
# Classify a single image from outside the dataset folders and print the label.
# NOTE(review): absolute, machine-specific path; relies on classify_document
# already being defined in the kernel.
new_document_path = "C:/Users/Eya/Downloads/single.jpg"
result = classify_document(new_document_path)
print(f"The document is classified as: {result}")

The document is classified as: bulletin de soin
