## <font color="#F0000"> Text summarization avec nltk

### <font color="#48dbfb"> Import les packages

In [24]:
# Importer packages
import base64
import streamlit as st
from PIL import ImageOps, Image
import os
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, BatchNormalization, Flatten, Dense
from keras.preprocessing.image import ImageDataGenerator
from bs4 import BeautifulSoup as bs
from requests import get
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### <font color="#48dbfb">Scraper des articles sur Wikipedia

In [3]:
# Importer des données sur la cuisine iraniéne
resp = get('https://en.wikipedia.org/wiki/Iranian_cuisine')
# resp = get('https://fr.wikipedia.org/wiki/Cuisine_iranienne')

article_soup = bs(resp.text)

paragraphs = article_soup.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [4]:
# Afficher article_text
article_text

'\nIranian cuisine (Persian: آشپزی ایرانی, romanized:\xa0Āshpazī Irānī) are the culinary traditions of Iran. Due to the historically common usage of the term "Persia" to refer to Iran in the Western world,[2][3][4] it is alternatively known as Persian cuisine, despite Persians being only one of a multitude of Iranian ethnic groups who have contributed to Iran\'s culinary traditions.[a]\nThe cuisine of Iran has made extensive contact throughout its history with the cuisines of its neighbouring regions, including Caucasian cuisine, Central Asian cuisine, Greek cuisine, Levantine cuisine, Mesopotamian cuisine, Russian cuisine and Turkish cuisine.[6][7][8][9] Aspects of Iranian cuisine have also been significantly adopted by Indian cuisine and Pakistani cuisine through various historical Persianate sultanates that flourished during Muslim rule on the Indian subcontinent, with the most notable and impactful of these polities being the Mughal Empire.[10][11][12]\nTypical Iranian main dishes 

### <font color="#48dbfb"> Nettoyer les données

In [5]:
# Supprimer [\w]*
article_text = re.sub(r'[[\w]*]', ' ', article_text)
# Supprimer les chaines de \xa0, \u200c
article_text = re.sub(r'\xa0|\u200c', ' ', article_text)
# Remplacer les espaces multiples par l'espace simple
article_text = re.sub(r'/s+', ' ', article_text)
# Remplacer l'espace en debut et fin de corpus
article_text = re.sub(r'^\s|\s$', '', article_text)

article_text = re.sub(r'\n', '', article_text)

  article_text = re.sub(r'[[\w]*]', ' ', article_text)


In [6]:
# Afficher article_text
article_text

'Iranian cuisine (Persian: آشپزی ایرانی, romanized: Āshpazī Irānī) are the culinary traditions of Iran. Due to the historically common usage of the term "Persia" to refer to Iran in the Western world,    it is alternatively known as Persian cuisine, despite Persians being only one of a multitude of Iranian ethnic groups who have contributed to Iran\'s culinary traditions. The cuisine of Iran has made extensive contact throughout its history with the cuisines of its neighbouring regions, including Caucasian cuisine, Central Asian cuisine, Greek cuisine, Levantine cuisine, Mesopotamian cuisine, Russian cuisine and Turkish cuisine.     Aspects of Iranian cuisine have also been significantly adopted by Indian cuisine and Pakistani cuisine through various historical Persianate sultanates that flourished during Muslim rule on the Indian subcontinent, with the most notable and impactful of these polities being the Mughal Empire.   Typical Iranian main dishes are combinations of rice with meat

### <font color="#48dbfb"> Text Summarization

#### <font color="#fd79a8">Tokeniser en phrase

In [7]:
# Tokeniser en phrase
sentence_list = nltk.sent_tokenize(article_text)

#### <font color="#fd79a8"> Calculer les frequences des mots

In [8]:
# Stopwords
stopwords = nltk.corpus.stopwords.words('english')
# Dictionnaire de fréquences des mots
word_frequencies = {}
for word in nltk.word_tokenize(article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [9]:
# Afficher word_frequencies
word_frequencies

{'Iranian': 48,
 'cuisine': 23,
 '(': 35,
 'Persian': 10,
 ':': 8,
 'آشپزی': 1,
 'ایرانی': 1,
 ',': 234,
 'romanized': 1,
 'Āshpazī': 1,
 'Irānī': 1,
 ')': 35,
 'culinary': 3,
 'traditions': 2,
 'Iran': 32,
 '.': 97,
 'Due': 1,
 'historically': 2,
 'common': 5,
 'usage': 2,
 'term': 2,
 '``': 12,
 'Persia': 1,
 "''": 10,
 'refer': 1,
 'Western': 1,
 'world': 4,
 'alternatively': 1,
 'known': 3,
 'despite': 1,
 'Persians': 1,
 'one': 9,
 'multitude': 1,
 'ethnic': 1,
 'groups': 2,
 'contributed': 1,
 "'s": 11,
 'The': 21,
 'made': 14,
 'extensive': 1,
 'contact': 1,
 'throughout': 1,
 'history': 2,
 'cuisines': 1,
 'neighbouring': 1,
 'regions': 3,
 'including': 5,
 'Caucasian': 1,
 'Central': 1,
 'Asian': 1,
 'Greek': 1,
 'Levantine': 1,
 'Mesopotamian': 1,
 'Russian': 1,
 'Turkish': 3,
 'Aspects': 1,
 'also': 23,
 'significantly': 2,
 'adopted': 2,
 'Indian': 2,
 'Pakistani': 1,
 'various': 15,
 'historical': 1,
 'Persianate': 1,
 'sultanates': 1,
 'flourished': 1,
 'Muslim': 2,
 'rul

#### <font color="#fd79a8"> Fréquence pondérée de chaque mots

In [10]:
# Fréquence maximale
maximum_frequency = max(word_frequencies.values())
# Calculer la fréquence pondérée
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / maximum_frequency

In [11]:
# afficher word_frequencies
word_frequencies

{'Iranian': 0.20512820512820512,
 'cuisine': 0.09829059829059829,
 '(': 0.14957264957264957,
 'Persian': 0.042735042735042736,
 ':': 0.03418803418803419,
 'آشپزی': 0.004273504273504274,
 'ایرانی': 0.004273504273504274,
 ',': 1.0,
 'romanized': 0.004273504273504274,
 'Āshpazī': 0.004273504273504274,
 'Irānī': 0.004273504273504274,
 ')': 0.14957264957264957,
 'culinary': 0.01282051282051282,
 'traditions': 0.008547008547008548,
 'Iran': 0.13675213675213677,
 '.': 0.41452991452991456,
 'Due': 0.004273504273504274,
 'historically': 0.008547008547008548,
 'common': 0.021367521367521368,
 'usage': 0.008547008547008548,
 'term': 0.008547008547008548,
 '``': 0.05128205128205128,
 'Persia': 0.004273504273504274,
 "''": 0.042735042735042736,
 'refer': 0.004273504273504274,
 'Western': 0.004273504273504274,
 'world': 0.017094017094017096,
 'alternatively': 0.004273504273504274,
 'known': 0.01282051282051282,
 'despite': 0.004273504273504274,
 'Persians': 0.004273504273504274,
 'one': 0.0384615384

#### <font color="#fd79a8"> Score des phrases

In [12]:
# Liste des scores de chaque phrase
sentence_scores = {}
# Calculer le score de chaque phrase
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [13]:
# Afficher sentences scores
sentence_scores

{'Iranian cuisine (Persian: آشپزی ایرانی, romanized: Āshpazī Irānī) are the culinary traditions of Iran.': 1.9145299145299144,
 'Typical Iranian main dishes are combinations of rice with meat, vegetables and nuts.': 1.7777777777777777,
 'Herbs are frequently used, along with fruits such as plums, pomegranates, quince, prunes, apricots and raisins.': 5.576923076923077,
 'The names of many of the Iranian dishes and culinary terms that have been translated can be seen in Arabic language books.': 0.5982905982905984,
 'The book originally contained 26 chapters, listed by the author in his introduction, but chapters 23 through 26 are missing from the surviving manuscript.': 2.538461538461538,
 'The recipes include measurements for ingredients—often detailed directions for the preparation of dishes, including the types of utensils and pots to be used—and instructions for decorating and serving them.': 1.7008547008547006,
 'In general, the ingredients and their combinations in various recipes 

#### <font color="#fd79a8"> Résumé de l'article

In [14]:
def summarizer(text, num_sentences=10):

# Ordonner les phrases par pondération et recupérer les 10 premières phrases
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=False)[:10]
    # regrouper ensemble les phrases qui ont les poids les plus élévés
    summary = ' '.join(summary_sentences)

    # Afficher le résumé
    return summary_sentences

['The measurements and directions are not as detailed as in the earlier book.',
 'Iranians traditionally put a lump of sugar cube in the mouth before drinking the tea.',
 'The food of southern Iran is typically spicy.',
 'The dolma is then simmered in meat broth or ascallions sweet-and-sour sauce.',
 'Other contemporary cooks and their specialties are also mentioned.',
 'Mahyawa is a tangy sauce made of fermented fish in this region.',
 'Rice is primarily cultivated in the region of Makran.',
 'The names of many of the Iranian dishes and culinary terms that have been translated can be seen in Arabic language books.',
 'Kateh is a method of cooking rice that originates from this region.',
 "Meat and dates are the main ingredients in the cuisine of Iran's southeastern region of Baluchistan."]

## <font color="#F0000"> Text summarization avec sumy

In [15]:
# Importer les packages
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

# Créer un text parser utilisant de tokenisation
parser = PlaintextParser.from_string(article_text, Tokenizer('english'))

In [16]:
parser.document.sentences

(<Sentence: Iranian cuisine (Persian: آشپزی ایرانی, romanized: Āshpazī Irānī) are the culinary traditions of Iran.>,
 <Sentence: Due to the historically common usage of the term "Persia" to refer to Iran in the Western world,    it is alternatively known as Persian cuisine, despite Persians being only one of a multitude of Iranian ethnic groups who have contributed to Iran's culinary traditions.>,
 <Sentence: The cuisine of Iran has made extensive contact throughout its history with the cuisines of its neighbouring regions, including Caucasian cuisine, Central Asian cuisine, Greek cuisine, Levantine cuisine, Mesopotamian cuisine, Russian cuisine and Turkish cuisine.>,
 <Sentence: Aspects of Iranian cuisine have also been significantly adopted by Indian cuisine and Pakistani cuisine through various historical Persianate sultanates that flourished during Muslim rule on the Indian subcontinent, with the most notable and impactful of these polities being the Mughal Empire.>,
 <Sentence: Ty

### <font color="#48dbfb">  TextRankSummarizer

In [21]:
# Importer le TextRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarizer_textrank(text, num_sentences=5):
# Initialiser le modèle
    summarizer_textrank = TextRankSummarizer()

    # Summariser en 5 phrases
    summary = summarizer_textrank(parser.document, 5)

    # Regrouper les phrases
    text_summary = ""
    for sentence in summary:
        text_summary += str(sentence)

    # Afficher le summary
    return text_summary

### <font color="#48dbfb">  LexRankSummarizer

In [22]:
# Importer LexRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def summarizer_lexrank(text, num_sentences=5):
# Initialiser le modèle
    summarizer_lexrank = LexRankSummarizer()

    # Summariser en 5 phrases
    summary = summarizer_lexrank(parser.document, 5)

    # Regrouper les phrases
    text_summary = ""
    for sentence in summary:
        text_summary += str(sentence)
        
    # Afficher le summary
    return text_summary

### <font color="#48dbfb"> LsaSummarizer

In [23]:
# Importer LsaSummarizer
from sumy.summarizers.lsa import LsaSummarizer

def summarizer_lsa(text, num_sentences=5):
# Initialiser le modèle
    summarizer_lsa = LsaSummarizer()

    # Summariser en 5 phrases
    summary = summarizer_lsa(parser.document, 5)

    # Regrouper les phrases
    text_summary = ""
    for sentence in summary:
        text_summary += str(sentence)

    # Afficher le summary
    return text_summary

### <font color="#48dbfb"> Deploiement sur Streamlite

In [None]:
def set_background(image_file):
    with open(image_file, "Slide01") as f:
        img_data = f.read()

    b64_encoded = base64.b64encode(img_data).decode()
    style = f"""
        <style>
        .stApp {{
            background-image: url(data:image/jpg;base64,{b64_encoded});
            background-size: cover;
        }}
        </style>
    """
    st.markdown(style, unsafe_allow_html=True)


st.title("Application de Summarization")

# Ajoutez des composants Streamlit pour interagir avec l'utilisateur
texte_utilisateur = st.text_area("Entrez le texte à résumer", "")

if st.button("Résumer Modèle TextSummarizer"):
    resultat = summarizer_textrank(texte_utilisateur)
    st.write("Résumé (TextSummarizer):", resultat)

if st.button("Résumer Modèle TextRankSummarizer"):
    resultat = summarizer_textrank(texte_utilisateur)
    st.write("Résumé (TextRankSummarizer):", resultat)

if st.button("Résumer Modèle LexRankSummarizer"):
    resultat = summarizer_lexrank(texte_utilisateur)
    st.write("Résumé (LexRankSummarizer):", resultat)

if st.button("Résumer Modèle LsaSummarizer"):
    resultat = summarizer_lsa(texte_utilisateur)
    st.write("Résumé (LsaSummarizer):", resultat)