In [1]:
!pip install torch transformers sentence_transformers spacy>=3.5 gradio
!python -m spacy download fr_core_news_md

2023-02-24 10:15:27.959223: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-24 10:15:28.967536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-02-24 10:15:28.967658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

# import all the necessary dependent files

In [3]:
import requests
from pathlib import Path

if Path('summarizer.py').is_file():
    print('already exists')
else:
    print('not existed yet')
    request = requests.get("https://huggingface.co/spaces/Carlosito16/HXM-summarization/raw/main/app.py")
    with open('summarizer.py', "wb") as f:
        f.write(request.content)


if Path('helper_function.py').is_file():
    print('already exists')
else:
    print('not existed yet')
    request = requests.get("https://huggingface.co/spaces/Carlosito16/HXM-summarization/raw/main/helper_function.py")
    with open('helper_function.py', "wb") as f:
        f.write(request.content)


if Path('french_stopword.txt').is_file():
    print('already exists')
else:
    print('not existed yet')
    request = requests.get("https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt")
    with open('french_stopword.txt', "wb") as f:
        f.write(request.content)

already exists
already exists
already exists


In [4]:
import torch
from transformers import RobertaTokenizerFast, EncoderDecoderModel, AutoTokenizer, AutoModelForSeq2SeqLM

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import requests
from pathlib import Path


In [5]:
class keyWordExtractor():
    
    def __init__(self, 
                 article_text,
                 similarity_model,
                 n_gram = 1,
                 top_n = 3,
                 french_stopwords = None,
                 ner= None,
                 ):
        self.article_text = article_text
        self.french_stopwords = french_stopwords
        self.candidates = self.count_vectorizer(n_gram)
        self.noun_candidates, self.proper_noun_candidates = self.slice_only_noun_token(ner, self.candidates)
        self.top_n_keywords = self.top_n_extractor(similarity_model, top_n)
    
    def count_vectorizer(self, n_gram):
        n_gram_range = (n_gram, n_gram)
        # Extract candidate words/phrases
        count = CountVectorizer(ngram_range=n_gram_range,
                        stop_words = self.french_stopwords).fit([self.article_text]) #Main change
        candidates = count.get_feature_names_out()

        return candidates

    def slice_only_noun_token(self, ner, token_list):
        """
        Given the tokenized list, this function returns only the "NOUN" token
            Args:
                ner (spacy): The NER class to detect the `token.pos_`
                token_list (list): List of token from the full article
            Returns:
                slice_list (list): List of token containing only "NOUN" part of speech
        """

        noun_slice_list = []
        proper_noun_slice_list = []
        for word_idx in range(len(token_list)):
            doc = ner(token_list[word_idx])

            for token in doc:
                if token.pos_ == 'NOUN':
                    noun_slice_list.append(token.text)
                elif token.pos_ == 'PROPN':
                    proper_noun_slice_list.append(token.text)

        return noun_slice_list, proper_noun_slice_list

    def top_n_extractor(self, model, top_n):
        doc_embedding = model.encode([self.article_text])
        candidate_embeddings = model.encode(self.noun_candidates)
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [self.noun_candidates[index] for index in distances.argsort()[0][-top_n:]]

        return keywords

def clear_input():
    return ("", "")

       
def camembert_generate_summary(article_text):
   inputs = cmb_tokenizer([article_text], padding="max_length", truncation=True,
                          max_length=50,
                          return_tensors="pt")
   input_ids = inputs.input_ids.to(device)
   attention_mask = inputs.attention_mask.to(device)
   output = cmb_model.generate(input_ids, attention_mask=attention_mask, max_length = 50)
   return cmb_tokenizer.decode(output[0], skip_special_tokens=True)

    
def t5_generate_summary(article_text):
    input_ids = t5_tokenizer(
    [WHITESPACE_HANDLER(article_text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512)["input_ids"]
    
    output_ids = t5_model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]
    
    output = t5_tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
    )

    return output

    
def summarizer(dropdown_model, article_text):
    """
    Ruturs a summarized version from the full article based on the selected pretrained-model
    """

    if dropdown_model == 'camembert':
        summary = camembert_generate_summary(article_text)

    elif dropdown_model == 'T5':
        summary = t5_generate_summary(article_text)

    return summary
    
def extract_top_3(article):
    nlp = spacy.load("fr_core_news_md")
    # model = SentenceTransformer("dangvantuan/sentence-camembert-large") #

    a= keyWordExtractor(article,
                        n_gram = 1, 
                        top_n = 3,
                        ner = nlp,
                        similarity_model = model)
    keyword = ", ".join(a.top_n_keywords) #to return ['a' , 'b'] >> "a, b"
    proper_nonuns = ", ".join(a.proper_noun_candidates)

    return keyword, proper_nonuns


def runall(dropdown_model, article_text):
    summary = summarizer(dropdown_model, article_text)
    keywords, proper_n = extract_top_3(article_text)

    return summary, keywords, proper_n

---

# Let's stall calling the model

In [6]:
#set the device agnostics code
device = 'cuda' if torch.cuda.is_available() else 'cpu'
test_article ="""\"Un nuage de fumée juste après l’explosion, le 1er juin 2019. Une déflagration dans une importante usine d’explosifs du centre de la Russie a fait au moins 79 blessés samedi 1er juin. L’explosion a eu lieu dans l’usine Kristall à Dzerzhinsk, une ville située à environ 400 kilomètres à l’est de Moscou, dans la région de Nijni-Novgorod. « Il y a eu une explosion technique dans l’un des ateliers, suivie d’un incendie qui s’est propagé sur une centaine de mètres carrés », a expliqué un porte-parole des services d’urgence. Des images circulant sur les réseaux sociaux montraient un énorme nuage de fumée après l’explosion. Cinq bâtiments de l’usine et près de 180 bâtiments résidentiels ont été endommagés par l’explosion, selon les autorités municipales. Une enquête pour de potentielles violations des normes de sécurité a été ouverte. Fragments de shrapnel Les blessés ont été soignés après avoir été atteints par des fragments issus de l’explosion, a précisé une porte-parole des autorités sanitaires citée par Interfax. « Nous parlons de blessures par shrapnel d’une gravité moyenne et modérée », a-t-elle précisé. Selon des représentants de Kristall, cinq personnes travaillaient dans la zone où s’est produite l’explosion. Elles ont pu être évacuées en sécurité. Les pompiers locaux ont rapporté n’avoir aucune information sur des personnes qui se trouveraient encore dans l’usine."""


cmb_ckpt = 'mrm8488/camembert2camembert_shared-finetuned-french-summarization'
cmb_tokenizer = RobertaTokenizerFast.from_pretrained(cmb_ckpt)
cmb_model = EncoderDecoderModel.from_pretrained(cmb_ckpt).to(device)


Downloading (…)okenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/559M [00:00<?, ?B/s]

The following encoder weights were not tied to the decoder ['roberta/pooler']
The following encoder weights were not tied to the decoder ['roberta/pooler']


In [7]:
print(device)

cuda


In [None]:
print(summarizer("camembert", test_article))

Un nuage de fumée juste après l’explosion, le 1er juin 2018, d’une explosion dans une usine d’explosifs russe a fait au moins 79 blessés.


## Also downlaod the spaCy

In [8]:
nlp = spacy.load("fr_core_news_md")
model =  SentenceTransformer("dangvantuan/sentence-camembert-large")

print(extract_top_3(test_article))

Downloading (…)7a9a1/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)0ad4b7a9a1/README.md:   0%|          | 0.00/4.42k [00:00<?, ?B/s]

Downloading (…)d4b7a9a1/config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]



('explosifs, explosion, déflagration', 'moscou, novgorod, russie')


In [None]:
summary, keywords, proper_n = runall("camembert",  test_article)

In [None]:
proper_n

'moscou, novgorod, russie'

# Download that manuallly labeled we have aorund 270 rows

In [9]:
df = pd.read_csv("selected_df - All.csv",
                 header = [1])
df.head()

Unnamed: 0,index,titles,ratings,reviewer,reviews,english,dates,replies,reply_dates,clean_review,...,Carrefour,Other brand,reimbursement,clean_BE,clean_PD,clean_DM,clean_AS,summary,top_3,proper_n
0,1,Mais ou est ma commande???,1,todo,"Commande N° 609249735,toujours non livrée,pass...",,2023-02-07T17:39:04.000Z,"Bonjour, nous vous informons que nous avons pr...",2023-02-08T09:48:06.000Z,,...,,,,,,,,,,
1,2,Cela fait plusieurs semaines même plus…,1,todo,Magasin de Carrefour Trans en Provence 83\nCel...,,2023-02-06T14:48:53.000Z,,,,...,,,,,,,,,,
2,3,Commande Deliveroo annulé !,1,todo,Commande Deliveroo passé ce jour sans suite ca...,,2023-02-08T10:54:56.000Z,"Bonjour, Pouvez-vous nous communiquer vos prén...",2023-02-08T10:17:23.000Z,,...,,,,,,,,,,
3,4,une honte cette société qu'utilise…,1,todo,une honte cette société qu'utilise Carrefour p...,,2023-02-04T18:29:49.000Z,"Bonjour, pourriez-vous nous communiquer votre ...",2023-02-05T08:17:54.000Z,,...,,,,,,,,,,
4,5,Nous sommes passer en caisse N,1,todo,Nous sommes passer en caisse N : 12\n28/01/202...,,2023-02-05T14:36:40.000Z,"Bonjour, Pouvez-vous nous préciser le magasin ...",2023-02-06T08:30:25.000Z,,...,,,,,,,,,,


In [11]:
print(df.shape)

(2000, 36)


In [13]:
  # Now let's make sure we select only the reviews that has some reviewers working on
have_reviewers = ['Poon', 'Yves', 'Insaf', 'All']
selected_df = df[df['reviewer'].isin(have_reviewers)]
unselected_df = df[~df['reviewer'].isin(have_reviewers)]

print(selected_df.shape)
print(unselected_df.shape)

(276, 36)
(1724, 36)


In [16]:
selected_df.iloc[0]['combined_reviews']

"Autant l'hyper de ma ville est plus… Autant l'hyper de ma ville est plus complet, autant mon carrefour de proximité....Hyper déçue. Pas de pignons de pain. Aucun bonbon cruelty free sans gélatine de cochon. Aucun fromage végétal. 😡 J'ai complètement commentaire comme je ne sais pas comment vous répondre en tout cas il s'agit du carrefour de Guelmeur à Brest"

In [17]:
summary, top_3, proper_n = runall("camembert",  selected_df.iloc[0]['combined_reviews'])

In [18]:
print(summary)
print(top_3)
print(proper_n)

Autant l'hyper de ma ville est plus... Autant mon carrefour de proximité.
carrefour, bonbon, pain



In [24]:
selected_df.columns

Index(['index', 'titles', 'ratings', 'reviewer', 'reviews', 'english', 'dates',
       'replies', 'reply_dates', 'clean_review', 'combined_reviews',
       'reviews_len', 'Buying experience', 'Digital', 'store', 'service',
       'product not available', 'Product', 'fresh', 'non fresh',
       'value (quality, etc.)', 'price', 'Delivery Mode', 'Drive', 'Delivery',
       'After Sales', 'Carrefour', 'Other brand', 'reimbursement', 'clean_BE',
       'clean_PD', 'clean_DM', 'clean_AS', 'summary', 'top_3', 'proper_n'],
      dtype='object')

In [68]:
pd.set_option('mode.chained_assignment', None)

In [94]:
selected_df.iloc[0] =  copy_selected_df_row

In [99]:
for i in range(3):
    copy_selected_row = selected_df.iloc[i].copy() #need to copy and assign values to avoid Warning
    copy_selected_row['summary'] = 'Im crazy'
    copy_selected_row['top_3']  = ['ss', 'ss']
    copy_selected_row['proper_n']  = ['s', 'ff']
    
    selected_df.iloc[i] = copy_selected_row #assign the result back to the original dataframe

## Main Summarization and Keyword 

In [123]:
for row in tqdm(range(20, 50)):
    copy_selected_row = selected_df.iloc[row].copy() #need to copy and assign values to avoid Warning
    summary, top_3, proper_n = runall("camembert",  copy_selected_row['combined_reviews'])
    copy_selected_row['summary'] = summary
    copy_selected_row['top_3'] = top_3.split()
    copy_selected_row['proper_n'] = proper_n.split()

    selected_df.iloc[row] = copy_selected_row

  0%|          | 0/30 [00:00<?, ?it/s]

RuntimeError: ignored

In [126]:
complete_df = pd.concat([selected_df, unselected_df]).sort_values('index')

In [128]:
complete_df[~complete_df['summary'].isnull()].shape

(52, 36)

In [129]:
complete_df.to_csv('complete.csv')

In [14]:
from tqdm.auto import tqdm
import json

## Read the ongoing labeling if have

In [None]:
with open('/content/summary.json', 'r') as f:
  summary_dict = json.load(f)

In [None]:
summary_dict["0"]['top_3']

['courriels,', 'réponse,', 'patientons']

In [None]:
def save_json(dictname):
    # create json object from dictionary
    json_file = json.dumps(dictname , indent=4)

    # open file for writing, "w" 
    f = open("summary.json","w")

    # write json object to file
    f.write(json_file)

    # close file
    f.close()


# RUN THIS TO START SUMMARIZATION AND EXTRACTION AND KEEP IN JSON

In [None]:
summary_dict = dict()

In [None]:
for i in tqdm(range(34,50)):
    data = df.iloc[i]
    print(data.new_index) 
    if data.new_index in summary_dict: #check first if we already do this row to faster the process
        pass
    else:
        int_index = int(data.new_index) #need to transform this to string, otherwise the json.load does not allow the numpy.
        summary_dict[int_index] = {} #create the new nested dict with new_index as key
        summary, top_3, proper_n = runall("camembert", data.combined_reviews) #use the columns 'combined_reviews' colum
        summary_dict[int_index]['original'] = data.combined_reviews
        summary_dict[int_index]['summary'] = summary
        summary_dict[int_index]['top_3'] = top_3.split() #split to tranform strings into list
        summary_dict[int_index]['proper_n'] = proper_n.split()


save_json(summary_dict)

  0%|          | 0/16 [00:00<?, ?it/s]

34
35


RuntimeError: ignored

In [None]:
save_json(summary_dict)

In [None]:
summary_dict[5]

{'original': "Service de livraison et ou de Drive… Service de livraison et ou de Drive pratique mais qui devient laborieux.Trop de produits manquants facturés aussi bien en livraison à domicile qu'avec le Drive. On court après les sollicitations de remboursements successifs car les demandes ne sont pas traitées. Et quand elles le sont, les virements ne sont pas effectués ou bien ne correspondent pas au montant payé. La cerise sur le gâteau: un Drive non remis car indisponible le jour J débité quand même. Alors que j'ai pris la peine de prévenir le service client de cette déconvenue où l'on m'a assurée du non débit de ma carte bancaire me voilà encore contrainte de faire une demande de remboursement. Dorénavant par gain de temps j'irai faire mes courses moi même.  Le service est pitoyable (indépendamment du contexte actuel) et les démarches chronophages. Un paradoxe compte tenu de toutes les enquêtes de satisfaction reçues!!! Dommage",
 'summary': 'Service de livraison et ou de Drive...

In [None]:
33

In [None]:
save_json(summary_dict)