# PRJ2 Assurance
by: Ruben LEON, Louis-Melchior GIRAUD

## Preprocess

In [1]:
import pandas as pd
import numpy as np      # for the rest of the nbk
import os 

# Load data
dfs = []
data_folder_path = os.path.join('data', 'assurance')
for file_name in os.listdir(data_folder_path):
    path = os.path.join(data_folder_path, file_name)
    new_df = pd.read_excel(path)
    dfs.append(new_df)
df = pd.concat(dfs, ignore_index=True)

# Drop usless colums & problematic lines
drop_colmun = [
    'auteur', 'produit', 'date_publication', 'date_exp',
    'avis_cor', 'avis_cor_en']
df.drop(drop_colmun, axis=1, inplace=True)
df.dropna(subset=['avis_en'], inplace=True)
df['note'] = df['note'].fillna(0).astype(int)

# Rename columns
df.rename(columns={'avis': 'avis_fr', 'assureur': 'insurer'}, inplace=True)
df = df.reindex(columns=['insurer', 'avis_fr', 'avis_en','note', 'type']) # no agr inplace

# Split df
df_train = df[df['type'] == 'train'].drop(columns=['type'])         # 24103 lines
df_test = df[df['type'] == 'test'].drop(columns=['type', 'note'])   # 10 330 lines

df_train.head()

Unnamed: 0,insurer,avis_fr,avis_en,note
2000,Direct Assurance,"Meilleurs assurances, prix, solutions, écoute,...","Best insurance, price, solutions, listening, s...",5
2001,Direct Assurance,"je suis globalement satisfait , sauf que vous ...","I am generally satisfied, except that you have...",4
2002,Direct Assurance,Prix tres abordable plusieurs options s'offren...,Very affordable price Several options are avai...,5
2003,L'olivier Assurance,"je satisfait du service, une réponse très rapi...","I satisfy the service, a very fast response fr...",4
2004,Matmut,"Client depuis plus de 25 ans, très déçu de cet...","Customer for more than 25 years, very disappoi...",1


In [2]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24103 entries, 2000 to 33434
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   insurer  24103 non-null  object
 1   avis_fr  24103 non-null  object
 2   avis_en  24103 non-null  object
 3   note     24103 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 941.5+ KB


In [3]:
lang = 'en' # ['fr', 'en]
nb_samples = 100

# Define labels (last item must be 'Others')
labels_fr= ["Prix", "Couverture", "Inscription", "Service Client", "Traitement des Sinistres", "Annulation", "Autres"]
labels_en=["Pricing", "Coverage", "Enrollment", "Customer Service", 'Claims Processing', 'Cancellation', 'Others']

# Choose language
avis = 'avis_fr' if lang == 'fr' else 'avis_en'
labels = labels_fr if lang == 'fr' else labels_en

# Get the reviews
df_sample = df_train.sample(n=nb_samples, random_state=42)

In [4]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('device:', device)

device: cuda


# Exo 1: Prediction Application

- **Objective**: Predict the number of stars or the main subject of the review.
- **Features**:
    - User inputs a review.
    - Model predicts the star rating and/or category (e.g., "Pricing" or "Customer Service").
    - Display results in real-time.
- **Implementation Steps**:
    - Use a pre-trained text classification model for stars.
    - Implement a zero-shot or fine-tuned classifier for subjects.

## Sentiment Analysis

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

senti_model_name = "tabularisai/multilingual-sentiment-analysis"
senti_tokenizer = AutoTokenizer.from_pretrained(senti_model_name)
senti_model = AutoModelForSequenceClassification.from_pretrained(
    senti_model_name).to(device)
senti_model = senti_model.eval()

# sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}
def sentiment_pipeline(texts):
    inputs = senti_tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(device)
    
    with torch.no_grad():
        outputs = senti_model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        sentiment = torch.argmax(probabilities, dim=-1).tolist()
        
    senti_rescale = [senti+1 for senti in sentiment]    # convert to 1-5 scale like stars
    return senti_rescale

In [11]:
reviews = df_sample[avis].tolist()
stars = sentiment_pipeline(reviews)
dist = [np.abs(score - gt_score) for score, gt_score in zip(stars, df_sample['note'])]
avg_dist = np.mean(dist)

print('Average distance:', avg_dist)

Average distance: 0.79


## Classifier

For review larger than max_length
- split_long_reviews
    1. Split review in smaller chunks
    2. Store the chunks with the other reviews
    3. Keep a log of it original index in id_map
- process data
- merdge_splited_review  
    1. Merdge chunks with the same index in id_map  
    2. Average the scores of the chunks to get the final score

In [6]:
def split_long_reviews(reviews, max_length=500):
    split_reviews, id_map = [], []
    
    for i, raw in enumerate(reviews):
        
        if len(raw) > max_length:
            chunk_list, chunk = [], ''
            
            # Split reviews in chunk smaller than max_length
            for s in raw.split(' '):
                if len(chunk + s) < max_length:
                    chunk += s + ' '
                else:
                    chunk_list.append(chunk)
                    chunk = s + ' '
            chunk_list.append(chunk)

            split_reviews.extend(chunk_list)
            id_map.extend([i] * len(chunk_list))

        else:
            split_reviews.append(raw)
            id_map.append(i)

    return split_reviews, id_map

In [7]:
def save_state(output, subject, score):
    output['subject'].append(subject)   
    output['score'].append(score)
    return output

def reset_state(id, resp):
    current_id = id
    current_sbj = [resp[id]['labels']]
    current_scr = [resp[id]['scores']]

    return current_id, current_sbj, current_scr

def merdge_splited_review(resp, id_map):
    output = {'subject': [], 'score': []}
    current_state = 0, [], [] # id, subject, score

    for id in id_map + [-1]:    # add -1 to save last state
        current_id, current_sbj, current_scr = current_state

        if current_id == id:
            current_sbj.append(resp[id]['labels'])
            current_scr.append(resp[id]['scores'])
        
        # one item => review not splited
        # take current state first item, return best element
        elif len(current_sbj) == 1:
            output = save_state(output, current_sbj[0][0], current_scr[0][0])
            current_state = reset_state(id, resp)

        else: 
            # get a dict {'subject' : [score_i, score_i+1, ...]}
            avg_dict = {sbj : [] for sbj in current_sbj[0]}
            for sbj_list, scr_list in zip(current_sbj, current_scr):
                for i in range(len(sbj_list)):
                    avg_dict[sbj_list[i]].append(scr_list[i])

            # get subject with the best avg score
            for sbj in avg_dict.keys():
                avg_dict[sbj] = np.mean(avg_dict[sbj])
            
            # save
            best_subject = max(avg_dict, key=avg_dict.get)
            output = save_state(output, best_subject, avg_dict[best_subject])
            current_state = reset_state(id, resp)


    return output['subject'], output['score']

In [8]:
from transformers import pipeline
from numpy import argmax

classifier = pipeline("zero-shot-classification", model='cross-encoder/nli-deberta-v3-base',
    use_fast = False, device=device
)

def subject_pipeline(reviews, lang='fr', threshold=0.5):
    # Pre-process
    labels = labels_fr if lang == 'fr' else labels_en
    split_reviews, id_map = split_long_reviews(reviews, max_length=500)

    # Run model
    resp = classifier(split_reviews, labels[:-1])   # exclude 'Others' label

    # Post process
    subjects, scores = merdge_splited_review(resp, id_map)
    for i in range(len(scores)): # Classify as 'Others' under thresholds
        if scores[i] <= threshold:
            subjects[i] = labels[-1]    

    return subjects

2025-01-13 12:32:04.945664: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-13 12:32:04.945749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-13 12:32:04.947290: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-13 12:32:04.956187: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [206]:
# Run model zero-shot-classification
reviews = df_sample[avis].tolist()
subjects = subject_pipeline(reviews, lang)

# Print
print('Language:', lang)
for l in labels:
    print('\t-', subjects.count(l), l)

print('\n')
for subject, text in zip(subjects, df_sample[avis]):
    print(f"Review: {text}")
    print(f"Subject: {subject}")
    print("---")

Language: fr
	- 14 Prix
	- 1 Couverture
	- 7 Inscription
	- 33 Service Client
	- 3 Traitement des Sinistres
	- 19 Annulation
	- 56 Autres


Review: Déçue après un accident domestique.
Les deux gestionnaires n'ont pas fait leur boulot, l'expert accordé 1 mois d'aide humaine et lorsque je demande l'indemnisation  0
Alors je faire des devis ailleurs 
Subject: Annulation
---
Review: très bons services - personnel très agréable et compétant sachant répondre aux diverses questions - rapport qualité prix très satisfaisant- je suis ravie d'avoir changé de fournisseur

Subject: Service Client
---
Review: Ok pour le service prix compétitif j espère que tout va se faire rapidement et avoir ma carte verte le plus vite Possible je vous ai connu par rapport à ma petite sœur qui est assuré chez vous
Subject: Autres
---
Review: Injoignables n'écoutent rien, mais aiment en revanche s'écouter parler. Ne justifient par leur hausses de prix
Subject: Annulation
---
Review: Bonjour j'ai vendu mon véhicule s

# Exo 2: Insurer Analysis Application

- Objective: Provide insights into insurer performance based on reviews.
- Features:
    1. Summary by Insurer: Aggregate reviews and generate a summary using NLP techniques.
    2. Review Search: Enable search functionality for specific reviews using keywords or filters (e.g., star ratings, subjects).
    3. Metrics:
        - Average star rating by insurer.
        - Average star rating by subject for each insurer (e.g., "Pricing: 3.2/5").
- Implementation Steps:
    1. Preprocess reviews to structure them by insurer and subjects.
    2. Generate summaries using text summarization models (e.g., T5, GPT-3).
    3. Reuse code from Project 1 for implementing the search functionality.
    4. Compute and visualize metrics using Streamlit widgets like bar charts and tables

## Summaries by Insuer

In [46]:
def dataframe_filter(df_input, insurers=None, labels=None, stars=None):
    df_filter = df_input.copy()
    filter_dict = {'insurer': insurers, 'label': labels, 'star': stars }
    
    for filter_name, filter in filter_dict.items():
        if filter is not None:                                                  # if str or [str]
            filter = [filter] if type(filter) == str else filter    # convert str to [str]
            df_filter = df_filter[df_filter[filter_name].isin(filter)]
    
    return df_filter

In [35]:
from transformers import pipeline

summarizer = pipeline("summarization", model="Falconsai/text_summarization", 
    device=device
)

def join_reviews(reviews, length_max=1000):
    output, current_review = [], ''
    
    for text in reviews:
        candidat_review = current_review + ' ' + text
        candidat_review = candidat_review.replace('\n', '')
        candidat_review = candidat_review.replace(' . ', '. ')

        # words != tokens but it's an easy eaistimation
        if candidat_review.count(' ') < length_max:
            current_review = candidat_review

        elif current_review.count(' ') < length_max:
            output.append(current_review)
            current_review = text
        
        else: # previous text too long on elif, do reccursion
            chunks = join_reviews(current_review, length_max)
            output.extend(chunks)
            current_review = text

    output.append(current_review)
    return output


def summarize_reviews(summary_text, lang='fr', input_length_max=512, output_length_max=300):
    nb_loop = 0
    print(f'Original length : {len(summary_text)} lines')
    print('Nb loop \t Summary length')
    
    # Summarize d anmerdge until we have only one summary
    while len(summary_text) > 1:
        # Merdge the text in chunks of input_length_max words
        summary_merdge = join_reviews(summary_text, input_length_max)
        
        # Summurize this chunks in text between[output_length_max, min_length] words
        summary_resp = summarizer(summary_merdge, 
            max_length=output_length_max, 
            min_length=50, 
            do_sample=False
        )
        summary_text = [r['summary_text'] for r in summary_resp]

        # Print to follow advencement
        nb_loop += 1
        print(nb_loop, '\t\t', len(summary_text))
        
    return summary_text[0]

In [41]:
insuer_list = df['insurer'].unique()
print(insuer_list)

["L'olivier Assurance" 'Direct Assurance' 'GMF' 'Pacifica' 'Matmut'
 'Néoliane Santé' 'APRIL' 'SantéVet' 'Mercer' 'Generali' 'Allianz'
 'APRIL Moto' 'Cegema Assurances' 'LCL' 'Afer' 'SwissLife' 'MAAF'
 'Solly Azar' 'AMV' 'CNP Assurances' 'MAIF' 'Sogecap' 'Harmonie Mutuelle'
 'Mutuelle des Motards' 'MACIF' 'Eurofil' 'Active Assurances' 'AXA'
 'Sogessur' 'Ag2r La Mondiale' 'Mgen' "Zen'Up" 'MGP' 'Intériale'
 'Génération' 'Cardif' 'Santiane' 'Eca Assurances' 'Groupama'
 "Assur O'Poil" 'MMA' 'MetLife' 'Crédit Mutuel' 'Afi Esca' 'Gan'
 'Magnolia' 'Suravenir' 'Assur Bon Plan' 'AssurOnline' 'Carac' 'Mapa'
 'Malakoff Humanis' 'Euro-Assurance' 'Peyrac Assurances' 'Hiscox' 'Sma']


In [112]:
issuer_summary = summarize_reviews(summary_merdge, input_length_max=500, output_length_max=100)

print('---')
print(issuer_summary)

Original length : 48 lines
Nb loop 	 Summary length
1 		 102
2 		 10
3 		 1
---
La MGEN, malgré nos nombreuses relances, empêchant notre mutuelle actuelle de nous rembourser nos soins . Aucune réponse claire voire aucune et finalement s'ils disent vrai, ce que je conteste, un très faible remboursement pour des frais dentaires . Je veux faire venir ces enfants en France, j'attends toujours leur réponse .


## Apply on df_test

In [6]:
!nvidia-smi

Sun Jan 12 22:01:15 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 5000     Off  | 00000000:00:05.0 Off |                  Off |
| 33%   39C    P2    36W / 230W |   2435MiB / 16384MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
# Change language 'fr' or 'en'
lang = 'fr'

# Choose language
avis = 'avis_fr' if lang == 'fr' else 'avis_en'
labels = labels_fr if lang == 'fr' else labels_en

In [10]:
from tqdm.notebook import tqdm_notebook as tqdm

chunk_size = 300

for chunk_idx in tqdm(range(0, len(df_test), chunk_size), desc="Processing"):
    # Get a sample of the df
    end_idx = min(chunk_idx + chunk_size - 1, len(df_test))
    df_sample = df_test.iloc[chunk_idx:end_idx]
    reviews = df_sample[avis].tolist()
    
    # Apply the pipeline on it
    star = sentiment_pipeline(reviews)
    label = subject_pipeline(reviews, lang=lang)

    # Update the df
    review_id = df_sample.index # can't modify with iloc
    df_test.loc[review_id, 'star'] = star
    df_test.loc[review_id, 'label'] = label

df_test.head()

Processing:   0%|          | 0/35 [00:00<?, ?it/s]



Unnamed: 0,insurer,avis_fr,avis_en,star,label
0,L'olivier Assurance,Je suis pour le moment satisfait du service. J...,I am currently satisfied with the service. I a...,4.0,Autres
1,L'olivier Assurance,"Que du personnel au téléphone, ce qui explique...","That staff on the phone, which explains the pr...",5.0,Prix
2,L'olivier Assurance,Un rapport qualité prix très intéressant ! Pet...,A very interesting value for money! Little dow...,4.0,Service Client
3,L'olivier Assurance,"Service très pratique et rapide, le service cl...","Very practical and fast service, customer serv...",5.0,Service Client
4,L'olivier Assurance,Je suis satisfait du service obtenu ! L'accuei...,I am satisfied with the service obtained! The ...,4.0,Service Client


In [11]:
df_prepro_path = os.path.join('data', 'df_assurance_fr_prepro.csv')
df_test.to_csv(df_prepro_path, index=False)
#df_test = pd.read_csv(df_prepro_path)
df_test.columns

Index(['insurer', 'avis_fr', 'avis_en', 'star', 'label'], dtype='object')