In [1]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [2]:
!pip install translatepy

Collecting translatepy
  Downloading translatepy-2.3-py3-none-any.whl.metadata (16 kB)
Collecting safeIO>=1.2 (from translatepy)
  Downloading safeIO-1.2.tar.gz (8.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyuseragents (from translatepy)
  Downloading pyuseragents-1.0.5-py3-none-any.whl.metadata (4.3 kB)
Collecting inquirer>=2.8.0 (from translatepy)
  Downloading inquirer-3.4.0-py3-none-any.whl.metadata (6.8 kB)
Collecting blessed>=1.19.0 (from inquirer>=2.8.0->translatepy)
  Downloading blessed-1.20.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting editor>=1.6.0 (from inquirer>=2.8.0->translatepy)
  Downloading editor-1.6.6-py3-none-any.whl.metadata (2.3 kB)
Collecting readchar>=4.2.0 (from inquirer>=2.8.0->translatepy)
  Downloading readchar-4.2.1-py3-none-any.whl.metadata (7.5 kB)
Collecting runs (from editor>=1.6.0->inquirer>=2.8.0->translatepy)
  Downloading runs-1.2.2-py3-none-any.whl.metadata (10 kB)
Collecting xmod (from editor>=1.6.0->inquirer>=2.8

In [3]:
import pandas as pd

# Charger le fichier traduit
merged_data = pd.read_csv("/content/data_scrapée+lang.csv")

In [4]:
merged_data.isnull().sum()

Unnamed: 0,0
listing_id,0
description,0
comments,3
description_lang,0
comments_lang,0


In [5]:
merged_data.head()

Unnamed: 0,listing_id,description,comments,description_lang,comments_lang
0,3109,Lovely Appartment with one bedroom with a Quee...,Tout s'est bien déroulé. Merci bien. PG,en,fr
1,3109,Lovely Appartment with one bedroom with a Quee...,Un petit nid fouiller douillet situé dans app...,en,fr
2,3109,Lovely Appartment with one bedroom with a Quee...,"Appartement spacieux, propre,clair, et calme à...",en,fr
3,3109,Lovely Appartment with one bedroom with a Quee...,"Appartement totalement rénové, en parfait état...",en,fr
4,5396,"NEW SOFA-BED SINCE JUNE 2023, Please disregard...",Perfect location!! Nasrine was a delight and m...,en,en


In [6]:
merged_data = merged_data.dropna()

In [7]:
merged_data.shape

(9997, 5)

## 🌐 Traduction Automatique Multilingue avec `translatepy`

Ce script automatise la **traduction des descriptions et commentaires non anglophones** en anglais, afin de faciliter les étapes de nettoyage, de similarité et de génération.

---

### 🔍 Objectif
Traduire les colonnes `"description"` et `"comments"` d'un DataFrame `merged_data`, tout en :
- Respectant les codes de langue détectés.
- Effectuant la traduction par lots pour éviter les blocages.
- Ajoutant des colonnes traduites au DataFrame.





In [8]:
import pandas as pd
import time
from translatepy import Translator

translator = Translator()

# Dictionnaire de correction des codes de langue
lang_corrections = {
    "zh-tw": "zh-TW",  # Chinois traditionnel
    "zh-cn": "zh-CN",  # Chinois simplifié
    "iw": "he",        # Hébreu
    "unknown": "auto"  # Détection automatique pour les langues inconnues
}

# Fonction de traduction avec affichage du statut
def batch_translate(text_list, source_lang):
    if not text_list:
        return []

    # Correction du code de langue
    source_lang = lang_corrections.get(source_lang, source_lang)

    translated_texts = []
    for idx, text in enumerate(text_list):
        try:
            result = translator.translate(text, "english", source_lang)
            translated_texts.append(result.result)
            print(f"✅ [{idx + 1}/{len(text_list)}] {source_lang} → en : {text[:50]}... → {result.result[:50]}...")
        except Exception as e:
            print(f"❌ Erreur [{idx + 1}/{len(text_list)}] : {text[:50]}... | Erreur : {e}")
            translated_texts.append(text)  # Conserver le texte original en cas d'erreur
        time.sleep(0.5)  # Pause pour éviter le blocage

    return translated_texts

# Colonnes à traduire
columns_to_translate = ["description", "comments"]

# Taille du batch
batch_size = 10

# Vérification des colonnes dans le dataset
columns_to_translate = [col for col in columns_to_translate if col in merged_data.columns]

# Appliquer la traduction
for col in columns_to_translate:
    lang_col = f"{col}_lang"

    if lang_col in merged_data.columns:  # Vérifier si la colonne de langue existe
        mask = merged_data[lang_col] != "en"  # Filtrer les textes non anglais
        texts_to_translate = merged_data.loc[mask, col].tolist()
        source_langs = merged_data.loc[mask, lang_col].unique().tolist()

        print(f"\n🔹 Traduction de la colonne : {col}")

        translated_texts = []
        for lang in source_langs:
            sub_mask = (merged_data[lang_col] == lang) & mask
            sub_texts = merged_data.loc[sub_mask, col].tolist()

            if sub_texts:
                print(f"\n🟢 Langue détectée : {lang} - {len(sub_texts)} phrases")
                for i in range(0, len(sub_texts), batch_size):
                    print(f"   ⏳ Batch {i + 1}/{len(sub_texts) // batch_size + 1} en cours...")
                    translated_sub_texts = batch_translate(sub_texts[i:i+batch_size], lang)
                    translated_texts.extend(translated_sub_texts)
                    time.sleep(1)

        # Mettre à jour le DataFrame avec les textes traduits
        merged_data.loc[mask, f"{col}_translated"] = translated_texts





🔹 Traduction de la colonne : description

🟢 Langue détectée : so - 100 phrases
   ⏳ Batch 1/11 en cours...
✅ [1/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Renovation ...
✅ [2/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Renovation ...
✅ [3/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Renovation ...
✅ [4/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Renovation ...
✅ [5/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Renovation ...
✅ [6/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Renovation ...
✅ [7/10] so → en : MODERN AND STYLISH APARTMENT, COMPLETE RENOVATION ... → Modann and stylish apartment, complete Reno

In [9]:
import pandas as pd



# Vérifier que les colonnes existent
if "description_translated" in merged_data.columns and "comments_translated" in merged_data.columns:

    # Trier le DataFrame par la colonne des descriptions pour regrouper les éléments similaires
    merged_data = merged_data.sort_values(by="description_translated").reset_index(drop=True)

    # Sauvegarder le fichier trié
    output_file = "merged_data_grouped_sorted.xlsx"
    merged_data.to_excel(output_file, index=False)

    print(f"📂 Fichier trié et regroupé visuellement enregistré sous : {output_file}")

    # Affichage des résultats
    from IPython.display import display
    # Afficher le DataFrame directement dans l'interface
    display(merged_data)

else:
    print("❌ Les colonnes 'description_translated' et 'comments_translated' sont introuvables dans le fichier.")


📂 Fichier trié et regroupé visuellement enregistré sous : merged_data_grouped_sorted.xlsx


Unnamed: 0,listing_id,description,comments,description_lang,comments_lang,description_translated,comments_translated
0,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...","A terrific location. \r<br/>Nice , clean and n...",so,en,"Modann and stylish apartment, complete Renovat...",
1,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",The location and the facilities of the apartme...,so,en,"Modann and stylish apartment, complete Renovat...",
2,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",We were so lucky to find this beautiful apartm...,so,en,"Modann and stylish apartment, complete Renovat...",
3,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Wonderful apartment in the heart of Paris - on...,so,en,"Modann and stylish apartment, complete Renovat...",
4,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Excellent location; just steps away from amazi...,so,en,"Modann and stylish apartment, complete Renovat...",
...,...,...,...,...,...,...,...
9992,167998,"A local neighbourhood filled with history, res...","Louise, our host was fabulous, friendly and he...",en,en,,
9993,167998,"A local neighbourhood filled with history, res...",Louise was a wonderful host and promptly showe...,en,en,,
9994,167998,"A local neighbourhood filled with history, res...",Louise的房子地点简直太赞了！街对面就是几家很小资开到很晚的餐馆和时尚服装店，走出街就是...,en,zh-cn,,"Superb apartment, very well placed to discover..."
9995,167998,"A local neighbourhood filled with history, res...","Louise's flat is beautiful, chic and has a per...",en,en,,


In [10]:
print(merged_data['description_translated'])

0       Modann and stylish apartment, complete Renovat...
1       Modann and stylish apartment, complete Renovat...
2       Modann and stylish apartment, complete Renovat...
3       Modann and stylish apartment, complete Renovat...
4       Modann and stylish apartment, complete Renovat...
                              ...                        
9992                                                  nan
9993                                                  nan
9994                                                  nan
9995                                                  nan
9996                                                  nan
Name: description_translated, Length: 9997, dtype: object


In [11]:
import pandas as pd
from IPython.display import display  # Pour affichage dans Jupyter/Colab

# Vérifier si les colonnes de traduction existent
if 'description_translated' in merged_data.columns and 'comments_translated' in merged_data.columns:

    # 🔹 Remplacer les traductions échouées (NaN ou contenant ❌) par la description originale
    merged_data['description_translated'] = merged_data.apply(
        lambda row: row['description'] if pd.isna(row['description_translated']) or "x" in str(row['description_translated'])
        else row['description_translated'],
        axis=1
    )

    # 🔹 Remplacer les traductions échouées (NaN) dans `comments_translated` par `comments`
    merged_data['comments_translated'] = merged_data.apply(
        lambda row: row['comments'] if pd.isna(row['comments_translated']) or "x" in str(row['comments_translated'])
        else row['comments_translated'],
        axis=1
    )

    print("✅ Les descriptions et commentaires non traduits ont été remplacés par leur version originale.")

    # 🔹 Affichage du DataFrame mis à jour
    display(merged_data)

    # 🔹 Sauvegarde dans un fichier Excel
    output_file = "merged_data_updated.xlsx"
    merged_data.to_excel(output_file, index=False)
    print(f"📂 Fichier sauvegardé sous : {output_file}")

else:
    print("x Les colonnes 'description_translated' ou 'comments_translated' sont introuvables.")



✅ Les descriptions et commentaires non traduits ont été remplacés par leur version originale.


Unnamed: 0,listing_id,description,comments,description_lang,comments_lang,description_translated,comments_translated
0,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...","A terrific location. \r<br/>Nice , clean and n...",so,en,"Modann and stylish apartment, complete Renovat...",
1,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",The location and the facilities of the apartme...,so,en,"Modann and stylish apartment, complete Renovat...",
2,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",We were so lucky to find this beautiful apartm...,so,en,"Modann and stylish apartment, complete Renovat...",
3,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Wonderful apartment in the heart of Paris - on...,so,en,"Modann and stylish apartment, complete Renovat...",
4,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Excellent location; just steps away from amazi...,so,en,"Modann and stylish apartment, complete Renovat...",
...,...,...,...,...,...,...,...
9992,167998,"A local neighbourhood filled with history, res...","Louise, our host was fabulous, friendly and he...",en,en,,
9993,167998,"A local neighbourhood filled with history, res...",Louise was a wonderful host and promptly showe...,en,en,,
9994,167998,"A local neighbourhood filled with history, res...",Louise的房子地点简直太赞了！街对面就是几家很小资开到很晚的餐馆和时尚服装店，走出街就是...,en,zh-cn,,"Superb apartment, very well placed to discover..."
9995,167998,"A local neighbourhood filled with history, res...","Louise's flat is beautiful, chic and has a per...",en,en,,


📂 Fichier sauvegardé sous : merged_data_updated.xlsx


In [12]:
import pandas as pd
from IPython.display import display  # Pour affichage dans Jupyter/Colab

# Vérifier si les colonnes de traduction existent
if 'description_translated' in merged_data.columns and 'comments_translated' in merged_data.columns:

    # 🔹 Convertir en string pour éviter les erreurs
    merged_data['description_translated'] = merged_data['description_translated'].astype(str)
    merged_data['comments_translated'] = merged_data['comments_translated'].astype(str)

    # 🔹 Remplacer les descriptions non traduites par la version originale
    merged_data.loc[merged_data['description_translated'].isin(["nan", "NaN", "None", "", " "]), 'description_translated'] = merged_data['description']

    # 🔹 Remplacer les commentaires non traduits par la version originale
    merged_data.loc[merged_data['comments_translated'].isin(["nan", "NaN", "None", "", " "]), 'comments_translated'] = merged_data['comments']

    print("✅ Les descriptions et commentaires non traduits ont été correctement remplacés.")

    # 🔹 Affichage du DataFrame mis à jour
    display(merged_data)

    # 🔹 Sauvegarde dans un fichier Excel
    output_file = "merged_data_updated.xlsx"
    merged_data.to_excel(output_file, index=False)
    print(f"📂 Fichier sauvegardé sous : {output_file}")

else:
    print("❌ Les colonnes 'description_translated' ou 'comments_translated' sont introuvables.")



✅ Les descriptions et commentaires non traduits ont été correctement remplacés.


Unnamed: 0,listing_id,description,comments,description_lang,comments_lang,description_translated,comments_translated
0,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...","A terrific location. \r<br/>Nice , clean and n...",so,en,"Modann and stylish apartment, complete Renovat...","A terrific location. \r<br/>Nice , clean and n..."
1,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",The location and the facilities of the apartme...,so,en,"Modann and stylish apartment, complete Renovat...",The location and the facilities of the apartme...
2,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",We were so lucky to find this beautiful apartm...,so,en,"Modann and stylish apartment, complete Renovat...",We were so lucky to find this beautiful apartm...
3,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Wonderful apartment in the heart of Paris - on...,so,en,"Modann and stylish apartment, complete Renovat...",Wonderful apartment in the heart of Paris - on...
4,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Excellent location; just steps away from amazi...,so,en,"Modann and stylish apartment, complete Renovat...",Excellent location; just steps away from amazi...
...,...,...,...,...,...,...,...
9992,167998,"A local neighbourhood filled with history, res...","Louise, our host was fabulous, friendly and he...",en,en,"A local neighbourhood filled with history, res...","Louise, our host was fabulous, friendly and he..."
9993,167998,"A local neighbourhood filled with history, res...",Louise was a wonderful host and promptly showe...,en,en,"A local neighbourhood filled with history, res...",Louise was a wonderful host and promptly showe...
9994,167998,"A local neighbourhood filled with history, res...",Louise的房子地点简直太赞了！街对面就是几家很小资开到很晚的餐馆和时尚服装店，走出街就是...,en,zh-cn,"A local neighbourhood filled with history, res...","Superb apartment, very well placed to discover..."
9995,167998,"A local neighbourhood filled with history, res...","Louise's flat is beautiful, chic and has a per...",en,en,"A local neighbourhood filled with history, res...","Louise's flat is beautiful, chic and has a per..."


📂 Fichier sauvegardé sous : merged_data_updated.xlsx


In [19]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=233d9084283b83ef71344585016ffc07e9c65bac054079bef5f6fdd5121ebce0
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [20]:
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from IPython.display import display  # Pour affichage dans Jupyter/Colab

# Fonction pour détecter la langue d'un texte (avec gestion des erreurs)
def detect_language(text):
    try:
        return detect(text) if pd.notna(text) and isinstance(text, str) else "unknown"
    except LangDetectException:
        return "unknown"

# 🔹 Appliquer la détection de langue sur les colonnes traduites
merged_data['description_translated_lang'] = merged_data['description_translated'].apply(detect_language)
merged_data['comments_translated_lang'] = merged_data['comments_translated'].apply(detect_language)

# 🔹 Filtrer pour ne garder que les traductions en anglais
merged_data = merged_data[(merged_data['description_translated_lang'] == "en") & (merged_data['comments_translated_lang'] == "en")].reset_index(drop=True)

print("✅ Seules les descriptions et commentaires traduits en anglais sont conservés.")

# 🔹 Affichage du DataFrame mis à jour
display(merged_data)

# 🔹 Sauvegarde dans un fichier Excel
output_file = "merged_data_english_only.xlsx"
merged_data.to_excel(output_file, index=False)
print(f"📂 Fichier sauvegardé sous : {output_file}")


✅ Seules les descriptions et commentaires traduits en anglais sont conservés.


Unnamed: 0,listing_id,description,comments,description_lang,comments_lang,description_translated,comments_translated,description_translated_lang,comments_translated_lang
0,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...","A terrific location. \r<br/>Nice , clean and n...",so,en,"Modann and stylish apartment, complete Renovat...","A terrific location. \r<br/>Nice , clean and n...",en,en
1,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",The location and the facilities of the apartme...,so,en,"Modann and stylish apartment, complete Renovat...",The location and the facilities of the apartme...,en,en
2,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",We were so lucky to find this beautiful apartm...,so,en,"Modann and stylish apartment, complete Renovat...",We were so lucky to find this beautiful apartm...,en,en
3,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Wonderful apartment in the heart of Paris - on...,so,en,"Modann and stylish apartment, complete Renovat...",Wonderful apartment in the heart of Paris - on...,en,en
4,93287,"MODERN AND STYLISH APARTMENT, COMPLETE RENOVAT...",Excellent location; just steps away from amazi...,so,en,"Modann and stylish apartment, complete Renovat...",Excellent location; just steps away from amazi...,en,en
...,...,...,...,...,...,...,...,...,...
9053,167998,"A local neighbourhood filled with history, res...","Louise, our host was fabulous, friendly and he...",en,en,"A local neighbourhood filled with history, res...","Louise, our host was fabulous, friendly and he...",en,en
9054,167998,"A local neighbourhood filled with history, res...",Louise was a wonderful host and promptly showe...,en,en,"A local neighbourhood filled with history, res...",Louise was a wonderful host and promptly showe...,en,en
9055,167998,"A local neighbourhood filled with history, res...",Louise的房子地点简直太赞了！街对面就是几家很小资开到很晚的餐馆和时尚服装店，走出街就是...,en,zh-cn,"A local neighbourhood filled with history, res...","Superb apartment, very well placed to discover...",en,en
9056,167998,"A local neighbourhood filled with history, res...","Louise's flat is beautiful, chic and has a per...",en,en,"A local neighbourhood filled with history, res...","Louise's flat is beautiful, chic and has a per...",en,en


📂 Fichier sauvegardé sous : merged_data_english_only.xlsx


**j'ai enregistré la data pour l utiliser après dans le notebook Annotation+ pretraitement des donnée+ cosine similarity**