# Import the modules

In [None]:
# !pip install langdetect
# !pip install googletrans==4.0.0-rc1
# !pip install tqdm
# !pip install sentence-transformers

In [1]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException


In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preprocessing

In [3]:
def preprocess_text(text):
    """
    Preprocesses a given text by cleaning, tokenizing, removing stopwords, and lemmatizing.
    
    Parameters:
    - text (str): The input text to preprocess.

    Returns:
    - str: The preprocessed text.
    """
    if not isinstance(text, str):
        return "no meaningful text"

    # Convert to lowercase
    text = text.lower()

    # Remove non-alphanumeric characters (excluding spaces)
    text = re.sub(r"[^\w\s]", " ", text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Stopword removal
    stop_words = set(stopwords.words('english'))
    if len(tokens) > 5:  # Only remove stopwords for longer texts
        tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    preprocessed_text = " ".join(tokens)

    # Return original text if preprocessing removes all meaningful content
    if not preprocessed_text.strip():
        return text  # Return original text

    return preprocessed_text

In [4]:
dataset_path = r"books_data/books.csv"
dataset = pd.read_csv(dataset_path)

In [5]:
dataset.head()

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...
2,Harper Lee,The unforgettable novel of a childhood in a sl...,50th Anniversary,Paperback,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,«È cosa ormai risaputa che a uno scapolo in po...,"Modern Library Classics, USA / CAN",Paperback,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...
4,Stephenie Meyer,About three things I was absolutely positive.F...,,Paperback,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...


In [7]:
dataset.drop(columns=['book_format', 'book_authors', 'book_edition', 'book_isbn','book_pages','book_rating','book_rating_count','book_review_count','genres','image_url'], inplace=True)
dataset.head()

Unnamed: 0,book_desc,book_title
0,Winning will make you famous. Losing means cer...,The Hunger Games
1,There is a door at the end of a silent corrido...,Harry Potter and the Order of the Phoenix
2,The unforgettable novel of a childhood in a sl...,To Kill a Mockingbird
3,«È cosa ormai risaputa che a uno scapolo in po...,Pride and Prejudice
4,About three things I was absolutely positive.F...,Twilight


In [8]:
dataset.shape

(54301, 2)

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54301 entries, 0 to 54300
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   book_desc   52970 non-null  object
 1   book_title  54301 non-null  object
dtypes: object(2)
memory usage: 848.6+ KB


In [10]:
dataset.describe()

Unnamed: 0,book_desc,book_title
count,52970,54301
unique,51781,48483
top,"هذه هي طبعة ""دار الفكر - بيروت"" وهي آخر طبعة ع...",1984
freq,38,17


In [11]:
dataset.isnull().sum()

book_desc     1331
book_title       0
dtype: int64

In [12]:
dataset['book_desc'].fillna(dataset['book_title'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['book_desc'].fillna(dataset['book_title'], inplace=True)


In [13]:
dataset.isnull().sum()

book_desc     0
book_title    0
dtype: int64

In [14]:
dataset = dataset.drop_duplicates(subset='book_title', keep='first').reset_index(drop=True)
dataset = dataset.groupby('book_title', as_index=False).agg({
    'book_desc': ' '.join  # Combine blurbs
})
dataset.shape

(48483, 2)

In [15]:
dataset

Unnamed: 0,book_title,book_desc
0,"""Break the Casanova's Heart"" Operation",Operation Break the Casanova's Heart10 Steps t...
1,"""El Aleph"" de Jorge Luis Borges",De los muy pocos manuscritos que se conservan ...
2,"""Evil"" Arabs in American Popular Film: Orienta...","Runner-up, 2006 Arab American National Museum ..."
3,"""Exterminate All the Brutes"": One Man's Odysse...","""Exterminate All the Brutes"" is a searching ex..."
4,"""Humanism - The Whore of Babylon and the Sleep...","���.., Awake thou that sleepest, and arise fro..."
...,...,...
48478,동물 농장,영국 작가의 세계적인 장편소설. 인간에게 착취 당하던 동물들이 인간을 내쫓고 동물농...
48479,신의 탑 1,What do you desire? Fortune? Glory? Power? Rev...
48480,오만과 편견,"Korean translation of ""Pride and Prejudice"""
48481,초조한 마음,인간 본성에 대한 분석이 돋보이는 츠바이크의 유일한 장편!세계적인 전기 작가이자 심...


In [16]:
dataset.isna().sum()

book_title    0
book_desc     0
dtype: int64

# Don't run

In [18]:
import pandas as pd
from time import sleep
from langdetect import detect, DetectorFactory
from googletrans import Translator
from tqdm import tqdm
import re
import random

# Ensure consistent language detection
DetectorFactory.seed = 0

def translate_non_english_rows(dataset, text_columns, retries=3, delay=2):
    """
    Translates non-English rows in the specified text columns to English with retry mechanism and progress bar.
    """
    translator = Translator()

    def is_time_format(text):
        """Returns True if the text matches a time format (e.g., '3:59')."""
        return bool(re.match(r"^\d{1,2}:\d{2}$", text))

    def translate_text(text):
        if not isinstance(text, str) or not text.strip():
            return text  # Return original if text is None, empty, or invalid

        # Skip numeric, time-like (e.g., "3:59"), or non-linguistic texts (e.g., dates, percentages)
        if text.replace(".", "").replace("/", "").isdigit() or "%" in text or is_time_format(text):
            return text

        if text.startswith("#"):  # Skip hashtags
            return text

        # Adding check to skip None type or other invalid entries before translating
        if text is None or text.strip() == "":
            return text

        for attempt in range(retries):
            try:
                lang = detect(text)
                if lang != 'en':  # Detect language
                    return translator.translate(text, dest='en').text
                return text  # Return original if already in English
            except Exception as e:
                if attempt < retries - 1:
                    sleep(delay * (2 ** attempt) + random.uniform(0, 2))  # Exponential backoff with jitter
                else:
                    print(f"Translation failed for text: {text}\nError: {e}")
                    return text  # Return original on repeated failure


    # Combine the columns for translation
    combined_columns = dataset[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Apply the translation function with a progress bar
    tqdm.pandas(desc="Translating texts")
    translated_combined = combined_columns.progress_apply(translate_text)

    # Split the translated text back into the individual columns
    translated_split = translated_combined.str.split(' ', n=1, expand=True)
    dataset[text_columns[0]] = translated_split[0]
    dataset[text_columns[1]] = translated_split[1]

    return dataset

In [20]:
# Apply the translation for both book_title and book_desc
dataset = translate_non_english_rows(dataset=dataset, text_columns=['book_title', 'book_desc'])
print('done')


Translating texts:   5%|▍         | 2190/48483 [04:37<4:38:19,  2.77it/s]

Translation failed for text: Aisyah the True Beauty Sejarah mencatatkan, para sahabat banyak belajar berkenaan masalah agama dan hukum fikah daripada Aisyah. Ini kerana, pengetahuannya sangat luas dalam bidang itu. Aisyah menjadi sumber yang tidak diragukan dalam bidang ilmu pengetahuan, masalah agama serta Quran dan sunah. Di samping itu, beliau diketahui sangat berhati-hati semasa mengulas sebarang masalah peribadi. Di samping mengetahui banyak rahsia peribadi Nabi Muhammad, Aisyah juga mengisahkan banyak hadis yang didengarnya daripada beliau secara langsung.Buku ini adalah karya Sulaiman an-Nadawi, ulama terkemuka dari Universiti Nadwatul Ulama. Beliau adalah guru kepada Syeikh Abul Hasan Ali al-Hasani an-Nadawi. Keistimewaan buku ini terletak pada kajiannya berkenaan pengetahuan serta sejarah hidup Aisyah Ummul Mukminin. Aisyah adalah sumber rujukan intelektual para sahabat terkemuka.Sejarah mencatatkan, para sahabat banyak belajar berkenaan masalah agama dan hukum fikah daripada 

Translating texts:   5%|▍         | 2240/48483 [05:03<8:25:01,  1.53it/s]

Translation failed for text: Al Muwaththa’ Jilid 2 Penulis : Imam Malik bin AnasPenerbit : Pustaka AzzamBiografi Imam MalikSejak lahir diberi nama “Malik” putra Anas Ibnu Malik (panggilan sehari-harinya Abu ‘Amir) al-Ashbahi (nama dinasti raja-raja yang pernah bertahta di Yaman) al-Himyari. Datuk Imam Malik tergolong sahabat besar, reputasi kemiliterannya mencakup seluruh ghazwah yang dipimpin langsung oleh Nabi/Rasulullah SAW selain perang Badar. Anas ayah kandungnya tergolong tabi’in sendor. Beliau adalah seorang di antara keempat pemikul keranda jenazah Khalifah Utsman bin Affan pada malam pemakamannya.Malik dilahirkan pada tahun 93 hijrah dari rahim Ibu yang mengandungnya selama 3 (tiga) tahun. Imam Malik kelak akan dikenal dengan sebutan “Imamu Daril-hijrah”, lantaran lahir dan meninggal serta aktif mengabdikan seluruh karier keulamaannya di Madinah tempat hijrah Rasulullah SAW. Beliau wafat pada tanggal 11 Rabi’ul-awal 179 H. dalam usia 87 tahun yang sebagian besar masa hidupnya 

Translating texts:   7%|▋         | 3236/48483 [07:31<4:59:09,  2.52it/s]

Translation failed for text: Araf Yalnızlık, yabancılık, dil ve zaman üzerine bir roman...Kim gerçek yabancı - bir ülkede yaşayıp başka bir yere ait olduğunu bilen mi, yoksa kendi ülkesinde yabancı hayatı sürüp, ait olacak başka bir yeri de olmayan mı?İsimlerin yabancı memleketlere ayak uydurma sürecinde muhakkak bir şeyler eksilir - bazen bir nokta, bazen bir harf ya da vurgu. Yabancının isminin başına gelenler pişmiş tavuğun olmasa da pişmiş ıspanağın başına gelenlere benzer - ana malzemeye yeni bir tat eklenmesine eklenmiştir de kalıpta gözle görülür bir çekme olmuştur bu arada. Yabancı işte ilk bu fireyi vermeyi öğrenir. Yabancı bir ülkede yaşamının birinci icabı insanın en aşina olduğu şeye, ismine yabancılaşmasıdır.(Arka Kapak)"Ne hoş tesadüf," dedi Debra zoraki bir neşeyle, "çok sevindim. Ne kadar iyi bir ahçı olduğunu biliyorum." Değişik bir hali vardı, okuma grubundakinden çok daha kendine güvenli görünüyordu. "Senin için bir sakıncası yoksa çalışmaya hemen başlayalım, zaman g

Translating texts:  38%|███▊      | 18337/48483 [49:09<21:33:37,  2.57s/it]

Translation failed for text: Kopassus untuk Indonesia Menyamar Pedagang Durian hingga Sniper AmbonIsi buku Kopassus untuk Indonesia yang diluncurkan Kopassus TNI-AD tak sembarangan. Buku dengan desain gaul itu membuka rahasia dapur korps terbaik ketiga di dunia itu, termasuk operasi intelijen bawah tanah. Seperti apa?RIDLWAN HABIB, JakartaWANITA itu bukan tentara. Gaya pakaiannya juga santai. Turun dari mobil New Honda City metalik, dia disambut hormat oleh prajurit Kopassus. “Mbak Esti ini sudah kami anggap bagian dari keluarga,” kata Letkol Farid Makruf yang menyambut Esti di Markas Komando Kopassus, Cijantung, Jakarta Timur Kamis lalu (7/1).Siang itu suasana sekitar Kesatrian Kopassus agak lengang. Sebab, pada jam dinas, semua prajurit sibuk dengan tugas masing-masing. “Sebelum mengenal mereka, saya benar-benar awam dengan dunia militer,” kata Esti yang sengaja berkunjung ke Kopassus untuk menemui Jawa Pos.Nama lengkapnya Erastiani Asikin Natanegara. Bersama penulis lain, Iwan Santo

Translating texts:  45%|████▍     | 21663/48483 [1:07:41<3:00:17,  2.48it/s]

Translation failed for text: Mikael Karvajalka Mikael Karvajalan nuoruus ja merkilliset seikkailut monessa maassa vuoteen 1527 asti kymmenenä kirjana hänen itsensä vilpittömästi kertomina.
Error: 'NoneType' object is not iterable


Translating texts:  57%|█████▋    | 27694/48483 [1:24:02<2:28:48,  2.33it/s]

Translation failed for text: Sadist Çok ünlü bir yazardı, ama bir gün, hayatta kalabilmek için kitap yazması gerekeceğini hiç düşünmemişti...
Error: 'NoneType' object is not iterable


Translating texts:  63%|██████▎   | 30766/48483 [1:29:37<10:11, 28.97it/s]

Translation failed for text: Supernova: Akar Talita LunaKesejatian hidup ada pada batu kerikil yang tertendang ketika kau melangkahmenyusuri jalan. Kesejatian hidup ada pada selembar daun kering yang gugurtertiup angin. Kesejatian hidup ada air susu ibu yang yang merelakan putingpayudaranya diisap oleh bayi manapun. DiVihara Pit Yong Kiong, Pasuruan, di pelabuhan Belawan, di Penang, diBangkok, di Laos, di Golden Triangle, di Cambodia, di Bandung, dimanapun kau hidup.Tapi, dia mungkin tak terlihat pada arus politik yang menyudutkanmu padapilihan kedigdayaan. Dia menyembunyikan diri dari teriakan-teriakan yang menggemakan perubahan. Kesejatian hidup tak memerlukanperubahan, namun juga tak menampiknya. Dia rebah pada semua kesederhanaanyang ada di sekelilingmu. Maka, carilah, dan kamu akan mendapatinya.Ketuklah, maka pintu akan dibukakan bagimu. Mintalah, maka kau akan diberi.Demikianlah Dewi Lestari mewakilkan sebuah upaya pencarian kesejatian hiduppada seorang tokoh bernama Bodhi. Seora

Translating texts:  65%|██████▍   | 31273/48483 [1:31:13<1:53:38,  2.52it/s]

Translation failed for text: Tara de sub fluturi The book could thrill all the readers aware by the economical and moral crisis we face. It is a dystopia where a country in economical crisis cannot function anymore and the government decides to sell it to a consortium of countries. The population loses the houses due to mortgages and the government decides to allocate tents installed in the parks to accommodate the desperate families. The life in such conditions is described and a story of a family living in such conditions is developed. Finally the country is sold and the consortium decides to organize the country as a collector of the sewages, wastage and garbage coming from the rest of the continent’s countries. There is an important political person in the book, who decides to cover all this abjectness with huge colored and perfumed butterflies…The book is a mirror over a world in distress…-----------------------------------------------Andreea Toma (http://andreeaiuliatoma.blogspot

Translating texts:  97%|█████████▋| 47019/48483 [1:52:49<1:07:10,  2.75s/it]

Translation failed for text: ржорзЗржШ ржмрж▓рзЗржЫрзЗ ржпрж╛ржм ржпрж╛ржм ржлрзНрж▓рзНржпрж╛ржкрзЗ рж▓рзЗржЦ ржХрж┐ржЫрзБ ржХржерж╛ржГржорзЗржШ ржмрж▓рзЗржЫрзЗ ржпрж╛ржмрзЛредржЖржХрж╛рж╢рзЗрж░ ржорзЗржШрзЗрж░рж╛ ржХрж┐ ржХржерж╛ ржмрж▓рзЗ? рждрж╛рж░рж╛ ржХрж┐ ржпрзЗрждрзЗ ржЪрж╛рзЯ ржХрзЛржерж╛ржУ? рждрж╛рж░рж╛ ржХрзЛржерж╛рзЯ ржпрзЗрждрзЗ ржЪрж╛рзЯ? ржмрж░рзНрж╖рж╛ржи ржШржи ржХрж╛рж▓рзЛ ржЖржХрж╛рж╢рзЗрж░ ржжрж┐ржХрзЗ рждрж╛ржХрж┐рзЯрзЗ ржЪрж┐рждрзНрж░рж▓рзЗржЦрж╛рж░ рж╣ржарж╛рзО ржПржЗ ржХржерж╛ ржоржирзЗ рж╣рж▓ред ржжрж╢-ржмрж╛рж░ ржмржЫрж░рзЗрж░ ржХрж┐рж╢рзЛрж░рзАрж░ ржоржирзЗ ржПрж░ рж░ржХржо ржПржХржЯрж╛ ржЪрж┐ржирзНрждрж╛ ржЖрж╕рждрзЗ ржкрж╛рж░рзЗ, ржЪрж┐рждрзНрж░рж▓рзЗржЦрж╛рж░ ржмрзЯрж╕ ржкржБржЪрж┐рж╢ред ржП рж░ржХржо ржЙржжрзНржнржЯ рждрж╛рж░ ржЬржирзНржпрзЗ рж╕рзНржмрж╛ржнрж╛ржмрж┐ржХ ржирзЯред рждржмрзБржУ ржХрзЗржи ржЬрж╛ржирж┐ ржирж┐ржЬрзЗржХрзЗ рждрж╛рж░ ржорзЗржШрзЗрж░ ржорждрзЛ ржоржирзЗ рж╣рзЯред рждрж╛рж░ ржХрзЛржерж╛рзЯ ржЬрж╛ржирж┐ ржпрзЗрждрзЗ ржЗржЪрзНржЫрж╛ 

Translating texts:  97%|█████████▋| 47134/48483 [1:54:46<1:12:22,  3.22s/it]

Translation failed for text: اشتراكية الإسلام بين الطبعة الأولى 1959م والثانية 1960م لكتابه "اشتراكية الإسلام"، ذكر السباعي أن في الطبعة الثانية زيادات. يقول : " امتازت الطبعة الثانية بتحقيقات مهمة، وزيادات كثيرة، وأمثلة عديدة من الواقع التاريخي "." كنت أود أن أتوسع في بحث الواقع التاريخي في الدولة والمجتمع والفرد المسلم، لولا ضيق المجال، كما أنني لم أتحدث عن حركة أبي ذر التي قام بها في عهد عثمان، رضي الله عنهما، لأني لم أستكمل بعد دراسة أسبابها وحقيقتها، وتمحيص النصوص التاريخية الواردة بشأنها، بالشكل الذي أطمئن إليه وأقتنع به. وأيضًا لم أتعرض لبعض الحركات السياسية التي قامت في العصر العباسي، واتخذت شكلاً فوضويًا شيوعيًا، كحركة القرامطة. أرجو أن أضيف هذه الأبحاث كلها، مع التوسع في كثير مما أجملته في هذه الطبعة، إلى الطبعة القادمة بإذن الله "." كنت أرغب ألا يعاد طبع هذا الكتاب للمرة الثانية إلا بعد أن أكون قد انتهيت من كل الأبحاث التي وعدت بتحقيقها في الطبعة الأولى، ولكن الكتاب ما كاد يخرج إلى الأيدي حتى نفدت نسخه بعد أشهر قلائل، ثم ازداد الطلب على ناشريه ازديادًا كبيرًا، مما اضطرني إلى

Translating texts:  97%|█████████▋| 47167/48483 [1:55:30<1:17:47,  3.55s/it]

Translation failed for text: الإسلام وأصول الحكم (1) أن يذهب باحث إلى أن النبي عليه السلام كان رسولاً وملكاً، وليس بدعاً ولا شذوذاً أن يخالف في ذلك مخالف، فذلك بحث خارج عن دائرة العقائد الدينية التي تعارف العلماء بحثها، واستقر لهم فيها مذهب.(2) أنت تعلم أن الرسالة غير الملك، وأنه ليس بينهما شيء من التلازم بوجه من الوجوه، وأن الرسالة مقام والملك مقام آخر، فكم من ملك ليس نبياً ولا رسولاً، وكم لله جل شأنه من رسل لم يكونوا ملوكاً. بل أن أكثر من عرفنا من الرسل إنما كانوا رسلاً فحسب.ولقد كان عيسى بن مريم عليه السلام رسول الدعوة المسيحية، وزعيم المسيحيين، وكان مع هذا يدعو إلى الإذعان لقيصر، ويؤمن بسلطانه. وهو الذي أرسل بين أتباعه تلك الكلمة البالغة "أعطوا ما لقيصر لقيصر، ولله لله".وكان يوسف بن يعقوب عليه السلام، عاملاً من العمال، في دولة فرعون مصر.ولا نعرف في تاريخ الرسل من جمع الله له بين الرسالة والملك، إلا قليلاً.فهل كان محمد صلى الله تعالى عليه وسلم ممن جمع الله له بين الرسالة والملك، أم كان رسولاً غير ملك؟(3) لا نعرف لأحد من العلماء رأياً صريحاً في ذلك البحث ولا نجد من تعرض للكلام فيه، ب

Translating texts:  98%|█████████▊| 47281/48483 [1:57:41<53:31,  2.67s/it]

Translation failed for text: الصوت روح في مجال المسرح و الشعر ، و فى المجال الأدبي عموما معروف .. ان هناك ادب بيتم تطويع المواضيع و الافكار له ..، و ادب بيتم تطويعه و استخدامه لعرض الافكار و المواضيع من خلالهفي المسرح مثلا عندنا تجارب لتوفيق الحكيم و لدكتور مصطفى محمود من مسرحيات هي فنيا قد تكون غير صالحة باعتراف اصحابها ذاتهم .. لكنها على مستوى عرض الموضوع و الفكرة ناجحة طبعا ..،كذلك فى الشعر .. هناك اشعار بيتم تطويع المواضيع و الأفكار لها .. و هناك اشعار بيتم تطويعها و استخدامها لعرض الافكار و المواضيع و الأحداث .. الشئ اللى بيجعل الجانب الفني فيها مجور عليه و ديوان (دول العرب و عظماء الإسلام) لأمير الشعراء أحمد شوقي مثلا هو واحد من هذه الأدبيات التى جار صاحبها على الجانب الفنى فيها ، لصالح المواضيع و الافكار التى طرحها من خلال اشعاره ، و ليس ذاك لمشكلة ما او نقص في حس الشاعر او موهبته و انما ذاك لثقل و تعقيد وكثافة التجربة التي مر بها الشاعر و التي أبت عليه شعريا كفكرة مجملة .. و هو احوج مايكون الى التعبير عنها مجملا مما دفع به الى ذلك الأسلوب لا ابتذالا او تهاونا و لكن رغبة منه في 

Translating texts:  99%|█████████▊| 47827/48483 [2:07:11<38:13,  3.50s/it]

Translation failed for text: علم الإجتماع الآلي قراءة في كتاب "علم الاجتماع الآلي " إن كتاب "علم الاجتماع الآلي "هو عنوان الكتاب الصادر حديثا عن "عالم المعرفة "، وهي سلسلة شهرية يصدرها المجلس الوطني للثقافة والفنون والآداب بالكويت.الكتاب في 255 صفحة، هو عبارة عن مقاربة في علم الاجتماع العربي والاتصال عبر الحاسوب، لمؤلفه الدكتور "علي محمد ميلاد بن رحومة"، وهو من مواليد مارس 1957 بطرابلس بليبيا، اختصاصي نظم الحاسوب وباحث في المعلوماتية والاجتماع الافتراضي. - يعتبر هذا الكتاب عرضا جامعا لعلم جديد يدرس الإنسان في علاقته مع الحاسوب والحاسوب مع الإنسان وكيف يكون التفاعل بين الكيانين واحد طبيعي والآخر صناعي. إن كتاب "علم الاجتماع الآلي "هو عنوان الكتاب الصادر حديثا عن "عالم المعرفة "، وهي سلسلة شهرية يصدرها المجلس الوطني للثقافة والفنون والآداب بالكويت. الكتاب في 255 صفحة، هو عبارة عن مقاربة في علم الاجتماع العربي والاتصال عبر الحاسوب، لمؤلفه الدكتور "علي محمد ميلاد بن رحومة"، وهو من مواليد مارس 1957 بطرابلس بليبيا، اختصاصي نظم الحاسوب وباحث في المعلوماتية والاجتماع الافتراضي، حاز على العديد 

Translating texts: 100%|█████████▉| 48403/48483 [2:16:34<04:11,  3.15s/it]

Translation failed for text: ከአድማስ ባሻገር ከአድማስ ባሻገር
Error: No features in text.


Translating texts: 100%|█████████▉| 48404/48483 [2:16:43<06:05,  4.62s/it]

Translation failed for text: የተቆለፈበት ቁልፍ የታሪኩ ዋነኛ ገጸባህርያትን ተከትሎ በብዙ ሰዎች እና ቤተሰቦች ዙሪያ የሚያጠነጥነው ይህ ታሪክ በኢትዮጵያና በአሜሪካ የተከወነ ሲሆን በውስጡም ፍቅርና-ቅናት፣ ክፋትና-ደግነት፣ ጭካኔና-ርህራሄ፣ ድህነትና-ስኬት ይንጸባረቁበታል፣ ይፈራረቁበታል። ከገጠር እስከ ከተማ፣ ከቤት ቤት ይዞራል፧፧ በታሪኩ ዉስጥ የተቆላለፉ ጉዳዮች እስኪፈቱ አንባቢ ከራሱ አእምሮና አስተሳሰብ ጋር ፊትለፊት ይላተማል፧፧ የሰው ኑሮው፣ የቤተሰብ ደረጃውና የአገር ልኩ ከአስተሳሰብ እንደማይዘልና አስተሳሰብም ሳይቀየር የሰውን ኑሮ የሀገርንም መልክ መቀየር እንደማይቻል ማሳየት ብቻ ሳይሆን መንገድና አቅጣጫንም ይጠቁማል።
Error: No features in text.


Translating texts: 100%|█████████▉| 48405/48483 [2:16:51<07:10,  5.52s/it]

Translation failed for text: ፍቅር እስከ መቃብር ደራሲ ሐዲስ አለማየሁ የኖሩበትን የፊውዳሉን ዘመን ባህል፣ አስተሳሰብ እንዲሁም የአኗኗር ዘይቤ በዘመኑ ያልተፈጠረን አንባቢን ጭምር በምናቡ ዘመኑን እየሳለ ያልኖረበትን ዘመን ጣእም እንዲያጣጥም ያስቻለበት ድንቅ የፍቅር መጽሐፍ ነው። በኢትዮጵያ ዘመናዊ የአማርኛ የልቦለድ ስነጽሑፍ ታሪክ በብዙ ጸሐፍት ቁንጮ ስፍራን የያዘም ነው።
Error: No features in text.


Translating texts: 100%|██████████| 48483/48483 [2:17:52<00:00,  5.86it/s]

done





# This Cells

In [16]:
dataset

Unnamed: 0,book_title,book_desc
0,"""Break the Casanova's Heart"" Operation",Operation Break the Casanova's Heart10 Steps t...
1,"""El Aleph"" de Jorge Luis Borges",De los muy pocos manuscritos que se conservan ...
2,"""Evil"" Arabs in American Popular Film: Orienta...","Runner-up, 2006 Arab American National Museum ..."
3,"""Exterminate All the Brutes"": One Man's Odysse...","""Exterminate All the Brutes"" is a searching ex..."
4,"""Humanism - The Whore of Babylon and the Sleep...","���.., Awake thou that sleepest, and arise fro..."
...,...,...
48478,동물 농장,영국 작가의 세계적인 장편소설. 인간에게 착취 당하던 동물들이 인간을 내쫓고 동물농...
48479,신의 탑 1,What do you desire? Fortune? Glory? Power? Rev...
48480,오만과 편견,"Korean translation of ""Pride and Prejudice"""
48481,초조한 마음,인간 본성에 대한 분석이 돋보이는 츠바이크의 유일한 장편!세계적인 전기 작가이자 심...


In [23]:
# dataset.to_csv('translated_dataset.csv', index=False)

In [27]:
dataset = pd.read_csv('translated_dataset.csv')
nontranslated = pd.read_csv('books_data/books.csv')

In [28]:
dataset.shape

(48483, 2)

In [29]:
nontranslated.shape

(54301, 12)

In [30]:
nontranslated.drop(columns=['book_format', 'book_authors', 'book_edition', 'book_isbn','book_pages','book_rating','book_rating_count','book_review_count','genres','image_url'], inplace=True)
nontranslated['book_desc'].fillna(nontranslated['book_title'], inplace=True)
nontranslated = nontranslated.drop_duplicates(subset='book_title', keep='first').reset_index(drop=True)
nontranslated = nontranslated.groupby('book_title', as_index=False).agg({
    'book_desc': ' '.join  # Combine blurbs
})
nontranslated.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  nontranslated['book_desc'].fillna(nontranslated['book_title'], inplace=True)


(48483, 2)

In [31]:
dataset['book_title'] = nontranslated['book_title']
dataset['book_original_desc'] = nontranslated['book_desc']

dataset

Unnamed: 0,book_title,book_desc,book_original_desc
0,"""Break the Casanova's Heart"" Operation","the Casanova's Heart"" Operation Operation Brea...",Operation Break the Casanova's Heart10 Steps t...
1,"""El Aleph"" de Jorge Luis Borges","Aleph"" by Jorge Luis Borges of the very few ma...",De los muy pocos manuscritos que se conservan ...
2,"""Evil"" Arabs in American Popular Film: Orienta...",Arabs in American Popular Film: Orientalist Fe...,"Runner-up, 2006 Arab American National Museum ..."
3,"""Exterminate All the Brutes"": One Man's Odysse...","All the Brutes"": One Man's Odyssey into the He...","""Exterminate All the Brutes"" is a searching ex..."
4,"""Humanism - The Whore of Babylon and the Sleep...",- The Whore of Babylon and the Sleeping Church...,"���.., Awake thou that sleepest, and arise fro..."
...,...,...,...
48478,동물 농장,Farm British writer's world -class novel.A fea...,영국 작가의 세계적인 장편소설. 인간에게 착취 당하던 동물들이 인간을 내쫓고 동물농...
48479,신의 탑 1,탑 1 What do you desire? Fortune? Glory? Power?...,What do you desire? Fortune? Glory? Power? Rev...
48480,오만과 편견,"편견 Korean translation of ""Pride and Prejudice""","Korean translation of ""Pride and Prejudice"""
48481,초조한 마음,"only feature of Tsubiki, which shows an analys...",인간 본성에 대한 분석이 돋보이는 츠바이크의 유일한 장편!세계적인 전기 작가이자 심...


In [32]:
dataset.shape

(48483, 3)

In [33]:
def improve_incomplete_or_short_texts(dataset, text_column, title_column, min_length=20):
    """
    Improves rows with incomplete, short, or gibberish text by augmenting with title or inferred data.

    Parameters:
    - dataset (pd.DataFrame): The input DataFrame.
    - text_column (str): Column name containing text to improve.
    - title_column (str): Column name containing the book title.
    - min_length (int): Minimum length of text to improve.

    Returns:
    - pd.DataFrame: DataFrame with improved text.
    """
    def improve_text(row):
        text = row[text_column]
        title = row[title_column]

        # Handle NaN or missing text
        if pd.isna(text):
            return title if title else "No description available"

        # If text is too short, augment it with the title
        if len(text) < min_length:
            text = f"{title}. {text}" if title else text

        # Handle other cases like gibberish or incomplete patterns
        if re.search(r'\.{3,}', text):  # If text has ellipses
            text = re.sub(r'\.{3,}', ".", text)  # Replace ellipses with a period

        if re.search(r'[^a-zA-Z0-9\s.,!?\'"-]', text):  # If non-alphanumeric content exists
            text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"-]', "", text)  # Remove invalid characters

        return text

    # Apply the improvement to the dataset
    dataset[text_column] = dataset.apply(improve_text, axis=1)
    return dataset


In [34]:
dataset = improve_incomplete_or_short_texts(
    dataset,
    text_column='book_desc',
    title_column='book_title',
    min_length=20
)

In [35]:
dataset.shape

(48483, 3)

In [36]:
dataset

Unnamed: 0,book_title,book_desc,book_original_desc
0,"""Break the Casanova's Heart"" Operation","the Casanova's Heart"" Operation Operation Brea...",Operation Break the Casanova's Heart10 Steps t...
1,"""El Aleph"" de Jorge Luis Borges","Aleph"" by Jorge Luis Borges of the very few ma...",De los muy pocos manuscritos que se conservan ...
2,"""Evil"" Arabs in American Popular Film: Orienta...",Arabs in American Popular Film Orientalist Fea...,"Runner-up, 2006 Arab American National Museum ..."
3,"""Exterminate All the Brutes"": One Man's Odysse...","All the Brutes"" One Man's Odyssey into the Hea...","""Exterminate All the Brutes"" is a searching ex..."
4,"""Humanism - The Whore of Babylon and the Sleep...",- The Whore of Babylon and the Sleeping Church...,"���.., Awake thou that sleepest, and arise fro..."
...,...,...,...
48478,동물 농장,Farm British writer's world -class novel.A fea...,영국 작가의 세계적인 장편소설. 인간에게 착취 당하던 동물들이 인간을 내쫓고 동물농...
48479,신의 탑 1,1 What do you desire? Fortune? Glory? Power? ...,What do you desire? Fortune? Glory? Power? Rev...
48480,오만과 편견,"Korean translation of ""Pride and Prejudice""","Korean translation of ""Pride and Prejudice"""
48481,초조한 마음,"only feature of Tsubiki, which shows an analys...",인간 본성에 대한 분석이 돋보이는 츠바이크의 유일한 장편!세계적인 전기 작가이자 심...


In [37]:
dataset['Cleaned_Blurb'] = dataset['book_desc'].apply(preprocess_text)

In [38]:
dataset['book_original_title'] = dataset['book_title']
dataset['book_title'] = dataset['book_title'].apply(preprocess_text)

In [39]:
dataset.head()

Unnamed: 0,book_title,book_desc,book_original_desc,Cleaned_Blurb,book_original_title
0,break casanova heart operation,"the Casanova's Heart"" Operation Operation Brea...",Operation Break the Casanova's Heart10 Steps t...,casanova heart operation operation break casan...,"""Break the Casanova's Heart"" Operation"
1,el aleph de jorge luis borges,"Aleph"" by Jorge Luis Borges of the very few ma...",De los muy pocos manuscritos que se conservan ...,aleph jorge luis borges manuscript preserved j...,"""El Aleph"" de Jorge Luis Borges"
2,evil arab american popular film orientalist fear,Arabs in American Popular Film Orientalist Fea...,"Runner-up, 2006 Arab American National Museum ...",arab american popular film orientalist fear ru...,"""Evil"" Arabs in American Popular Film: Orienta..."
3,exterminate brute one man odyssey heart darkne...,"All the Brutes"" One Man's Odyssey into the Hea...","""Exterminate All the Brutes"" is a searching ex...",brute one man odyssey heart darkness origin eu...,"""Exterminate All the Brutes"": One Man's Odysse..."
4,humanism whore babylon sleeping church,- The Whore of Babylon and the Sleeping Church...,"���.., Awake thou that sleepest, and arise fro...",whore babylon sleeping church awake thou sleep...,"""Humanism - The Whore of Babylon and the Sleep..."


In [40]:
dataset.head()

Unnamed: 0,book_title,book_desc,book_original_desc,Cleaned_Blurb,book_original_title
0,break casanova heart operation,"the Casanova's Heart"" Operation Operation Brea...",Operation Break the Casanova's Heart10 Steps t...,casanova heart operation operation break casan...,"""Break the Casanova's Heart"" Operation"
1,el aleph de jorge luis borges,"Aleph"" by Jorge Luis Borges of the very few ma...",De los muy pocos manuscritos que se conservan ...,aleph jorge luis borges manuscript preserved j...,"""El Aleph"" de Jorge Luis Borges"
2,evil arab american popular film orientalist fear,Arabs in American Popular Film Orientalist Fea...,"Runner-up, 2006 Arab American National Museum ...",arab american popular film orientalist fear ru...,"""Evil"" Arabs in American Popular Film: Orienta..."
3,exterminate brute one man odyssey heart darkne...,"All the Brutes"" One Man's Odyssey into the Hea...","""Exterminate All the Brutes"" is a searching ex...",brute one man odyssey heart darkness origin eu...,"""Exterminate All the Brutes"": One Man's Odysse..."
4,humanism whore babylon sleeping church,- The Whore of Babylon and the Sleeping Church...,"���.., Awake thou that sleepest, and arise fro...",whore babylon sleeping church awake thou sleep...,"""Humanism - The Whore of Babylon and the Sleep..."


In [41]:
dataset.to_csv('td.csv')

In [42]:
dataset.shape

(48483, 5)

In [32]:
dataset

Unnamed: 0,book_title,book_desc,Cleaned_Blurb,book_original_title
0,break casanova heart operation,"the Casanova's Heart"" Operation Operation Brea...",casanova heart operation operation break casan...,"""Break the Casanova's Heart"" Operation"
1,el aleph de jorge luis borges,"Aleph"" by Jorge Luis Borges of the very few ma...",aleph jorge luis borges manuscript preserved j...,"""El Aleph"" de Jorge Luis Borges"
2,evil arab american popular film orientalist fear,Arabs in American Popular Film Orientalist Fea...,arab american popular film orientalist fear ru...,"""Evil"" Arabs in American Popular Film: Orienta..."
3,exterminate brute one man odyssey heart darkne...,"All the Brutes"" One Man's Odyssey into the Hea...",brute one man odyssey heart darkness origin eu...,"""Exterminate All the Brutes"": One Man's Odysse..."
4,humanism whore babylon sleeping church,- The Whore of Babylon and the Sleeping Church...,whore babylon sleeping church awake thou sleep...,"""Humanism - The Whore of Babylon and the Sleep..."
...,...,...,...,...
48478,동물 농장,Farm British writer's world -class novel.A fea...,farm british writer world class novel feature ...,동물 농장
48479,신의 탑 1,1 What do you desire? Fortune? Glory? Power? ...,1 desire fortune glory power revenge something...,신의 탑 1
48480,오만과 편견,"Korean translation of ""Pride and Prejudice""",korean translation pride prejudice,오만과 편견
48481,초조한 마음,"only feature of Tsubiki, which shows an analys...",feature tsubiki show analysis nervous heart hu...,초조한 마음


In [33]:
dataset.shape

(48483, 4)

In [34]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 3),
    sublinear_tf=True
)

tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['Cleaned_Blurb'])

# Create the recommend Function

In [35]:
def recommend_books_by_blurb(user_blurb, top_n=5):
    """
    Recommends books based on the similarity of the user blurb to the dataset.

    Parameters:
    - user_blurb (str): The input blurb provided by the user.
    - top_n (int): Number of recommendations to return.

    Returns:
    - list: A list of recommended books with their titles and descriptions.
    """
    # Preprocess the user blurb
    preprocessed_blurb = preprocess_text(user_blurb)
    if not preprocessed_blurb.strip():
        return "The input blurb is empty after preprocessing. Please provide a valid input."

    # Vectorize the user's blurb
    user_blurb_vector = tfidf_vectorizer.transform([preprocessed_blurb])
    similarity_scores = cosine_similarity(user_blurb_vector, tfidf_matrix).flatten()

    # Find top N similar books
    similar_indices = similarity_scores.argsort()[-top_n:][::-1]

    if similar_indices.size == 0:
        return "No similar books found."

    return dataset.iloc[similar_indices][['book_original_title', 'book_desc']].to_dict(orient='records')


In [36]:
user_blurb = "A young wizard discovers his magical heritage on his 11th birthday and attends a magical school.And his name is Harry Potter."
print("Recommended Books:")
recommendations = recommend_books_by_blurb(user_blurb)

if isinstance(recommendations, str):
    print(recommendations)  # Error or no recommendations
else:
    for rec in recommendations:
        print(f"Title: {rec['book_original_title']}\nBlurb: {rec['book_desc']}\n")


Recommended Books:
Title: Harry Potter and the Order of the Phoenix (Harry Potter, #5, Part 1)
Blurb: Potter and the Order of the Phoenix Harry Potter, 5, Part 1 Harry Potter and the Order of the Phoenix Harry Potter, 5, Part 1

Title: Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)
Blurb: Potter Boxed Set, Books 1-5 Harry Potter, 1-5 Box Set containing Harry Potter and the Sorcerer's Stone, Harry Potter and the Chamber Of Secrets, Harry Potter and the Prisoner of Azkaban, Harry Potter and the Goblet Of Fire, and Harry Potter and the Order of the Phoenix!

Title: The Harry Potter trilogy
Blurb: Harry Potter trilogy This box set collects hard cover editions Harry Potter and the Philosopher's Stone, Harry Potter and the Chamber of Secrets, and Harry Potter and the Prisoner of Azkaban in a slip case.

Title: Harry Potter et les Reliques de la Mort
Blurb: Potter and the death relics and here is the seventh and ultimate volume of the heroic story of Harry Potter.

Title: Harry Potter

In [37]:
dataset[dataset['book_original_title'].str.contains('Harry Potter', case=False, na=False)]

Unnamed: 0,book_title,book_desc,Cleaned_Blurb,book_original_title
2835,unofficial muggle guide wizarding world explor...,Unofficial Muggle's Guide to the Wizarding Wor...,unofficial muggle guide wizarding world explor...,An Unofficial Muggle's Guide to the Wizarding ...
11730,fact fiction folklore harry potter world unoff...,"Fiction, and Folklore in Harry Potter's World ...",fiction folklore harry potter world unofficial...,"Fact, Fiction, and Folklore in Harry Potter's ..."
14518,harry potter boxed set book 1 5 harry potter 1 5,"Potter Boxed Set, Books 1-5 Harry Potter, 1-5 ...",potter boxed set book 1 5 harry potter 1 5 box...,"Harry Potter Boxed Set, Books 1-5 (Harry Potte..."
14519,harry potter boxset,"Potter Boxset Now for the first time ever, J.K...",potter boxset first time ever j k rowlings sev...,Harry Potter Boxset
14520,harry potter collection,"Potter Collection Six years of magic, adventur...",potter collection six year magic adventure mys...,Harry Potter Collection
14521,harry potter schoolbook box set two classic bo...,Potter Schoolbooks Box Set Two Classic Books f...,potter schoolbook box set two classic book lib...,Harry Potter Schoolbooks Box Set: Two Classic ...
14522,harry potter series box set,Potter Series Box Set Over 4000 Pages of Harry...,potter series box set 4000 page harry potter w...,Harry Potter Series Box Set
14523,harry potter a tajemná komnata,Potter and the Mysterious Chamber Welcome a se...,potter mysterious chamber welcome second time ...,Harry Potter a Tajemná komnata
14524,harry potter chamber secret,Potter and the Chamber of Secrets The Dursleys...,potter chamber secret dursleys mean hideous su...,Harry Potter and the Chamber of Secrets
14525,harry potter cursed child part ii,Potter and the Cursed Child - Parts I II Base...,potter cursed child part ii based original new...,Harry Potter and the Cursed Child - Parts I & II


# Try Transformers

In [38]:
from sentence_transformers import SentenceTransformer, util

# Load a pretrained transformer model (e.g., multilingual model for multi-language support)
model = SentenceTransformer('all-mpnet-base-v2')  # Replace with a multilingual model like 'paraphrase-multilingual-mpnet-base-v2'
print('imported')
# Encode the blurbs in your dataset
dataset['Embedding'] = dataset['Cleaned_Blurb'].apply(lambda x: model.encode(x, convert_to_tensor=True))


  from .autonotebook import tqdm as notebook_tqdm



imported


In [43]:
def recommend_books_transformer(user_blurb, top_n=5):
    user_embedding = model.encode(user_blurb, convert_to_tensor=True)
    results = []
    for idx, row in dataset.iterrows():
        score = util.pytorch_cos_sim(user_embedding, row['Embedding'])[0][0].item()
        results.append((idx, score))
    results = sorted(results, key=lambda x: x[1], reverse=True)[:top_n]
    return [
        {
            'book_original_title': dataset.iloc[idx]['book_original_title'],
            'book_desc': dataset.iloc[idx]['book_desc'],
            'Similarity': score,
        }
        for idx, score in results
    ]

In [45]:
user_blurb = "A young wizard discovers his magical heritage on his 11th birthday and attends a magical school.And his name is Harry Potter."
print("Recommended Books:")
recommendations = recommend_books_transformer(user_blurb)

if isinstance(recommendations, str):
    print(recommendations)  # Error or no recommendations
else:
    for rec in recommendations:
        print(f"Title: {rec['book_original_title']}\nBlurb: {rec['book_desc']}\n")


Recommended Books:
Title: Magic Beginnings (Magical Girls Academy, #1)
Blurb: Beginnings Magical Girls Academy, 1 Thousands of years in the future, teens with magical powers go to a special and wonderful boarding school to learn how to fight evil. Battle monsters, fall in love, and create your own unique reading experience!

Title: Harry Potter a Tajemná komnata
Blurb: Potter and the Mysterious Chamber Welcome a second time at Hogwarts School of Witches and Magic.Harry returns to school after the holidays and has no idea that this time he is in danger of fatal danger.In Hogwarts, strange things start to happen in which blood in the veins solidifies, and none of the pupils know that the ancient legend has become a reality.

Title: See You at Harry's
Blurb: You at Harry's Starting middle school brings all the usual challenges  until the unthinkable happens, and Fern and her family must find a way to heal.Twelve-year-old Fern feels invisible. It seems as though everyone in her family has 