In [None]:
from googlesearch import search
import pandas as pd
import numpy as np
import glob
import json
from requests import get
import os
import bs4
from datetime import datetime
from time import sleep
import traceback
import string
from unidecode import unidecode
import re

import jellyfish

# Display up to 100 rows and every column when a DataFrame is rendered.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [None]:
def scrape_book(book_id):
    """Scrape one Goodreads book page and return its metadata as a flat dict.

    Parameters
    ----------
    book_id : int or str
        Identifier interpolated into
        ``https://www.goodreads.com/book/show/{book_id}``.

    Returns
    -------
    dict
        Keys ``book_id``, ``series``, ``title``, ``genres``, ``author``,
        ``num_pages``, ``year``, ``num_ratings``, ``num_reviews`` and
        ``average_rating``, plus one key per rating-histogram bar (named after
        the bar's ``aria-label``). Fields that cannot be located are ``None``
        (series, title, genres) or ``np.nan``; a page without a histogram
        contributes no histogram keys instead of aborting the scrape.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    def get_series_url(soup):
        # Last path segment of the series link, or None when there is none.
        series_elem = soup.find('h3', {'class': 'Text Text__title3 Text__italic Text__regular Text__subdued'})
        link = series_elem.find('a') if series_elem else None
        if link is not None and link.get('href'):
            return link['href'].split('/')[-1]
        print('    Series link not found')
        return None

    def get_title(soup):
        # Title text with runs of whitespace collapsed to single spaces.
        title_elem = soup.find('h1', {'class': 'Text Text__title1'})
        title = ' '.join(title_elem.text.split()) if title_elem else ''
        if title:
            return title
        print('    Title not found')
        return None

    def get_genres(soup):
        # Genre labels, skipping the '...more' expander button.
        genre_elems = soup.find_all('span', {'class': 'BookPageMetadataSection__genreButton'})
        if not genre_elems:
            print('    Genres not found')
            return None
        genres = []
        for span in genre_elems:
            label = span.find('span', {'class': 'Button__labelItem'})
            if label is not None and label.text != '...more':
                genres.append(label.text)
        return genres

    def get_script_data(soup):
        # Aggregate rating figures from the page's JSON-LD <script> tag.
        try:
            script_tag = soup.find('script', {'type': 'application/ld+json'})
            data = json.loads(script_tag.string)
            rating = data.get('aggregateRating', {})
            return {'num_ratings':    rating.get('ratingCount', np.nan),
                    'num_reviews':    rating.get('reviewCount', np.nan),
                    'average_rating': rating.get('ratingValue', np.nan)}
        except (AttributeError, TypeError, ValueError):
            # AttributeError: tag missing or payload is not a JSON object;
            # TypeError: tag has no string content; ValueError: invalid JSON.
            print("    Script tag not found")
            return {'num_ratings':    np.nan,
                    'num_reviews':    np.nan,
                    'average_rating': np.nan}

    def get_rating_distribution(soup):
        # Always returns a dict so the caller can unpack it with ``**``.
        distribution_dict = {}
        for bar in soup.find_all('div', {'class': 'RatingsHistogram__bar'}):
            star_label = bar.get('aria-label')
            total_elem = bar.find('div', {'class': 'RatingsHistogram__labelTotal'})
            if star_label is None or total_elem is None:
                continue
            num_ratings = total_elem.text.split(' ')[0]
            distribution_dict[star_label] = int(num_ratings.replace(',', ''))
        if not distribution_dict:
            print('    No rating bars found')
        return distribution_dict

    def get_num_pages(soup):
        # Leading integer of the "pagesFormat" paragraph, or NaN.
        pages_elem = soup.find('p', {'data-testid': 'pagesFormat'})
        if pages_elem:
            try:
                return int(pages_elem.text.split(' ')[0])
            except Exception:
                pass
        print('    No page number found')
        return np.nan

    def get_author(soup):
        # Text of the first contributor link, or NaN.
        author_elem = soup.find('span', {'class': 'ContributorLink__name'})
        if author_elem:
            return author_elem.text.strip()
        print('    No author found')
        return np.nan

    def get_year(soup):
        # Whatever follows the last comma of the publication info, or NaN.
        year_elem = soup.find('p', {'data-testid': 'publicationInfo'})
        if year_elem:
            return year_elem.text.split(',')[-1]
        print('    Year not found')
        return np.nan

    url = f'https://www.goodreads.com/book/show/{book_id}'
    response = get(url, timeout=30)
    # Fail loudly on 4xx/5xx rather than parsing an error page as a book.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    return {'book_id': book_id,
            'series': get_series_url(soup),
            'title': get_title(soup),
            'genres': get_genres(soup),
            'author': get_author(soup),
            'num_pages': get_num_pages(soup),
            'year': get_year(soup),
            **get_script_data(soup),
            **get_rating_distribution(soup)}


def condense_books(books_directory_path):
    """Load every ``*_metadata.json`` file found in a directory.

    Parameters
    ----------
    books_directory_path : str
        Directory to scan. Files are read from this directory, so it no
        longer has to be the literal ``metadata`` folder.

    Returns
    -------
    list
        The decoded JSON content of each matching file, in file-name order.
    """
    books = []
    for file_name in sorted(os.listdir(books_directory_path)):
        if not file_name.endswith('_metadata.json'):
            continue
        # Context manager so the handle is closed even if decoding fails.
        with open(os.path.join(books_directory_path, file_name), 'r') as handle:
            books.append(json.load(handle))
    return books


def delete_metadata():
    """Remove every file matching ``./metadata/*metadata.json``."""
    for metadata_path in glob.glob('./metadata/*metadata.json'):
        os.remove(metadata_path)


In [None]:
# Load the previously scraped table from disk and display it.
df = pd.read_csv('data/blinkist_scraped.csv')
df

In [None]:
# Build one (authors, titles) pair per audio file from its "<authors> - <titles>.m4a" name.
# The listing is kept in its own names so the builtin `list` is not overwritten
# for the rest of the notebook.
audio_file_names = os.listdir('/Users/alex/Downloads/Blinkist August 2023 SiteRip Collection - BASiQ')
audio_stems = [name.split('.m4a')[0] for name in audio_file_names]
# maxsplit=1: only the first ' - ' separates authors from title, so a title
# that itself contains ' - ' still yields exactly two columns.
dir_df = pd.DataFrame([stem.split(' - ', 1) for stem in audio_stems], columns=['authors', 'titles'])
# text_id reassembles the file stem and is the merge key used later on.
dir_df['text_id'] = dir_df['authors'] + ' - ' + dir_df['titles']

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    unwanted = set(string.punctuation)
    return ''.join(char for char in text if char not in unwanted)

def remove_accents(text):
    """Return the result of passing ``text`` through ``unidecode``."""
    transliterated = unidecode(text)
    return transliterated

def remove_parentheses(text):
    """Delete each ``(...)`` group (shortest match) from ``text``."""
    parenthesised = re.compile(r'\(.*?\)')
    return parenthesised.sub('', text)

In [None]:
# Search query per row, built from the normalised title and author columns.
df['query'] = df['titles2'] + ' by ' + df['authors2'] + ' goodreads.com book'
# A search hit is accepted when its title scores above this Jaro similarity.
jaro_threshold = 0.8

# Each pass walks the rows that still have book_id == 0. `toggle` flips the
# author sort direction on every pass; the loop stops when nothing is left or
# when an ascending pass scraped nothing.
toggle = True
while True:
    scrape_count = 0
    toggle = not toggle
    try:
        rest = df[df['book_id'] == 0]
        if len(rest) == 0:
            break
        counter = 1
        for idx, row in rest.sort_values(by="authors", ascending=toggle).iterrows():
            query = row['query']
            print(f"Searching for: {query} ({counter}/{len(rest)})")
            counter += 1
            start = datetime.now()
            for result in search(query, sleep_interval=5, advanced=True, num_results=50):
                print(f'    {result.url}')
                if not (result.url.startswith('https://www.goodreads.com/') and '/book/' in result.url):
                    continue

                title = result.title
                title = remove_punctuation(remove_accents(title.lower().strip()))
                file = row['titles'] + ' ' + row['authors']
                file = remove_punctuation(remove_accents(file.lower().strip()))
                # Computed once and reused for both the log line and the test.
                jaro = jellyfish.jaro_similarity(file, title)
                print(f'    "{title}", "{file}": {jaro}')

                if jaro <= jaro_threshold:
                    continue

                try:
                    # The id is the run of digits that starts the last URL
                    # segment (e.g. "12345-some-title" or "12345.Some_Title").
                    id_match = re.match(r'\d+', result.url.split('/')[-1])
                    if id_match is None:
                        print(f'    No numeric book id in {result.url}')
                        continue
                    book_id = id_match.group()

                    book = scrape_book(book_id)
                    if book:
                        # `with` closes the file before the next iteration.
                        with open(f'metadata/{book_id}_metadata.json', 'w') as metadata_file:
                            json.dump(book, metadata_file)
                        df.at[idx, 'book_id'] = int(book_id)
                        scrape_count += 1
                    else:
                        print(f'    scrape_book() returned empty')
                    # First accepted hit ends the search for this row.
                    break
                except Exception as e:
                    # Best effort: report and try the next search result.
                    print(e)
                    traceback.print_exc()
            print(f'    {datetime.now() - start}')
    except Exception as e:
        # Best effort: a failed pass is reported with its traceback, progress
        # is saved below and the loop decides whether to try again.
        print(e)
        traceback.print_exc()
    df.to_csv('data/blinkist.csv', index=False, encoding='utf-8')

    if scrape_count == 0 and toggle:
        break

In [None]:
# Write df to data/blinkist.csv without the index column.
df.to_csv('data/blinkist.csv', index=False, encoding='utf-8')

In [None]:
# Reload the saved table and attach the per-book metadata stored as JSON files.
df = pd.read_csv('data/blinkist.csv')
books = condense_books('metadata')
book_df = pd.DataFrame(books)
# Cast the merge key to int — presumably so its dtype matches df's book_id; verify.
book_df.book_id = book_df.book_id.astype(int)

# Left merge: every row of df is kept; rows without a matching book get empty metadata columns.
blinkist = df.merge(book_df, on='book_id', how='left')
blinkist

In [None]:
def _on_strings(transform):
    """Wrap ``transform`` so that non-string cells pass through untouched."""
    def wrapper(value):
        return transform(value) if isinstance(value, str) else value
    return wrapper


_lower_strip = _on_strings(lambda text: text.lower().strip())

# Lower-cased, stripped copies of the four name columns, stored as "<name>2".
for col in ['author', 'authors', 'title', 'titles']:
    blinkist[f'{col}2'] = blinkist[col].apply(_lower_strip)

# Keep only the part of the scraped title before the first colon.
blinkist['title2'] = blinkist['title2'].apply(_on_strings(lambda text: text.split(':')[0]))

# Drop parenthesised fragments from both title columns, then re-normalise.
for col in ['title2', 'titles2']:
    blinkist[col] = blinkist[col].apply(_on_strings(remove_parentheses))
    blinkist[col] = blinkist[col].apply(_lower_strip)


In [None]:
def split_authors(df, delimiters):
    """Expand rows whose ``authors2`` value contains one of ``delimiters``.

    For each row, the first delimiter (in list order) that occurs in
    ``authors2`` is used to split the value; the row is copied once per piece
    with the stripped piece as its ``authors2``. Rows with a non-string
    ``authors2`` or without any delimiter are passed through unchanged.

    Returns a new DataFrame built from the resulting rows.
    """
    expanded = []

    for _, record in df.iterrows():
        value = record['authors2']

        matched = None
        if isinstance(value, str):
            matched = next((candidate for candidate in delimiters if candidate in value), None)

        if matched is None:
            expanded.append(record)
            continue

        for piece in value.split(matched):
            clone = record.copy()
            clone['authors2'] = piece.strip()
            expanded.append(clone)

    return pd.DataFrame(expanded)

delimiters = [' and ', ' with ', ' & ']
# split_authors only splits on the first delimiter it finds in a value, so it
# is applied once per delimiter to also break up values that mix several.
for _ in delimiters:
    blinkist = split_authors(blinkist, delimiters)

# Normalise only after all splitting is done: this step removes spaces and
# punctuation (including '&'), after which no delimiter could match any more.
for col in ['author2', 'authors2', 'title2', 'titles2']:
    blinkist[col] = blinkist[col].apply(lambda x: remove_punctuation(remove_accents(x.lower().strip())) if isinstance(x, str) else x)
    blinkist[col] = blinkist[col].apply(lambda x: ''.join(x.split(' ')) if isinstance(x, str) else x)


In [None]:
def jaro_similarity(str1, str2):
    """Jaro similarity of two strings via ``jellyfish``.

    Returns ``np.nan`` when either argument is missing according to
    ``pd.isna``; otherwise returns ``jellyfish.jaro_similarity(str1, str2)``.
    """
    if not (pd.isna(str1) or pd.isna(str2)):
        return jellyfish.jaro_similarity(str1, str2)
    return np.nan

# Per-row similarity between the scraped and the file-derived author/title.
blinkist['author_jaro'] = blinkist.apply(lambda row: jaro_similarity(row['author2'], row['authors2']), axis=1)
blinkist['title_jaro'] = blinkist.apply(lambda row: jaro_similarity(row['title2'], row['titles2']), axis=1)
blinkist['jaro'] = blinkist['author_jaro'] * blinkist['title_jaro']
blinkist['text_id'] = blinkist['authors'] + ' - ' + blinkist['titles']
# Left merge from the directory listing: one or more rows per audio file.
blinkist = dir_df.drop(['authors', 'titles'], axis=1).merge(blinkist, on='text_id', how='left')
# Fill AFTER the merge: files without a match arrive with NaN in every
# merged column, which would otherwise break idxmax and the int cast below.
blinkist['jaro'] = blinkist['jaro'].fillna(0.0)
blinkist['book_id'] = blinkist['book_id'].fillna(0).astype(int)
# Keep the best-scoring candidate row for each audio file.
blinkist = blinkist.loc[blinkist.groupby('text_id')['jaro'].idxmax()]

In [None]:
# Rows where the normalised author or title differs between the two sources,
# limited to the columns needed to compare them.
filtered_df = blinkist[~((blinkist['author2'] == blinkist['authors2']) & (blinkist['title2'] == blinkist['titles2']))][['book_id','authors', 'author', 'titles', 'title', 'authors2', 'author2', 'author_jaro', 'titles2', 'title2', 'title_jaro', 'jaro']]
# Show the 50 lowest-scoring of those rows that have a non-zero book_id.
filtered_df[filtered_df['book_id'] != 0].sort_values(by='jaro').head(50)

In [None]:
# Rows where the normalised author or title differs between the two sources.
filtered_df = blinkist[~((blinkist['author2'] == blinkist['authors2']) & (blinkist['title2'] == blinkist['titles2']))][['book_id','authors', 'author', 'titles', 'title', 'authors2', 'author2', 'author_jaro', 'titles2', 'title2', 'title_jaro', 'jaro']]
# Keep only the rows whose combined score is at most 0.4.
filtered_df = filtered_df[filtered_df['jaro'] <= .4]
# NOTE(review): this result is neither assigned nor the cell's last expression,
# so it is not displayed and has no effect.
filtered_df.sort_values(by='jaro', ascending=False)#.shape
# Set book_id back to 0 for the low-scoring rows.
blinkist.loc[filtered_df.index, 'book_id'] = 0

In [None]:
# Rebuild df from the identifying columns of blinkist, dropping duplicate rows.
df = blinkist[['book_id', 'authors', 'authors2', 'titles', 'titles2']].copy()
df = df.drop_duplicates()
df

In [None]:
# Missing counts become 0; missing page counts become the column mean.
blinkist['num_ratings'] = blinkist['num_ratings'].fillna(0).round().astype(int)
blinkist['num_reviews'] = blinkist['num_reviews'].fillna(0).round().astype(int)
blinkist['num_pages'] = blinkist['num_pages'].fillna(blinkist['num_pages'].mean()).round().astype(int)
# Overwrite average_rating with the star-weighted sum of the histogram counts divided by num_ratings.
blinkist['average_rating'] = ((blinkist['5 stars'] * 5) + (blinkist['4 stars'] * 4) + (blinkist['3 stars'] * 3) + (blinkist['2 stars'] * 2) + blinkist['1 star']) / blinkist['num_ratings']

# NOTE(review): not the last expression of the cell, so this result is not displayed.
blinkist.isna().mean()
# Save the table and continue working on an in-memory copy.
blinkist.to_csv('data/blinkist_scraped.csv', index=False)
df = blinkist.copy()

In [None]:
df = pd.read_csv('data/blinkist_scraped.csv')

# Inputs for the quadratic fit: the share of all ratings in each star bucket.
# A share that cannot be computed (NaN) is recorded as 0.
star_buckets = (
    ('1 star', '1_star_percentage'),
    ('2 stars', '2_star_percentage'),
    ('3 stars', '3_star_percentage'),
    ('4 stars', '4_star_percentage'),
    ('5 stars', '5_star_percentage'),
)
for count_col, share_col in star_buckets:
    df[share_col] = (df[count_col] / df['num_ratings']).fillna(0)

# Missing average ratings take the mean of the known ones.
mean_rating = df.average_rating.mean()
df.average_rating = df.average_rating.fillna(mean_rating)
df.isna().mean()


In [None]:
def fit_quadratic(row):
    """Fit a degree-2 polynomial through ``row`` at x = 1..5.

    ``row`` supplies the five y-values. Returns a ``pd.Series`` holding twice
    the quadratic coefficient, the linear coefficient and the constant term,
    in that order.
    """
    positions = np.arange(1, 6)
    coeffs = np.polyfit(positions, row, 2)
    return pd.Series([2 * coeffs[0], coeffs[1], coeffs[2]])

# Fit one quadratic per row through the five star-share columns and store its coefficients.
coefficients = df[['1_star_percentage','2_star_percentage','3_star_percentage','4_star_percentage','5_star_percentage']].apply(fit_quadratic, axis=1)
df['2a'], df['b'], df['c'] = coefficients[0], coefficients[1], coefficients[2]

# Pre-processing columns for rankings
df['num_ratings_ln'] = np.log1p(df['num_ratings'])
# A log of 0 is replaced by 1 because this column is used as a divisor below.
df['num_ratings_ln'] = df['num_ratings_ln'].replace(0,1)
df['num_pages_ln'] = np.log1p(df['num_pages'])
# Shift so the smallest value is 0 and every value is non-negative.
df['2a_shifted'] = df['2a'] - df['2a'].min()

# Types of rankings
# Pull each rating towards the overall mean, by less the larger num_ratings_ln is.
df['num_adjusted_rating'] = df['average_rating'] - (df['average_rating'] - df['average_rating'].mean()) / df['num_ratings_ln']
df['page_adjusted_rating'] = df['num_adjusted_rating'] / df['num_pages_ln']
df['2nd_derivative_rating'] = df['num_adjusted_rating'] * df['2a_shifted']
df['2nd_derivative_page_adjusted_rating'] = df['num_adjusted_rating'] * df['2a_shifted'] / df['num_pages_ln']
df

In [None]:
# Final score: page-adjusted rating plus the curvature-weighted rating.
df['rating2'] = df['page_adjusted_rating'] + df['2nd_derivative_rating']
# errors='ignore' lets the cell run on a frame that has no 'index' column yet
# (its first execution) as well as on one left over from an earlier run.
# NOTE(review): reset_index() stores the labels the rows had BEFORE sorting in
# the new 'index' column — confirm that is the value wanted as file prefix.
df = (
    df.sort_values(by='rating2', ascending=False)
      .drop(columns='index', errors='ignore')
      .reset_index()
)

In [None]:
# Replace the in-memory df with the table stored on disk.
# NOTE(review): columns computed in earlier cells (e.g. rating2) are only
# available afterwards if they were written to this file — confirm.
df = pd.read_csv('data/blinkist_scraped.csv')

In [None]:
# Rows whose rating2 exceeds the cut-off.
# NOTE(review): where the threshold value comes from is not shown here — document or name it.
df[df.rating2 > 1.7825086940916708]

In [None]:
# Display df (the .head(50) limit is commented out).
df#.head(50)

In [None]:

directory = '/Users/alex/Downloads/Blinkist August 2023 SiteRip Collection - BASiQ'

# Prefix every audio file with the value of its row's 'index' column.
for filename in os.listdir(directory):
    # Remove only a leading "<digits>_" prefix left by an earlier run. Files
    # without a prefix, or with '_' inside the name, keep their full text_id.
    text_id = re.sub(r'^\d+_', '', filename).split('.m4a')[0]

    row = df[df['text_id'] == text_id]
    if row.empty:
        continue

    old_file = os.path.join(directory, filename)
    new_file = os.path.join(directory, f"{row['index'].values[0]}_{text_id}.m4a")

    # os.rename can silently replace an existing file; never clobber another one.
    if new_file != old_file and os.path.exists(new_file):
        print(f'Skipping {filename}: {new_file} already exists')
        continue

    os.rename(old_file, new_file)

print("Files have been renamed successfully!")



In [None]:
# List the directory again to check the result; stored under its own name so
# the builtin `list` is not overwritten.
renamed_files = os.listdir('/Users/alex/Downloads/Blinkist August 2023 SiteRip Collection - BASiQ')
renamed_files