In [1]:
import requests
from bs4 import BeautifulSoup
import random
from tqdm import tqdm
import pandas as pd
import re
import time

In [2]:
books={'Title':[],
      'Author':[],
       'Year Published':[],
       'Number of Logs':[]}


In [3]:
for i in range(1,11):
    time.sleep(1)
    url=f'https://openlibrary.org/trending/forever?page={i}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    trends = soup.find_all('div', class_="sri__main")
    for book in trends:
        Title = book.find('div', class_='resultTitle').text.strip()
        Author=book.find('span', class_='bookauthor').text.strip()
        detailed=book.find('span', class_='resultStats')
        pubdate=detailed.find('span', class_='resultDetails').text.strip()
        pubdate_cleaned = re.sub(r"(—|\d+ editions|\s{2,})", "", pubdate).strip()
        alldetails=book.find('div', class_='details')
        logged_text = None
        match = re.search(r'Logged (\d+) times', book.text)  # Search within the current book
        if match:
            logged_text = match.group(0) 
        print(Title)
        print(Author)
        print(pubdate_cleaned)
        print(logged_text)
        books['Title'].append(Title)
        books['Author'].append(Author)
        books['Year Published'].append(pubdate_cleaned)
        books['Number of Logs'].append(logged_text)

Atomic Habits
by James Clear
First published in 2016
Logged 41181 times
It Ends With Us
by Colleen Hoover
First published in 2012
Logged 40568 times
The 48 Laws of Power
by Robert Greene and Joost Elffers
First published in 1998
Logged 34986 times
The Subtle Art of Not Giving a F*ck
by Mark Manson
First published in 2016
Logged 30182 times
Um casamento arranjado
by Zana Kheiron
First published in 2019
Logged 23272 times
Rich Dad, Poor Dad
by Robert T. Kiyosaki and Sharon L. Lechter
First published in 1990
Logged 23002 times
Harry Potter and the Philosopher's Stone
by J. K. Rowling
First published in 1997
Logged 15987 times
It Starts with Us
by Colleen Hoover
First published in 2022
Logged 15547 times
Control Your Mind and Master Your Feelings
by Eric Robertson - undifferentiated
First published in 2019
Logged 15511 times
Think and Grow Rich
by Napoleon Hill
First published in 1937
Logged 11716 times
Latidos Que No Dije
by Roos
First published in 2022
Logged 9778 times
How to Win Friend

Misery
by Stephen King
First published in 1978
Logged 2256 times
Hunting Adeline
by H. D. Carlton
First published in 2022
Logged 2248 times
Pet Sematary
by Stephen King
First published in 1925
Logged 2219 times
Romeo and Juliet
by William Shakespeare
First published in 1597
Logged 2202 times
Little Women
by Louisa May Alcott
First published in 1848
Logged 2196 times
The Catcher in the Rye
by J. D. Salinger
First published in 1900
Logged 2187 times
The Summer I Turned Pretty Trilogy
by Jenny Han
First published in 2009
Logged 2177 times
The Book Thief
by Markus Zusak
First published in 1998
Logged 2143 times
Carrie
by Stephen King
First published in 1974
Logged 2132 times
Wuthering Heights
by Emily Brontë
First published in 1847
Logged 2132 times
They Both Die at the End
by Adam Silvera
First published in 2017
Logged 2114 times
A Brief History of Time
by Stephen Hawking
First published in 1988
Logged 2111 times
The Perks of Being a Wallflower
by Stephen Chbosky and Stephen Chbosky
Firs

In [4]:
df = pd.DataFrame(books)
df['Year Published'] = df['Year Published'].str.replace(r'\D', '', regex=True).astype(int)
df['Number of Logs'] = df['Number of Logs'].str.replace(r'\D', '', regex=True).astype(int)

In [5]:
df.head()

Unnamed: 0,Title,Author,Year Published,Number of Logs
0,Atomic Habits,by James Clear,2016,41181
1,It Ends With Us,by Colleen Hoover,2012,40568
2,The 48 Laws of Power,by Robert Greene and Joost Elffers,1998,34986
3,The Subtle Art of Not Giving a F*ck,by Mark Manson,2016,30182
4,Um casamento arranjado,by Zana Kheiron,2019,23272


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           199 non-null    object
 1   Author          199 non-null    object
 2   Year Published  199 non-null    int32 
 3   Number of Logs  199 non-null    int32 
dtypes: int32(2), object(2)
memory usage: 4.8+ KB


In [7]:
df.tail()

Unnamed: 0,Title,Author,Year Published,Number of Logs
194,The Titan's Curse,by Rick Riordan,2007,1300
195,Cien años de soledad,by Gabriel García Márquez,1967,1298
196,A Wrinkle in Time,by Madeleine L'Engle,1962,1296
197,"A child called ""it""",by David J. Pelzer,1987,1296
198,"I, Robot",by Isaac Asimov,1950,1280


In [17]:
probability=(1/len(df))
print(f'Probability is: {probability}')
expectedlogs=0
for log in df['Number of Logs']:
    TMPexpLog=probability*log
    expectedlogs+=TMPexpLog
variance = (probability * (df['Number of Logs'] - expectedlogs) ** 2).sum()
StandardDeviation=variance ** 0.5
print(f'Variance: {variance}')
print(f'Expected value: {expectedlogs}')
print(f'Standard Deviation: {StandardDeviation}')


Probability is: 0.005025125628140704
Variance: 32196316.943057
Expected value: 3870.356783919598
Standard Deviation: 5674.179847612957
