# Step 1: Web Scraping New_York_Times_Bestseller_Books (Source: Wikipedia)

## 0. Libraries

In [14]:
import glob
import os
from io import StringIO
from random import uniform
from time import sleep

import lxml.html as lh
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1. Web Scraping 
- Scrape the bestseller table from each Wikipedia page

1.1. Try one page

In [15]:
# Fetch a single year's page first to confirm that the table parses as expected.
# The timeout keeps a stalled connection from hanging the cell.
response = requests.get(
    "https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_1998",
    timeout=30,
)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
page = response.text

# pandas deprecated passing a literal HTML string to read_html, so the markup
# is wrapped in a file-like object. The first table on the page is used.
df = pd.read_html(StringIO(page), flavor="bs4")[0]

# Tag every row with the year of the list (stored as a string).
df['Year'] = "1998"
df

Unnamed: 0,Date,Book,Author,Year
0,January 4,Cold Mountain,Charles Frazier,1998
1,January 11,Cold Mountain,Charles Frazier,1998
2,January 18,Cold Mountain,Charles Frazier,1998
3,January 25,Cold Mountain,Charles Frazier,1998
4,February 1,Paradise,Toni Morrison,1998
5,February 8,Paradise,Toni Morrison,1998
6,February 15,Paradise,Toni Morrison,1998
7,February 22,The Street Lawyer,John Grisham,1998
8,March 1,The Street Lawyer,John Grisham,1998
9,March 8,The Street Lawyer,John Grisham,1998


1.2. Web scraping all bestseller lists - genre: fiction (New York Times bestsellers, years 1931 to 2021)

In [16]:
# years covered by the fiction scraping loop: 1931 up to and including 2021
# (the stop value of range is exclusive, hence the + 1)
iterations_fiction = range(1931, 2021 + 1)

In [17]:
for i in iterations_fiction:
    # download the page for this year; the timeout keeps one stalled request
    # from hanging the whole run
    response = requests.get(
        f"https://en.wikipedia.org/wiki/The_New_York_Times_Fiction_Best_Sellers_of_{i}",
        timeout=30,
    )
    # stop on an HTTP error (e.g. a missing page) instead of parsing an error page
    response.raise_for_status()
    page_fiction = response.text

    # create the DataFrame from the first table on the page
    # (StringIO: pandas deprecated passing a literal HTML string to read_html)
    df_fiction = pd.read_html(StringIO(page_fiction), flavor="bs4")[0]

    # add constant columns with the year of the bestseller list and the genre
    df_fiction['Year'] = f"{i}"
    df_fiction['Genre'] = "fiction"

    # write one CSV file per year
    df_fiction.to_csv(f"nyt_fiction_{i}.csv", index=False)

    # respectful scraping: pause 1 to 2 seconds between requests
    wait_time = uniform(1.0, 2)
    sleep(wait_time)
    

1.3. Web scraping all bestseller lists - genre: non-fiction (New York Times bestsellers, years 1931 to 2021)

In [18]:
# years covered by the non-fiction scraping loop: 1931 up to and including 2021
# (the stop value of range is exclusive, hence the + 1)
iterations_nonfiction = range(1931, 2021 + 1)

In [19]:
for i in iterations_nonfiction:
    # download the page for this year; the timeout keeps one stalled request
    # from hanging the whole run
    response = requests.get(
        f"https://en.wikipedia.org/wiki/The_New_York_Times_Nonfiction_Best_Sellers_of_{i}",
        timeout=30,
    )
    # stop on an HTTP error (e.g. a missing page) instead of parsing an error page
    response.raise_for_status()
    page_nonfiction = response.text

    # create the DataFrame from the first table on the page
    # (StringIO: pandas deprecated passing a literal HTML string to read_html)
    df_nonfiction = pd.read_html(StringIO(page_nonfiction), flavor="bs4")[0]

    # add constant columns with the year of the bestseller list and the genre
    df_nonfiction['Year'] = f"{i}"
    df_nonfiction['Genre'] = "nonfiction"

    # write one CSV file per year
    df_nonfiction.to_csv(f"nyt_nonfiction_{i}.csv", index=False)

    # respectful scraping: pause 2 to 3 seconds between requests
    # (bounds given in ascending order; the original had them reversed)
    wait_time = uniform(2.0, 3.0)
    sleep(wait_time)

1.4. Create a DataFrame and concatenate all files, with intermediate steps to check the results

In [20]:
# folder that holds the per-year CSV files ('' means the current working directory)
path = r''

# Find only the per-year files written by the scraping loops
# (nyt_fiction_<year>.csv and nyt_nonfiction_<year>.csv).
# A bare "*.csv" would also match any other CSV in the folder, including the
# combined file written in the last cell, which a re-run would then append to itself.
# sorted() gives a deterministic order; glob returns matches in arbitrary order.
all_files = sorted(glob.glob(os.path.join(path, "nyt_*fiction_*.csv")))

# read every file into its own DataFrame and collect them in a list
li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

In [21]:
# sanity check: show the file names that were found, to verify that no file was missed
all_files

['nyt_fiction_1931.csv',
 'nyt_fiction_1932.csv',
 'nyt_fiction_1933.csv',
 'nyt_fiction_1934.csv',
 'nyt_fiction_1935.csv',
 'nyt_fiction_1936.csv',
 'nyt_fiction_1937.csv',
 'nyt_fiction_1938.csv',
 'nyt_fiction_1939.csv',
 'nyt_fiction_1940.csv',
 'nyt_fiction_1941.csv',
 'nyt_fiction_1942.csv',
 'nyt_fiction_1943.csv',
 'nyt_fiction_1944.csv',
 'nyt_fiction_1945.csv',
 'nyt_fiction_1946.csv',
 'nyt_fiction_1947.csv',
 'nyt_fiction_1948.csv',
 'nyt_fiction_1949.csv',
 'nyt_fiction_1950.csv',
 'nyt_fiction_1951.csv',
 'nyt_fiction_1952.csv',
 'nyt_fiction_1953.csv',
 'nyt_fiction_1954.csv',
 'nyt_fiction_1955.csv',
 'nyt_fiction_1956.csv',
 'nyt_fiction_1957.csv',
 'nyt_fiction_1958.csv',
 'nyt_fiction_1959.csv',
 'nyt_fiction_1960.csv',
 'nyt_fiction_1961.csv',
 'nyt_fiction_1962.csv',
 'nyt_fiction_1963.csv',
 'nyt_fiction_1964.csv',
 'nyt_fiction_1965.csv',
 'nyt_fiction_1966.csv',
 'nyt_fiction_1967.csv',
 'nyt_fiction_1968.csv',
 'nyt_fiction_1969.csv',
 'nyt_fiction_1970.csv',


In [22]:
# sanity check: number of files found by glob
# (the two scraping loops write 91 files each, so 182 if only those files are present)
len(all_files)

182

In [23]:
# sanity check: number of DataFrames that were read; should equal len(all_files)
len(li)

182

In [24]:
# Stack the per-file DataFrames row-wise into one table with a fresh running index.
# sort=True sorts the columns when the inputs' columns are not already aligned.
# This is kept as a separate final step so the checks in the previous cells can
# confirm beforehand that every file was read.
frame = pd.concat(
    objs=li,
    axis="index",
    sort=True,
    ignore_index=True,
)

# Write the combined table to a single CSV file (to_csv also writes the index by default).
frame.to_csv(path_or_buf='new_york_times_bestseller_1931_2021.csv')