# Web Scraping with Pandas

In [1]:
import pandas as pd

In [2]:
# Fetch the Wikipedia page and parse every HTML table found on it.
novelists = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_Russian-language_novelists",
)
novelists

[                    Portrait                            Author  \
 0                        NaN         Fyodor Abramov(1920–1983)   
 1                        NaN              M. Ageyev(1898–1973)   
 2                        NaN      Chinghiz Aitmatov(1928–2008)   
 3                        NaN           David Aizman(1869-1922)   
 4                        NaN         Sergey Aksakov(1791–1859)   
 5                        NaN        Vasily Aksyonov(1932–2009)   
 6                        NaN           Boris Akunin(born 1956)   
 7                        NaN          Mikhail Albov(1851-1911)   
 8                        NaN        Yuz Aleshkovsky(born 1929)   
 9                        NaN  Alexander Amfiteatrov(1862–1938)   
 10                       NaN        Daniil Andreyev(1906–1959)   
 11                       NaN        Leonid Andreyev(1871–1919)   
 12                       NaN          Yury Annenkov(1889–1974)   
 13                       NaN       Aleksey Apukhtin(1840–1893

`pd.read_html` returns a list containing every table found on the page, each one converted into a DataFrame.

In [3]:
# Show the type of the container, then of its first and last elements,
# one per line.
print(
    type(novelists),
    type(novelists[0]),
    type(novelists[-1]),
    sep='\n',
)

<class 'list'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [4]:
# Peek at three randomly chosen rows of the first table.
novelists[0].sample(n=3)

Unnamed: 0,Portrait,Author,Notable works,Illustration,Illustration.1
14,,Maria Arbatova(born 1957),My name is Woman,Maria Arbatova in 2009.,
10,,Daniil Andreyev(1906–1959),Roza Mira,Daniil's parents Leonid Andreyev and Countess ...,
11,,Leonid Andreyev(1871–1919),DarknessThe AbyssThe Red LaughThe Seven Who We...,1909 book cover,


In [5]:
# Peek at three randomly chosen rows of the table at position 21.
novelists[21].sample(n=3)

Unnamed: 0,Portrait,Author,Notable works,Illustration,Illustration.1
4,,Ilia Zdanevich(1894–1975),,,
7,,Boris Zhitkov(1882–1938),Viktor Vavich,,
9,,Zinovy Zinik(born 1945),The Mushroom-Picker,,


In [6]:
# Row labelled 0 of the first table, across all columns.
novelists[0].loc[0, :]

Portrait                                                        NaN
Author                                    Fyodor Abramov(1920–1983)
Notable works     The New LifeWooden HorsesTwo Winters and Three...
Illustration                     Memorial plaque in St. Petersburg.
Illustration.1                                                  NaN
Name: 0, dtype: object

In [7]:
# Row labelled 0 of the table at position 21, across all columns.
novelists[21].loc[0, :]

Portrait                                  NaN
Author            Mikhail Zagoskin(1789–1852)
Notable works        Tales of Three Centuries
Illustration           Engraving of Zagoskin.
Illustration.1                            NaN
Name: 0, dtype: object

In [8]:
# Single cell lookup: the 'Author' value in the row labelled 0 of the first table.
novelists[0].at[0, 'Author']

'Fyodor Abramov(1920–1983)'

In [9]:
# Single cell lookup: the 'Author' value in the row labelled 0 of table 21.
novelists[21].at[0, 'Author']

'Mikhail Zagoskin(1789–1852)'

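Combine the first 22 tables (the author tables) into a single DataFrame, then move the birth year out of the `Author` column into its own `Birthyear` column: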

In [10]:
# Stack the first 22 tables row-wise into one frame with a fresh 0..n-1 index,
# then discard the portrait/illustration columns, keeping 'Author' and
# 'Notable works'.
# Note: this rebinds `novelists` from the list of tables to a single DataFrame.
novelists = (
    pd.concat(novelists[:22], axis=0, ignore_index=True)
    .drop(columns=['Portrait', 'Illustration', 'Illustration.1'])
)

novelists

Unnamed: 0,Author,Notable works
0,Fyodor Abramov(1920–1983),The New LifeWooden HorsesTwo Winters and Three...
1,M. Ageyev(1898–1973),Novel with Cocaine
2,Chinghiz Aitmatov(1928–2008),JamilyaThe White ShipThe Day Lasts More Than a...
3,David Aizman(1869-1922),The Countrymen
4,Sergey Aksakov(1791–1859),The Scarlet FlowerThe Family ChronicleYears of...
...,...,...
300,Maria Zhukova(1804–1855),Evenings on the Karpovka
301,Zinovy Zinik(born 1945),The Mushroom-Picker
302,Lydia Zinovieva-Annibal(1866–1907),The Tragic Menagerie
303,Nikolay Zlatovratsky(1845–1911),Old Shadows


In [11]:
# Each raw 'Author' cell looks like 'Name(1920–1983)' or 'Name(born 1956)'.
#
# Birth year: take the text before the first ')', delete every non-digit
# character, and keep the first four digits that remain (stored as a string).
# `regex=True` is passed explicitly: since pandas 2.0 `str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched
# and turn 'Birthyear' into the first four letters of the name.
# `.str[0]` (instead of `.apply(lambda x: x[0])`) propagates missing values
# rather than raising on them.
#
# NOTE: 'Birthyear' must be computed before 'Author' is overwritten below, and
# the cell should be run only once per freshly built frame -- a second run
# finds no digits left in 'Author' and blanks out 'Birthyear'.
novelists['Birthyear'] = (
    novelists['Author']
    .str.split(')').str[0]
    .str.replace('[^0-9]', '', regex=True)
    .str[:4]
)

# Name: everything before the first '('.
novelists['Author'] = novelists['Author'].str.split('(').str[0]

In [12]:
# Display the frame after the split: 'Author' holds the bare name and
# 'Birthyear' the four-digit year.
novelists

Unnamed: 0,Author,Notable works,Birthyear
0,Fyodor Abramov,The New LifeWooden HorsesTwo Winters and Three...,1920
1,M. Ageyev,Novel with Cocaine,1898
2,Chinghiz Aitmatov,JamilyaThe White ShipThe Day Lasts More Than a...,1928
3,David Aizman,The Countrymen,1869
4,Sergey Aksakov,The Scarlet FlowerThe Family ChronicleYears of...,1791
...,...,...,...
300,Maria Zhukova,Evenings on the Karpovka,1804
301,Zinovy Zinik,The Mushroom-Picker,1945
302,Lydia Zinovieva-Annibal,The Tragic Menagerie,1866
303,Nikolay Zlatovratsky,Old Shadows,1845
