In [1]:
# Imports:
import pandas as pd

In [2]:
# Constants:
EXAMPLE_URL = "https://en.wiktionary.org/wiki/%D0%BD%D0%B0%D0%B9%D1%82%D0%B8"

In [5]:
df1 = pd.read_html(EXAMPLE_URL)

In [6]:
df1.shape

AttributeError: 'list' object has no attribute 'shape'

In [7]:
df1.columns

AttributeError: 'list' object has no attribute 'columns'

In [8]:
len(df1)

3

In [9]:
df1[0]

Unnamed: 0,0,1,2
0,Audio:,,(file)


In [10]:
df1[1]

Unnamed: 0,perfective aspect,perfective aspect.1,perfective aspect.2
0,infinitive,найти́ najtí,найти́ najtí
1,participles,present tense,past tense
2,active,—,наше́дший našédšij
3,passive,—,на́йденный nájdennyj
4,adverbial,—,"найдя́ najdjá, наше́дши našédši"
5,,present tense,future tense
6,1st singular (я),—,найду́ najdú
7,2nd singular (ты),—,найдёшь najdjóšʹ
8,3rd singular (он/она́/оно́),—,найдёт najdjót
9,1st plural (мы),—,найдём najdjóm


In [11]:
# Attempt to scrape with better parsing (e.g. same term, will be common for many languages; English & German, Portuguese & Spanish, Russian & Ukrainian, etc.)
df2 = pd.read_html("https://en.wiktionary.org/wiki/%D1%8F#Ukrainian")
print(type(df2))

<class 'list'>


In [12]:
len(df2)

16

In [13]:
df2[0].shape

(5, 3)

In [14]:
df2[0]

Unnamed: 0,0,1,2
0,,,
1,"я U+044F, &#1103; CYRILLIC SMALL LETTER YA ← ю...","я U+044F, &#1103; CYRILLIC SMALL LETTER YA ← ю...",
2,я,"U+044F, &#1103; CYRILLIC SMALL LETTER YA",
3,,,
4,← ю [U+044E],Cyrillic,ѐ → [U+0450]


In [17]:
# Some inspection shows that we're likely looking for a table whose class name includes the word "inflection"
df3 = pd.read_html("https://en.wiktionary.org/wiki/%D1%8F#Ukrainian", attrs={"class": "inflection-table"})
print(type(df3))

<class 'list'>


In [18]:
df3[0].shape

(999, 12)

In [19]:
df3[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,,singular,singular,singular,singular,singular,,plural,plural,plural,,reflexive
1,,1st person,2nd person,3rd person,3rd person,3rd person,,1st person,2nd person,3rd person,,reflexive
2,,1st person,2nd person,m,n,f,,1st person,2nd person,3rd person,,reflexive
3,nominative,я (ja),ты (ty),ён (jon),яно́ (janó),яна́ (janá),,мы (my),вы (vy),яны́ (janý),,—
4,genitive,мяне́ (mjanjé),цябе́ (cjabjé),яго́ (jahó),яго́ (jahó),яе́ (jajé),,нас (nas),вас (vas),іх (ix),,сябе́ (sjabjé)


In [35]:
# trying with lxml so we can grab our preferred language and have some more control over what we extract from the tree.
try:
    from lxml import etree
    print("running with lxml.etree")
except ImportError:
    import xml.etree.ElementTree as etree
    print("running with Python's xml.etree.ElementTree")

running with lxml.etree


In [21]:
import requests

In [22]:
result = requests.get("https://en.wiktionary.org/api/rest_v1/page/title/Earth")

In [23]:
result

<Response [200]>

In [25]:
result = requests.get("https://en.wiktionary.org/api/rest_v1/page/title/я")

In [30]:
r = requests.get("https://en.wiktionary.org/api/rest_v1/page/html/%D1%8F")
r

<Response [200]>

In [34]:
len(r.text)

797397

In [36]:
et = etree.ElementTree(r.text)

TypeError: Argument 'element' has incorrect type (expected lxml.etree._Element, got str)

In [38]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(r.content, "lxml")
inflection_tables = soup.find_all("table", class_=lambda c: c and "inflection" in c)

In [39]:
type(inflection_tables)

bs4.element.ResultSet

In [40]:
len(inflection_tables)

9

In [42]:
type(inflection_tables[0])

bs4.element.Tag

In [44]:
dfs = [pd.read_html(str(table))[0] for table in inflection_tables]

  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]
  dfs = [pd.read_html(str(table))[0] for table in inflection_tables]


In [45]:
dfs[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,,singular,singular,singular,singular,singular,,plural,plural,plural,,reflexive
1,,1st person,2nd person,3rd person,3rd person,3rd person,,1st person,2nd person,3rd person,,reflexive
2,,1st person,2nd person,m,n,f,,1st person,2nd person,3rd person,,reflexive
3,nominative,я (ja),ты (ty),ён (jon),яно́ (janó),яна́ (janá),,мы (my),вы (vy),яны́ (janý),,—
4,genitive,мяне́ (mjanjé),цябе́ (cjabjé),яго́ (jahó),яго́ (jahó),яе́ (jajé),,нас (nas),вас (vas),іх (ix),,сябе́ (sjabjé)
...,...,...,...,...,...,...,...,...,...,...,...,...
994,,,,,,,,,,,,
995,,,,,,,,,,,,
996,,,,,,,,,,,,
997,,,,,,,,,,,,


In [46]:
dfs[1]

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,nominative (subject),accusative (direct complement),accusative (direct complement),dative (indirect complement),dative (indirect complement),prepositional
Unnamed: 0_level_1,Unnamed: 0_level_1.1,Unnamed: 1_level_1,Unnamed: 2_level_1,nominative (subject),full,short,full,short,prepositional
0,singular,first person,first person,аз (az),мен (men) ме́не (méne),ме (me),ме́не (méne),ми (mi),мен (men) ме́не (méne)
1,singular,second person,informal,ти (ti),теб (teb) те́бе (tébe),те (te),те́бе (tébe),ти (ti),теб (teb) те́бе (tébe)
2,singular,second person,formal,Ви́е (Víe),Вас (Vas),Ви (Vi),Вам (Vam),Ви (Vi),Вас (Vas)
3,singular,third person,masculine,той (toj),не́го (négo),го (go),не́му (nému),му (mu),не́го (négo)
4,singular,third person,feminine,тя (tja),не́я (néja),я (ja),ней (nej),ѝ (ì),не́я (néja)
5,singular,third person,neuter,то (to),не́го (négo),го (go),не́му (nému),му (mu),не́го (négo)
6,plural,first person,first person,ни́е (níe) ний (nij),нас (nas),ни (ni),нам (nam),ни (ni),нас (nas)
7,plural,second person,informal,ви́е (víe) вий (vij),вас (vas),ви (vi),вам (vam),ви (vi),вас (vas)
8,plural,second person,formal,Ви́е (Víe),Вас (Vas),Ви (Vi),Вам (Vam),Ви (Vi),Вас (Vas)
9,plural,third person,third person,те (te),тях (tjah),ги (gi),тям (tjam),им (im),тях (tjah)


In [47]:
dfs[2]

Unnamed: 0,0,1
0,Arabic,يا
1,Cyrillic,я
2,Latin,ia
3,Yañalif,ə


In [48]:
dfs[3]

Unnamed: 0_level_0,Unnamed: 0_level_0,singular,singular,singular,singular,singular,Unnamed: 6_level_0,plural,plural,plural,Unnamed: 10_level_0,reflexive
Unnamed: 0_level_1,Unnamed: 0_level_1,1st person,2nd person,3rd person,3rd person,3rd person,Unnamed: 6_level_1,1st person,2nd person,3rd person,Unnamed: 10_level_1,reflexive
Unnamed: 0_level_2,Unnamed: 0_level_2.1,1st person,2nd person,m,n,f,Unnamed: 6_level_2,1st person,2nd person,3rd person,Unnamed: 10_level_2,reflexive
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,,,,,,,,,,,,
5,,,,,,,,,,,,
6,nominative,я (ja),ти (ti),вон (von),воно (vono),вона (vona),ми (mi),ви (vi),вони (voni),,,
7,genitive,"мнє, ме 1 ‎(mnje, me)","тебе, це 1 ‎(tebe, ce)","його 1, нього 2, ньго 2, го 1 (joho, nʹoho, nʹ...","його 1, нього 2, ньго 2, го 1 (joho, nʹoho, nʹ...","єй 1, нєй 2 ‎(jej, njej)",нас ‎(nas),вас (vas),"їх 1, нїх 2, их 3 (jix, njix, ix)",себе (sebe),,
8,dative,"мнє, ми (mnje, mi)","тебе, ци ‎(tebe, ci)","йому 4, ньому 2, му ‎(jomu, nʹomu, mu)","йому 4, ньому 2, му ‎(jomu, nʹomu, mu)","єй, нєй 2 ‎(jej, njej)",нам ‎(nam),вам (vam),"їм 4, нїм 2, им 3 (jim, njim, im)",себе (sebe),,
9,accusative,"мнє, ме ‎(mnje, me)","тебе, це ‎(tebe, ce)","його 4, нього 2, ньго 2, го (joho, nʹoho, nʹho...","його 4, нього 2, ньго 2, го (joho, nʹoho, nʹho...","ю, ню 2 ‎(ju, nju)",нас ‎(nas),вас (vas),"їх 4, нїх 2, их 3 (jix, njix, ix)",себе (sebe),,


In [49]:
dfs[4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,,singular,singular,singular,singular,singular,,plural,plural,plural,,reflexive
1,,1st person,2nd person,3rd person,3rd person,3rd person,,1st person,2nd person,3rd person,,reflexive
2,,1st person,2nd person,m,n,f,,1st person,2nd person,3rd person,,reflexive
3,nominative,я (ja),ты (ty),он (on),оно́ (onó),она́ (oná),,мы (my),вы (vy),они́ 1 (oní),,—
4,genitive,меня́ (menjá),тебя́ (tebjá),"его́ 2, него́ 2 3 (jevó, nevó)","его́ 2, него́ 2 3 (jevó, nevó)","её 4, неё 3 4 (jejó, nejó)",,нас (nas),вас (vas),"их, них 3 (ix, nix)",,себя́ (sebjá)
5,dative,мне (mne),тебе́ (tebé),"ему́, нему́ 3 (jemú, nemú)","ему́, нему́ 3 (jemú, nemú)","ей, ней 3 (jej, nej)",,нам (nam),вам (vam),"им, ним 3 (im, nim)",,себе́ (sebé)
6,accusative,меня́ (menjá),тебя́ (tebjá),"его́ 2, него́ 2 3 (jevó, nevó)","его́ 2, него́ 2 3 (jevó, nevó)","её, неё 3 (jejó, nejó)",,нас (nas),вас (vas),"их, них 3 (ix, nix)",,себя́ (sebjá)
7,instrumental,"мной, мно́ю 5 (mnoj, mnóju)","тобо́й, тобо́ю 5 (tobój, tobóju)","им, ним 3 (im, nim)","им, ним 3 (im, nim)","ей, ней 3, е́ю 5, не́ю 3 5 (jej, nej, jéju, néju)",,на́ми (námi),ва́ми (vámi),"и́ми, ни́ми 3 (ími, ními)",,"собо́й, собо́ю 5 (sobój, sobóju)"
8,prepositional 6,мне (mne),тебе́ (tebé),нём 3 (njom),нём 3 (njom),ней 3 (nej),,нас (nas),вас (vas),них 3 (nix),,себе́ (sebé)


In [50]:
rus_inflections = dfs[4]

In [53]:
rus_inflections = rus_inflections.dropna(how="all")

In [54]:
rus_inflections.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [55]:
rus_inflections.iloc[:, 0]

0                NaN
1                NaN
2                NaN
3         nominative
4           genitive
5             dative
6         accusative
7       instrumental
8    prepositional 6
Name: 0, dtype: object

In [58]:
rus_inflections[rus_inflections.iloc[:, 0].astype(str).str.lower().str.contains("nominative")]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
3,nominative,я (ja),ты (ty),он (on),оно́ (onó),она́ (oná),,мы (my),вы (vy),они́ 1 (oní),,—


In [65]:
def extract_case(inflections_df: pd.DataFrame, case: str) -> pd.DataFrame:
    """Given a table of inflections for a noun or adjective, over all 
    cases and persons, extract the inflections for each person in that
    case. e.g. if given an inflection table for Ukrainian and a case
    value of "nominative," returns a one-row `DataFrame` with all
    nominative case forms of the noun/adjective.
    """
    # Cleanup, maybe best to move to a separate function called earlier
    # at some point in the future.
    inflections_df = inflections_df.dropna(how="all")
    boolean_indexer = inflections_df.iloc[:, 0] \
        .astype(str).str.lower().str.contains(case.lower())
    declensions = inflections_df[boolean_indexer]
    return declensions

In [69]:
genitive_pronouns = extract_case(rus_inflections, "genitive")
genitive_pronouns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
4,genitive,меня́ (menjá),тебя́ (tebjá),"его́ 2, него́ 2 3 (jevó, nevó)","его́ 2, него́ 2 3 (jevó, nevó)","её 4, неё 3 4 (jejó, nejó)",,нас (nas),вас (vas),"их, них 3 (ix, nix)",,себя́ (sebjá)


In [67]:
type(genitive_pronouns)

pandas.core.frame.DataFrame

In [71]:
genitive_pronouns.values.flatten()

array(['genitive', 'меня́ (menjá)', 'тебя́ (tebjá)',
       'его́\u202f2, него́\u202f2\xa03 (jevó, nevó)',
       'его́\u202f2, него́\u202f2\xa03 (jevó, nevó)',
       'её\u202f4, неё\u202f3\xa04 (jejó, nejó)', nan, 'нас (nas)',
       'вас (vas)', 'их, них\u202f3 (ix, nix)', nan, 'себя́ (sebjá)'],
      dtype=object)

In [72]:
genitive_pronouns.stack()

4  0                           genitive
   1                      меня́ (menjá)
   2                      тебя́ (tebjá)
   3     его́ 2, него́ 2 3 (jevó, nevó)
   4     его́ 2, него́ 2 3 (jevó, nevó)
   5         её 4, неё 3 4 (jejó, nejó)
   7                          нас (nas)
   8                          вас (vas)
   9                их, них 3 (ix, nix)
   11                     себя́ (sebjá)
dtype: object

In [73]:
genitive_pronouns.stack().dropna()

4  0                           genitive
   1                      меня́ (menjá)
   2                      тебя́ (tebjá)
   3     его́ 2, него́ 2 3 (jevó, nevó)
   4     его́ 2, него́ 2 3 (jevó, nevó)
   5         её 4, неё 3 4 (jejó, nejó)
   7                          нас (nas)
   8                          вас (vas)
   9                их, них 3 (ix, nix)
   11                     себя́ (sebjá)
dtype: object

In [78]:
import re

PARENTHESIZE = re.compile("""\(.*\)""")

  PARENTHESIZE = re.compile("""\(.*\)""")


In [79]:
re.sub(PARENTHESIZE, "", "их, них 3 (ix, nix)")

'их, них\u202f3 '

In [96]:
def extract_case(inflections_df: pd.DataFrame, case: str) -> pd.DataFrame:
    """Given a table of inflections for a noun or adjective, over all 
    cases and persons, extract the inflections for each person in that
    case. e.g. if given an inflection table for Ukrainian and a case
    value of "nominative," returns a one-row `DataFrame` with all
    nominative case forms of the noun/adjective.
    """
    # Cleanup, maybe best to move to a separate function called earlier
    # at some point in the future.
    # Remove rows that are all NaN
    inflections_df = inflections_df.dropna(how="all")

    # Filter for the target case in the first column
    boolean_indexer = inflections_df.iloc[:, 0] \
        .astype(str).str.lower().str.contains(case.lower())
    declensions = inflections_df[boolean_indexer]

    # Remove any parenthesized content (e.g., pronunciations like "(jevó)")
    declensions = declensions.map(
        lambda x: re.sub(r"\s*\([^)]*\)", "", str(x)) if pd.notna(x) else x
    )

    return declensions

In [97]:
genitive_pronouns_2 = extract_case(rus_inflections, "genitive")
genitive_pronouns_2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
4,genitive,меня́,тебя́,"его́ 2, него́ 2 3","его́ 2, него́ 2 3","её 4, неё 3 4",,нас,вас,"их, них 3",,себя́


In [98]:
r.raise_for_status()