In [2]:
%matplotlib inline
from bs4 import BeautifulSoup
import urllib.request
import re
import numpy as np
import pandas as pd

In [63]:
vocab=pd.read_csv('gre_unfamiliar.csv',encoding='utf-8')

In [59]:
vocab.to_csv('gre_unfamiliar.csv',encoding='utf-8',index=False)

In [5]:
def add_new_col(df,name,default_val=np.NaN):
    df.loc[:, name] = default_val

In [6]:
def drop_unnamed_col(df):
    df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df

## Jul 23

In [61]:
vocab=vocab.head(10)

In [64]:
vocab

Unnamed: 0,vocabulary,chinese,meaning,example,short,long,definition
0,cosmopolitan,指人因见多识广而到哪里都悠然自得的,familiar with and at ease in many different co...,"his knowledge of French, Italian, and Spanish ...",,,
1,clamorous,,making a loud and confused noise,,,,
2,intransigent,指人固执己见的,characterized by refusal to compromise or to a...,,,,
3,numinous,神圣的,,"the strange, numinous beauty of this ancient l...",,,
4,sterling,(某人的努力成果/人格)优秀的,"(of a person or their work, efforts, or qualit...",this organization does sterling work for young...,,,
5,ad hoc,特别的/临时形成针对…的,"formed, arranged, or done for a particular pur...",an ad hoc committee (亚人委员会),,,
6,enigma,指人/事物：神秘的，困惑的，难以理解的,"a person or thing that is mysterious, puzzling...",,,,
7,neutralized,使之无效/使不能正常运作,render (something) ineffective or harmless by ...,impatience at his frailty began to neutralize ...,,,
8,fickle,善变的(贬义，指兴趣/忠诚变化无端),,"Web patrons are a notoriously fickle lot, boun...",,,
9,partiality,偏袒(某人或某事),unfair bias in favor of one thing or person co...,an attack on the partiality of judges,,,


In [9]:
vocab=drop_unnamed_col(vocab)

In [10]:
vocabList = vocab['vocabulary'].tolist()

In [11]:
baseurl='https://www.vocabulary.com/dictionary/'

In [12]:
EMPTY_CELL = 'missing'

In [65]:
vocab.drop('short',axis=1)
vocab.drop('long',axis=1)
vocab.drop('definition',axis=1)
add_new_col(vocab,'short','')
add_new_col(vocab,'long','')
add_new_col(vocab,'definition','')

## Test section

In [14]:
vocab = vocab.head(10)

In [44]:
vocab

Unnamed: 0,vocabulary,chinese,meaning,example,short,long,definition
0,cosmopolitan,指人因见多识广而到哪里都悠然自得的,familiar with and at ease in many different co...,"his knowledge of French, Italian, and Spanish ...","Your Aunt Eleanor, who's lived in six differen...",People who are cosmopolitan have an air of gla...,1. composed of people from or at home in many ...
1,clamorous,,making a loud and confused noise,,Clamorousmeans super loud and obnoxiously cryi...,"Clamorouscomes from the Latin rootclÄmĹr, me...",1. conspicuously and offensively loud; given t...
2,intransigent,指人固执己见的,characterized by refusal to compromise or to a...,,"Intransigentmeans inflexible, stubborn, entren...",Transhas to do with movement â thinktranspor...,"1. impervious to pleas, persuasion, requests, ..."
3,numinous,神圣的,,"the strange, numinous beauty of this ancient l...",Somethingnuminoushas a strong religious qualit...,"Numinouscomes from the Latinnumin-meaning ""div...",1. of or relating to or characteristic of a nu...
4,sterling,(某人的努力成果/人格)优秀的,"(of a person or their work, efforts, or qualit...",this organization does sterling work for young...,"Sterlingis British money, but it also describe...","Sterling is a word for British currency, and a...",1. highest in quality\n2. British money; espec...
5,ad hoc,特别的/临时形成针对…的,"formed, arranged, or done for a particular pur...",an ad hoc committee (亚人委员会),,,
6,enigma,指人/事物：神秘的，困惑的，难以理解的,"a person or thing that is mysterious, puzzling...",,Use the nounenigmato refer to something that i...,Traveling to English from Greek by means of th...,1. something that baffles understanding and ca...
7,neutralized,使之无效/使不能正常运作,render (something) ineffective or harmless by ...,impatience at his frailty began to neutralize ...,,,1. made neutral in some respect; deprived of d...
8,fickle,善变的(贬义，指兴趣/忠诚变化无端),,"Web patrons are a notoriously fickle lot, boun...",People who areficklechange their minds so much...,"Ficklecomes from the Old English wordficol, fo...",1. liable to sudden unpredictable change\n2. m...
9,partiality,偏袒(某人或某事),unfair bias in favor of one thing or person co...,an attack on the partiality of judges,Partialityis the habit of favoring something —...,Partialityis like bias. It means that your fav...,1. an inclination to favor one group or view o...


In [66]:
print(vocab.columns)
print(vocab.columns[6])

Index(['vocabulary', 'chinese', 'meaning', 'example', 'short', 'long',
       'definition'],
      dtype='object')
definition


In [95]:
restart_index=687

In [None]:

for index, entry in vocab.iterrows():
    
    if index < restart_index:
        continue
        
    restart_index = index
    
    # get the entry's word
    word = entry['vocabulary']
    
    # print record
    print("index " + str(index) + " | word: " + word)
    
    try:
        # Fetch HTML document and instantiate soup
        with urllib.request.urlopen(baseurl + word) as url:
            r=url.read()
    except Exception:
        continue
        
    soup = BeautifulSoup(r,'html.parser')

    # check if the word exists in vocabulary.com
    if len(soup.select('h1.dynamictext')) == 0: 
        continue
        
    # get contents we want
    shortContent = soup.select('p.short')
    longContent = soup.select('p.long')
    defContent = soup.select('h3.definition')
    
    word = soup.select('h1.dynamictext')

    # check if the word explanation exists
    if len(shortContent) > 0 and len(longContent)>0 :
        vocab.set_value(index,'short',shortContent[0].get_text(strip=True))
        vocab.set_value(index,'long',longContent[0].get_text(strip=True))
    
    contentStr=''
    for i in range(len(defContent)):
        # remove the <a> tag in definition
        defContent[i].a.extract()
        # the serial number of definition starts at 1 rather than 0
        contentStr += str(i+1) + '. ' + defContent[i].get_text(strip=True)+'\n'
    
    vocab.set_value(index,'definition',contentStr)
    
print('====================Finished=======================')

index 681 | word: hortatory


In [87]:
print(vocab['definition'][681])


