Skip to content

Commit

Permalink
Clean up chapters
Browse files Browse the repository at this point in the history
  • Loading branch information
Douwe Osinga committed Jan 16, 2018
1 parent baf1af3 commit 6dd4900
Show file tree
Hide file tree
Showing 13 changed files with 2,484 additions and 1,667 deletions.
710 changes: 710 additions & 0 deletions 03.1 Using pre build word embeddings.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -39,33 +39,44 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"index = requests.get('https://dumps.wikimedia.org/enwiki/').text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"soup_index = BeautifulSoup(index, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['20171001/',\n",
" '20171020/',\n",
" '20171103/',\n",
" '20171120/',\n",
" '20171201/',\n",
" '20171220/',\n",
" '20180101/']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dumps = [a['href'] for a in soup_index.find_all('a') \n",
" if a.has_attr('href') and a.text[:-1].isdigit()]\n",
Expand All @@ -81,14 +92,14 @@
"outputs": [],
"source": [
"for dump_url in sorted(dumps, reverse=True):\n",
" print(dump_url)\n",
" dump_html = index = requests.get('https://dumps.wikimedia.org/enwiki/' + dump_url).text\n",
" soup_dump = BeautifulSoup(dump_html, 'html.parser')\n",
" pages_xml = [a['href'] for a in soup_dump.find_all('a') \n",
" if a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]\n",
" if pages_xml:\n",
" break\n",
" time.sleep(0.8)"
" print(dump_url)\n",
" dump_html = index = requests.get('https://dumps.wikimedia.org/enwiki/' + dump_url).text\n",
" soup_dump = BeautifulSoup(dump_html, 'html.parser')\n",
" pages_xml = [a['href'] for a in soup_dump.find_all('a') \n",
" if a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]\n",
" if pages_xml:\n",
" break\n",
" time.sleep(0.8)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with open('data/wp_movies_10k.ndjson') as fin:\n",
Expand Down Expand Up @@ -109,7 +111,9 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def movie_embedding_model(embedding_size=30):\n",
Expand Down Expand Up @@ -386,7 +390,9 @@
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])\n",
Expand Down Expand Up @@ -515,7 +521,9 @@
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"gross_y = np.asarray([gr for gr in movie_gross if gr])\n",
Expand Down
695 changes: 0 additions & 695 deletions 1.1 Using pre build word embeddings.ipynb

This file was deleted.

Loading

0 comments on commit 6dd4900

Please sign in to comment.