In [1]:
import os
import subprocess
import tarfile
import urllib
import urllib.request

import pandas as pd

import checker

Download and create the data files by running the code below.

In [2]:
# Download the mini 20-newsgroups archive and unpack it under ./data.
mini_20newsgroup_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases"
    "/20newsgroups-mld/mini_newsgroups.tar.gz")
archive_path = 'newsgroups.tar.gz'

# Skip the network round-trip when the archive is already on disk, so that
# "Restart Kernel -> Run All" stays cheap.
if not os.path.exists(archive_path):
    # The whole payload is read before the file is opened, so an interrupted
    # download does not leave a truncated archive behind.
    with urllib.request.urlopen(mini_20newsgroup_url) as response:
        page_data = response.read()
    with open(archive_path, 'wb') as fout:
        fout.write(page_data)

os.makedirs('data', exist_ok=True)
with tarfile.open(archive_path) as tfin:
    # The archive comes from the network: where the interpreter supports
    # extraction filters, use the 'data' filter to reject members that would
    # escape the destination directory.
    if hasattr(tarfile, 'data_filter'):
        tfin.extractall(path='data', filter='data')
    else:
        tfin.extractall(path='data')

In [3]:
# Absolute locations of the data directory and of the extracted dataset,
# both anchored at the current working directory.
working_dir = os.getcwd()
base_data_dir = os.path.join(working_dir, 'data')
newsgroups_data_dir = os.path.join(working_dir, 'data', 'mini_newsgroups')

Several folders should now be visible under "mini_newsgroups".

In [4]:
# Display the entries extracted under the newsgroups directory.
# os.listdir makes no ordering guarantee, so this listing is not sorted.
os.listdir(newsgroups_data_dir)

['talk.politics.mideast',
 'rec.autos',
 'comp.sys.mac.hardware',
 'alt.atheism',
 'rec.sport.baseball',
 'comp.os.ms-windows.misc',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.med',
 'talk.politics.misc',
 'rec.motorcycles',
 'comp.windows.x',
 'comp.graphics',
 'comp.sys.ibm.pc.hardware',
 'sci.electronics',
 'talk.politics.guns',
 'sci.space',
 'soc.religion.christian',
 'misc.forsale',
 'talk.religion.misc']

In [5]:
# Gather the full path of every file found anywhere below the newsgroups directory.
files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(newsgroups_data_dir)
    for name in names
]

## Problem 1: Create document labels

In order to classify emails by topic, each email needs to be labelled with the newsgroup it was sent to. The newsgroup of each email corresponds to the folder the email file is in (additional newsgroups are available within the email but will be ignored). The labels will be a number 0-19 representing the newsgroups in sorted order, i.e.

    alt.atheism -> 0
    comp.graphics -> 1
    ...
    talk.politics.misc -> 18
    talk.religion.misc -> 19

Build a Docker image called `newsgroups` that produces a CSV file `data/labels.csv` in the format shown below when run with the argument `label.py`:

    filename,label
    alt.atheism/51121,0
    ...

In [20]:
# Build the label table for Problem 1: one row per email, labelled with the
# position (0-19) of its newsgroup folder in sorted order.

# Sort explicitly: os.listdir returns entries in arbitrary order, and the
# label of a newsgroup is defined by its rank in the sorted folder names.
newsgroup_names = sorted(
    entry for entry in os.listdir(newsgroups_data_dir)
    if os.path.isdir(os.path.join(newsgroups_data_dir, entry))
)
label_by_newsgroup = {name: label for label, name in enumerate(newsgroup_names)}

# Filenames are written as "<newsgroup>/<email>" with a forward slash on every
# platform, matching the required CSV format (e.g. "alt.atheism/51121").
label_records = [
    {"filename": "{0}/{1}".format(newsgroup, email),
     "label": label_by_newsgroup[newsgroup]}
    for newsgroup in newsgroup_names
    for email in sorted(os.listdir(os.path.join(newsgroups_data_dir, newsgroup)))
]
labels_df = pd.DataFrame(label_records, columns=["filename", "label"])

# index=False keeps the file to exactly the two columns `filename,label`;
# the default would prepend an unnamed index column.
labels_df.to_csv(os.path.join(base_data_dir, "labels.csv"), index=False)

labels_df.head()

In [7]:
# Run Docker container based on image `newsgroups`.
# The arguments are given as a list rather than split from a formatted string,
# so a data directory whose path contains spaces still reaches Docker as a
# single `-v` value.
command = [
    "docker", "run", "--rm",
    "-v", "{0}:/data".format(base_data_dir),
    "newsgroups", "label.py",
]
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Output result of running command
print("Return code (0 is good): " + str(result.returncode))
print("stdout:\n" + result.stdout.decode("utf-8"))
print("stderr:\n" + result.stderr.decode("utf-8"))

Return code (0 is good): 125
stdout:

stderr:
Unable to find image 'newsgroups:latest' locally
docker: Error response from daemon: pull access denied for newsgroups, repository does not exist or may require 'docker login'.
See 'docker run --help'.



In [None]:
# Compare the generated label file against the reference solution.
# NOTE(review): `checker` is a project-local module not shown here; presumably
# csv_match raises on a mismatch so the message below prints only on success — confirm.
checker.csv_match('data/labels.csv', 'solutions/labels.csv')
print("Everything looks good!")

## Problem 2: Email as bag-of-words

Convert each email into a bag-of-words representation. This kind of representation is frequently used to represent text documents in preparation for classification models. Produce a `npz` file which contains one row per email (emails sorted alphabetically according to file path) and one column per unique word (sorted alphabetically) with the value being the number of times the word was used in the email. You will need to store this table as a sparse matrix.

Update the Docker image from Problem 1 so that it produces a `npz` file at `data/count.npz` when called with the argument `bag.py`.

See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html for how to save a sparse matrix as an `npz` file.

NOTE: If you are seeing some kind of decoding error (e.g. `UnicodeDecodeError`) the files are encoded as `latin-1`.

In [None]:
# Run Docker container based on image `newsgroups`.
# The arguments are given as a list rather than split from a formatted string,
# so a data directory whose path contains spaces still reaches Docker as a
# single `-v` value.
command = [
    "docker", "run", "--rm",
    "-v", "{0}:/data".format(base_data_dir),
    "newsgroups", "bag.py",
]
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Output result of running command
print("Return code (0 is good): " + str(result.returncode))
print("stdout:\n" + result.stdout.decode("utf-8"))
print("stderr:\n" + result.stderr.decode("utf-8"))

In [None]:
# Compare the generated sparse count matrix against the reference solution.
# NOTE(review): `checker` is a project-local module not shown here; presumably
# sparse_npz_match raises on a mismatch so the message below prints only on success — confirm.
checker.sparse_npz_match('data/count.npz', 'solutions/count.npz')
print("Everything looks good!")