In [1]:
import hashlib # for grading

# Standard imports
import numpy as np
from numpy.testing import assert_allclose, assert_almost_equal
import pandas as pd
import re
import string
import math
import warnings; warnings.simplefilter('ignore')

# NLTK imports
import nltk
nltk.download('stopwords')

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# SKLearn related imports
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/francisco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Q1. S&P 500 Companies

For the first question, you will be making use of regex. In particular, you have a list of companies currently in the S&P 500, their [stock tickers](https://en.wikipedia.org/wiki/Ticker_symbol) (an abbreviation used to uniquely identify publicly traded shares of a particular stock on a particular stock market), and their industries, and you'll have to answer some very specific questions about that list.

Start by loading the data into a list:


In [2]:
path = "data/SP500.txt"
companies = []
with open(path, 'r', encoding='utf-8') as f:
    companies = [l.strip() for l in f.readlines()]

In [3]:
# check the format
companies[:5]

['3M Company (MMM) -- Industrials',
 'Abbott Laboratories (ABT) -- Health Care',
 'AbbVie Inc. (ABBV) -- Health Care',
 'ABIOMED Inc (ABMD) -- Health Care',
 'Accenture plc (ACN) -- Information Technology']

In the first item, for example, `3M Company` is the company name, `MMM` is the ticker symbol, and `Industrials` is the industry.

#### Q1.a)

First, we want to know which companies belong to the Real Estate or Health Care sectors. Return the full strings that include these companies in a list assigned to a variable `ans`.

In [4]:
# ans = [ ... ]
pattern = ".*Real Estate$|.*Health Care$"
ans =[]
for text in companies :
    b=re.findall(pattern, text)
    if b!=[]:
        ans.append(b[0])


In [5]:
assert len(ans) == 93
assert hashlib.sha256(' '.join(ans).encode()).hexdigest() == \
    'b25ef38e29cc7d975a651e93fc201b9a83cfdb35a0a79d6068d1e29325d3fa8f'

#### Q1.b)

Next, among all companies, find the companies that start with an initial consisting of a capital letter followed by a period (e.g. `A.`). Return a list of the companies (the full strings) in the variable `ans_initials`.

In [6]:
pattern = "^[A-Z]{1}\..*"
ans_initials =[]
for text in companies :
    b=re.findall(pattern, text)
    if b!=[]:
        ans_initials.append(b[0])
ans_initials

['A.O. Smith Corp (AOS) -- Industrials',
 'C. H. Robinson Worldwide (CHRW) -- Industrials',
 'D. R. Horton (DHI) -- Consumer Discretionary',
 'J. B. Hunt Transport Services (JBHT) -- Industrials',
 'T. Rowe Price Group (TROW) -- Financials',
 'U.S. Bancorp (USB) -- Financials',
 'W. R. Berkley Corporation (WRB) -- Financials']

In [7]:
print("Number of companies starting with an initial: " , len(ans_initials))
assert 'A.O. Smith Corp (AOS) -- Industrials' in ans_initials
assert 'D. R. Horton (DHI) -- Consumer Discretionary' in ans_initials
assert 'Arthur J. Gallagher & Co. (AJG) -- Financials' not in ans_initials
assert 'Berkshire Hathaway (BRK.B) -- Financials' not in ans_initials
assert hashlib.sha256(' '.join(ans_initials).encode()).hexdigest() == \
    '999cbcf37711021cecdda234015f87d8785f0d25f1200d43be4cf0ac7239aaa7'
assert len(ans_initials) == 7

Number of companies starting with an initial:  7


#### Q1.c)

Now, extract only the company names whose stock tickers contain just a single letter. For example, if the string looks like `Lisbon Data Science Academy (L) -- Education`, return just `Lisbon Data Science Academy`. Store the company names as a list called `ans_single`.

For an extra challenge, try to do this using just one regex pattern. You may want to use `re.search()` and read about [capturing groups](https://docs.python.org/3/howto/regex.html#grouping), and don't forget you can use tools like https://regex101.com/ to test your regexes. 

In [8]:
# ans_single = [ ... ]
ans_single =[]

for text in companies :
    p=re.compile('(.*)(\s)\([A-Z]\)')
    b=p.search(text)
    if b != None :
        ans_single.append(b.group(1))
    
ans_single

['Agilent Technologies Inc',
 'AT&T Inc.',
 'Citigroup Inc.',
 'Dominion Energy',
 'Ford Motor Company',
 'Jacobs Engineering Group',
 'Kellogg Co.',
 'Loews Corp.',
 'Realty Income Corporation',
 'Visa Inc.']

In [9]:
assert len(ans_single) == 10
assert hashlib.sha256(' '.join(ans_single).encode()).hexdigest() == 'b449c503fabbec88da870a63b6bda074496741113d7c04f204ad597a31ca23fb'

## Q2. Sports News (preprocessing)

Here is a subset of data taken from the 20 Newsgroups dataset, a classic text classification dataset, which we can download directly from [scikit-learn](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset). To keep things simple, we will only be focusing on two of the categories, `rec.sport.baseball` and `rec.sport.hockey`. Our goal will be to classify whether news articles are about the sport of baseball or hockey.

First, let's prepare the data:

In [10]:
# This is how the data was originally downloaded

# from sklearn.datasets import fetch_20newsgroups

# categories = [
#  'rec.sport.baseball',
#  'rec.sport.hockey',
# ]

# # returns a list of strings X representing the articles to classify, and the category labels y as a numpy array
# X, y = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"),
#                           categories=categories, return_X_y=True)

In [11]:
# We will load the data from pickle files instead

import pickle

with open("data/20_newsgroups_baseball_hockey_X.pkl", "rb") as f:
    X = pickle.load(f)
with open("data/20_newsgroups_baseball_hockey_y.pkl", "rb") as f:
    y = pickle.load(f)

Let's check the data size and distribution of classes:

In [12]:
def get_data_stats(X, y):
    print(f"Size of dataset: {len(X)}")
    unique, counts = np.unique(y, return_counts=True)
    print(f"Distribution of classes: {dict(zip(unique, counts))}")

get_data_stats(X, y)

Size of dataset: 1993
Distribution of classes: {0: 994, 1: 999}


Since the classes are evenly distributed, we can use a regular train/dev/test split. We'll use a dev and test size of 10% of the full dataset.

**Note**: So far you've using the `train`/`val`/`test` nomenclature for naming variables related to training, validation and test sets, respectively. `dev` is short for "development" and is just another typicaly identifier for the validation set, and we'll use it throghout this notebook instead of `val`

In [13]:
# train dev test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print(f"Train size: {len(X_train)}\nDev size: {len(X_dev)}\nTest size: {len(X_test)}")

Train size: 1594
Dev size: 199
Test size: 200


Since the goal is to turn the strings on X into useful features, now we will be performing common preprocessing operations on the texts.

#### Q2.a)

First tokenize the data. Implement the function to receive a list of strings and an NLTK-style tokenizer, and return the list but with tokenized strings.

In [14]:
x="".join(X_train) 
x



In [15]:
def apply_tokenizer(data, tokenizer):
    """
    Returns a list of sentences that have been tokenized with the provided `tokenizer`
    
    E.g. for an input ["This is a test!", "No, it can't be"],
         it should return ["This is a test !", "No , it can ' t be"]
    
    Args:
    data - list of strings containing the text to tokenize
    tokenizer - nltk tokenizer
    """
    data_tok=[]
    for s in data:
        b=tokenizer.tokenize(s)
        data_ = " ".join(b)
        data_tok.append(data_)
        
    
    return data_tok
    #tokenizer.tokenize(data)
   

In [16]:
tokenizer = WordPunctTokenizer()
data_tok = apply_tokenizer(X_train, tokenizer)
data_tok

['',
 "I ' m the keeper of the stats for a family hockey pool and I ' m looking for daily / weekly email servers for playoff stats . I ' ve connected with the servers at J . Militzok @ skidmore . EDU and wilson @ cs . ucf . edu . I ' m still sorting these two out . Are there others ? Email please as my site doesn ' t get this group . Thanks . Rick",
 "The event that had the most impact on Gaetti ' s career was his leg injury in 1988 . His performance dropped radically from 1988 to 1989 . He was still with the Twins in 1989 and 1990 , but if you look at his stats ( both offensive and defensive ), he never has come back to his pre - injury level .",
 'Hartford 1 1 3 -- 5 NY Rangers 1 2 1 -- 4 First period 1 , Hartford , Cunneyworth 5 ( Janssens , Greig ) 12 : 21 . 2 , NY Rangers , Graves 34 ( Turcotte , Zubov ) 18 : 39 . Second period 3 , NY Rangers , Kovalev 19 ( Turcotte , Graves ) 2 : 12 . 4 , Hartford , Sanderson 44 ( Cassels ) pp , 4 : 54 . 5 , NY Rangers , Amonte 30 ( Andersson , V

In [17]:
len(data_tok)

1594

In [18]:
bla=[w for s in data_tok for w in s.split(" ")]
len(bla)

333566

In [19]:
tokenizer = WordPunctTokenizer()
data_tok = apply_tokenizer(X_train, tokenizer)

assert len(data_tok) == 1594
assert isinstance (data_tok, list)
assert all(isinstance(sentence, str) for sentence in data_tok)

assert len([w for s in data_tok for w in s.split(" ")]) == 333566
assert hashlib.sha256(data_tok[1234].encode()).hexdigest() == \
    'bd70c45292aafb1430f3dae58dd7b3732ff2ddebc8acb64976a38efbe7945215'
assert hashlib.sha256(data_tok[567].encode()).hexdigest() == \
    '0de358e31d950ef321b2f3d762b525a8ab1e65e0a2c392633e6984e40253f2e4'

#### Q2.b)

The second step you will implement is lowercasing the data.

In [20]:
data_tok

['',
 "I ' m the keeper of the stats for a family hockey pool and I ' m looking for daily / weekly email servers for playoff stats . I ' ve connected with the servers at J . Militzok @ skidmore . EDU and wilson @ cs . ucf . edu . I ' m still sorting these two out . Are there others ? Email please as my site doesn ' t get this group . Thanks . Rick",
 "The event that had the most impact on Gaetti ' s career was his leg injury in 1988 . His performance dropped radically from 1988 to 1989 . He was still with the Twins in 1989 and 1990 , but if you look at his stats ( both offensive and defensive ), he never has come back to his pre - injury level .",
 'Hartford 1 1 3 -- 5 NY Rangers 1 2 1 -- 4 First period 1 , Hartford , Cunneyworth 5 ( Janssens , Greig ) 12 : 21 . 2 , NY Rangers , Graves 34 ( Turcotte , Zubov ) 18 : 39 . Second period 3 , NY Rangers , Kovalev 19 ( Turcotte , Graves ) 2 : 12 . 4 , Hartford , Sanderson 44 ( Cassels ) pp , 4 : 54 . 5 , NY Rangers , Amonte 30 ( Andersson , V

In [21]:
def apply_lowercase(data):
    """
    Returns a list of strings, with all the tokens lowercased.
    
    Args:
    data - list of strings to be lowercased
    """
    data_tok_lc=[]
    for s in data:
        b=s.lower()
        data_tok_lc.append(b)
        
        
        
    
    return data_tok_lc
    

In [22]:
data_tok_lc = apply_lowercase(data_tok)


In [23]:
data_tok_lc = apply_lowercase(data_tok)

assert len(data_tok_lc) == 1594
assert isinstance (data_tok, list)
assert all(isinstance(sentence, str) for sentence in data_tok)

assert len([w for s in data_tok_lc for w in s.split(" ")]) == 333566
assert hashlib.sha256(data_tok_lc[1234].encode()).hexdigest() == \
    'e12bd8bec884721329792d49085e8e6b268c8129da7ed638b06ccdf3ea49c7a5'
assert hashlib.sha256(data_tok_lc[567].encode()).hexdigest() == \
    '4476776aa3ea52c7580c592bfeb0e2286a79ec1bb615188518a167a73429b424'

#### Q2.c)

Now implement a function that filters the stopwords. We will use NLTK's built-in English stopword list.

In [24]:
stopword_list = stopwords.words('english')

In [25]:
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [26]:
bla=[]
i=2
for word in data_tok_lc:
    print(word)
    i-=1
    bla.append(word)
    if i==0:
        break


i ' m the keeper of the stats for a family hockey pool and i ' m looking for daily / weekly email servers for playoff stats . i ' ve connected with the servers at j . militzok @ skidmore . edu and wilson @ cs . ucf . edu . i ' m still sorting these two out . are there others ? email please as my site doesn ' t get this group . thanks . rick


In [27]:
new =bla[1]

print([word for word in tokenizer.tokenize(new) if word not in stopword_list])


["'", 'keeper', 'stats', 'family', 'hockey', 'pool', "'", 'looking', 'daily', '/', 'weekly', 'email', 'servers', 'playoff', 'stats', '.', "'", 'connected', 'servers', 'j', '.', 'militzok', '@', 'skidmore', '.', 'edu', 'wilson', '@', 'cs', '.', 'ucf', '.', 'edu', '.', "'", 'still', 'sorting', 'two', '.', 'others', '?', 'email', 'please', 'site', "'", 'get', 'group', '.', 'thanks', '.', 'rick']


In [28]:
def apply_filter_stopwords(data, stopword_list):
    """
    Returns a list of strings, where the strings do not contain any of
        the stopwords in the given list.
    
    Args:
    data - list of strings to filter stopwords from
    stopword_list - list of stopwords to filter out
    """
    data_no_stopwords=[]
    for words in data :
        removed = [word for word in tokenizer.tokenize(words) if word not in stopword_list]
        strings = ' '.join(removed)
        data_no_stopwords.append(strings)
     
    return data_no_stopwords


In [29]:
data_tok_lc_nosw = apply_filter_stopwords(data_tok_lc, stopword_list)

assert len(data_tok_lc_nosw) == 1594
assert isinstance (data_tok, list)
assert all(isinstance(sentence, str) for sentence in data_tok)

assert len([w for s in data_tok_lc_nosw for w in s.split(" ")]) == 239401
assert hashlib.sha256(data_tok_lc_nosw[1234].encode()).hexdigest() == \
    '9d48d50a7a0dcd3676c13419750a3aad75006e165d2c614e0fe7e8f98d521b84'
assert hashlib.sha256(data_tok_lc_nosw[567].encode()).hexdigest() == \
    'd1d745ba13b4588eb2356dd3a5119113e391bfe226bc4126338391bf03830ecf'

#### Q2.d)

After filtering stopwords, we want to remove punctuation from the text as well. Make use of `string.punctuation` to do so. Make sure to remove all punctuation and not only tokens that are single punctuation characters. 

_Hint: check the note on punctuation in Part 2 of the learning notebooks_

In [30]:
def apply_filter_punct(data):
    """
    Returns a list of tokenized sentences with no punctuation.
    
    Args:
    data - list of tokenized sentences from which to remove punctuation
    """

    data_tok_lc_nosw_nopunct=[]
    for words in data :
        removed = [word for word in words if word not in string.punctuation]
        
        strings = ''.join(removed)
        
        data_tok_lc_nosw_nopunct.append(strings)
     
    return data_tok_lc_nosw_nopunct


In [31]:
data_tok_lc_nosw_nopunct = apply_filter_punct(data_tok_lc_nosw)


#### Normalize whitespaces

Run the following function on `data_tok_lc_nosw_nopunct` before checking your answers, in case extra whitespaces cause the asserts to fail:

In [32]:
def normalize_whitespace(data):
    return [re.sub(r"^\s+|\s+$|(?<=\s)\s*", "", text) for text in data]

data_tok_lc_nosw_nopunct_norm = normalize_whitespace(data_tok_lc_nosw_nopunct)

In [33]:
assert len(data_tok_lc_nosw_nopunct_norm) == 1594
assert len([w for s in data_tok_lc_nosw_nopunct_norm for w in s.split(" ")]) == 174486
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm[1234].encode()).hexdigest() == \
    '57b0c8646701140d4edfb7112b812c0df7cdebfe7db1a58811c9229f906f6ca9'
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm[567].encode()).hexdigest() == \
    'a39ba5c27525d163573d484284aff0228118db129aeae1623020ac11b063a952'

#### Q2.e)

The last preprocessing step you are going to implement is stemming. Implement the function to receive an NLTK-style stemmer and return the text as a string with the stemmer applied.

In [34]:
def apply_stemmer(data, stemmer):
    """
    Returns a list of strings, with stemmed data.
    
    Args:
    data - list with text to stem
    stemmer - instance of stemmer to use
    """

    # YOUR CODE HERE
    raise NotImplementedError()

In [35]:
stemmer = SnowballStemmer("english")
data_tok_lc_nosw_nopunct_norm_stem = apply_stemmer(data_tok_lc_nosw_nopunct_norm, stemmer)

assert len(data_tok_lc_nosw_nopunct_norm_stem) == 1594
assert len([w for s in data_tok_lc_nosw_nopunct_norm_stem for w in s.split(" ")]) == 174486
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm_stem[1234].encode()).hexdigest() == \
    'f8e83239a3658073219232f17c270a0df20d4cebfb517ce935d395e02467009f'
assert hashlib.sha256(data_tok_lc_nosw_nopunct_norm_stem[567].encode()).hexdigest() == \
    '4f36be21767ee2ad747baecf0d67b8b082c8c65f334fce896036870a75570fb0'

NotImplementedError: 

#### Q2.f)

Finally, join everything in a function, that applies the steps in the following order:
* Tokenization
* Lowercasing
* Filtering stopwords
* Filtering punctuation
* Normalizing whitespace
* Stemming

Make use of the functions you designed above when filling the transformer below.


In [None]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, lower=True, remove_punct=True, stopwords=[], stemmer=None):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.lower = lower
        self.remove_punct = remove_punct
        self.stopwords = stopwords
    
    def clean_sentences(self, data):
                
        # Tokenize sentence so it each sentence contains spaced words or tokens
        # sentences_preprocessed = ...
        # YOUR CODE HERE
        raise NotImplementedError()
        
        # Lowercase
        if self.lower:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()

        if self.stopwords:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()
            
        # Remove punctuation
        if self.remove_punct:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()
        
        # Normalize whitespace
        # sentences_preprocessed = ...
        # YOUR CODE HERE
        raise NotImplementedError()
    
        # Stem words
        if self.stemmer:
            # sentences_preprocessed = ...
            # YOUR CODE HERE
            raise NotImplementedError()

        return sentences_preprocessed


In [None]:
text_cleaner = TextCleanerTransformer(
    WordPunctTokenizer(),
    lower=True, 
    remove_punct=True, 
    stopwords=stopwords.words('english'),
    stemmer=SnowballStemmer("english"),
)

X_train_pre = text_cleaner.clean_sentences(X_train)

In [None]:
assert len(X_train_pre) == 1594
assert len([w for s in X_train_pre for w in s.split(" ")]) == 174486
assert X_train_pre[1234] == ("mcgwire carter see justif bond thoma tend higher bat averag major differ "
    "see mcgwire carter carter draw walk pitcher afraid throw strike carter")
assert X_train_pre[567] == ("best one saw last year willi mcgee matthew think philli fierc line "
    "drive still rise hit second deck facad vet willi mcgee one homerun last year")

## Q3. Text classification

We will now use what we've learned to try to classify the topic of these articles as baseball or hockey. Let's first load the preprocessed data (slightly different from the answer to Q2) and double-check the balance of the classes:

In [None]:
def load_dataset(file_name):
    """
    Loads a tsv file with the label in the first column and the text in the second column.
    Returns two lists, one containing only the text and one containing the labels
    
    Args:
    file_name: path to input file
    """
    labels = []
    texts = []
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            label, text = line.strip().split("\t")
            labels.append(int(label))
            texts.append(text)
    return texts, labels

In [None]:
X_train_pre, y_train = load_dataset('data/sports_train_preprocessed.tsv')
X_dev_pre, y_dev = load_dataset('data/sports_dev_preprocessed.tsv')
X_test_pre, y_test = load_dataset('data/sports_test_preprocessed.tsv')

In [None]:
get_data_stats(X_train_pre, y_train)

In [None]:
get_data_stats(X_dev_pre, y_dev)

So, we should be aiming for much better than 53% accuracy, which is what we would get if we naively predicted `1` (hockey) for everything.

#### Q3.a)

First, we'll look at the top X ngrams in each category to see if anything is interesting. Write a function that returns the most common n-grams and their count for a specific label in our dataset.


In [None]:
def top_ngrams_for_category(text, labels, filter_label, top_n=10, ngram_size=1):
    """
    Filters the data to the desired label, constructs a counter of ngrams
    Returns the top n ngrams
    
    Args:
    text: list of text strings to get ngrams from
    labels: categories corresponding to text
    filter_label: the label to filter the data on before getting ngrams
    top_n: top n ngrams to return
    ngram_size: the "n" in ngram (e.g. if ngram_size=2, return only bigrams)
    """
    # First, filter text to desired category
    # text_filtered = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Create list of ngrams
    # ngram_list = ...
    # hint: make the ngrams in the list an immutable data type so they can be used as dict keys later,
    # tuples for example
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Count occurances of each ngram
    # hint: use collections.Counter
    # ngram_counter = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # return top_n most common ngrams
    # return ...
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
top_10_unigrams_baseball = top_ngrams_for_category(X_train_pre, y_train, 0, top_n=10, ngram_size=1)
assert top_10_unigrams_baseball == [(('0',), 1899),
                                     (('1',), 718),
                                     (('game',), 561),
                                     (('year',), 505),
                                     (('2',), 481),
                                     (('3',), 453),
                                     (('5',), 408),
                                     (('would',), 383),
                                     (('4',), 342),
                                     (('one',), 311)]
top_7_bigrams_baseball = top_ngrams_for_category(X_train_pre, y_train, 0, top_n=7, ngram_size=2)
assert top_7_bigrams_baseball == [(('0', '0'), 240),
                                 (('last', 'year'), 118),
                                 (('1', '0'), 94),
                                 (('0', '1'), 72),
                                 (('new', 'york'), 66),
                                 (('00', '00'), 66),
                                 (('1', '2'), 61)]
top_10_unigrams_hockey = top_ngrams_for_category(X_train_pre, y_train, 1, top_n=10, ngram_size=1)
assert top_10_unigrams_hockey == [(('0',), 5104),
                                     (('1',), 3657),
                                     (('2',), 2571),
                                     (('3',), 1804),
                                     (('4',), 1569),
                                     (('6',), 1159),
                                     (('5',), 1135),
                                     (('7',), 989),
                                     (('game',), 972),
                                     (('team',), 743)]
top_5_trigrams_hockey = top_ngrams_for_category(X_train_pre, y_train, 1, top_n=5, ngram_size=3)
assert top_5_trigrams_hockey == [(('0', '0', '0'), 691),
                                 (('0', '1', '1'), 422),
                                 (('1', '0', '1'), 303),
                                 (('1', '0', '0'), 184),
                                 (('1', '1', '0'), 171)]

Looking at the top ngrams for each category, it doesn't seem like a BoW model will be very interesting, but let's try anyway.

#### Q3.b)
To begin, let's streamline our pipeline in a nice function. We'll use sklearn's `CountVectorizer` instead of the function we wrote. We'll also use the `MultinomialNB` classifier to make predictions on the dev set.

In [None]:
def train_and_validate(X_train, X_dev, y_train, y_dev, ngram_range=(1,1), max_features=None):
    """
    Train a model using sklearn's Pipeline and return it along with the predictions and the
    current accuracy in the validation set. Print the classification report as well.
    Assume the documents are already preprocessed
    
    Args:
    X_train - preprocessed articles in training data
    X_dev - preprocessed articles in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    """
    
    # Build the pipeline containing the countvectorizer and the multinomial NB classifier
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # y_dev_pred = (...)
    
    # acc = (...)
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # print the classification report
    print(classification_report(y_dev, y_dev_pred))

    # return text_clf, y_dev_pred, acc
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev)

# check same as before
assert_allclose(clf['clf'].intercept_, np.array([-0.68084531]), rtol=1e-3)
assert ' '.join(str(i) for i in y_dev_pred[:20]) == "0 1 0 0 1 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0"
assert hashlib.sha256(' '.join(str(i) for i in y_dev_pred).encode()).hexdigest() == \
    "5e67c5da0e5fc28a5d834ee9f12b93bc7c20bd26347fc9f02a36d50bc000b573"
assert_allclose(acc, 0.91, rtol=1e-2)

In [None]:
# we should also look at some misclassified examples
for text, pred, true in zip(X_dev_pre[:50], y_dev_pred[:50], y_dev[:50]):
    if pred != true:
        print(f"Sentence: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

So just with the simplest BoW model we already get an accuracy of 0.91! But let's see if we can do even better... In the misclassified examples, the last one even contains the word "hockey" but was misclassified. And slightly tricker, but the first example contains the team name "Anaheim Ducks," which is an NHL team. We should be able to get those right.

#### Q3.c)
Run the pipeline for different ngram ranges and/or with different values for max_features, until you get an accuracy of at least 94%.

In [None]:
# clf,y_dev_pred, acc = train_and_validate(...)
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert(acc >= 0.94)

Now evaluate your model on the test set!

In [None]:
X_test_vec = clf['vect'].transform(X_test_pre)
y_test_pred = clf['clf'].predict(X_test_vec)
print(classification_report(y_test, y_test_pred))

In [None]:
# look at some misclassified examples again
for text, pred, true in zip(X_dev_pre[:50], y_dev_pred[:50], y_dev[:50]):
    if pred != true:
        print(f"Sentence: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

Depending on your chosen hyperparameters, the 2 examples we missed earlier that we should have gotten should be correct now! But let's see if we can get even better performance now by using the relative importance of ngrams with TF-IDF.

## Q4. TF-IDF

Similarly to how we found the top ngrams before we started working with BoW, we will now find the most important unigrams, inverse weighted by document frequency.

**Note**: Throughout this exercise, we'll use this term - **most important** - to refer to the unigrams of highest weight, in particular within each class, since these will be transfered into vectors used for training. It is not 100% true these features will necessarily be "the most importnant", but in general they will usually be relevant, especially if your dataset is well processed, so we'll use this expression as a proxy.

#### Q4.a)

First, implement TF-IDF on a dataframe representing a Bag of Words model of the data. Here is a reminder of the TF-IDF formula:

$$ tfidf _{t, d} =(log{(1 + tf_{t,d})})*(log{(1 + \frac{N}{df_{t}})})  $$


In [None]:
# We'll start you off with the BoW representation, in pandas dataframe format
vec = CountVectorizer()
BoW_train = vec.fit_transform(X_train_pre)
BoW_train_df = pd.DataFrame(BoW_train.todense())

In [None]:
def tfidf(BoW_df):
    """
    Returns pandas dataframe of a tfidf representation from a BoW representation dataframe.

    Args:
    BoW_df - dataframe with document word counts (Bag of Words)
    """
    # remember that the BoW representation is raw counts, it is not normalized by the length of each text
    # first transform the df into term frequencies, where the counts are normalized
    # also double check the formula above for additional transformations applied to the tf expression
    # use np.log(x) for natural base log
    # tf = (...)
    # YOUR CODE HERE
    raise NotImplementedError()

    # now we need a function that computes the idf side of the expression
    # it operates over a column (a word in the vocab), where the column contains each doc's count of that word
    # def _idf(column):
    #   return (...)
    # YOUR CODE HERE
    raise NotImplementedError()

    # now weight the term frequencies by the idfs
    # tf_idf = (...)
    # YOUR CODE HERE
    raise NotImplementedError()

    return tf_idf

In [None]:
tfidf_df = tfidf(BoW_train_df)
assert math.isclose(tfidf_df[12609][2], 0.0587996, abs_tol=0.0001)
assert math.isclose(tfidf_df[0][1531], 0.0174387, abs_tol=0.0001)
assert math.isclose(tfidf_df[8][6], 0.0092900, abs_tol=0.0001)

### Q4.b)

Now that we have our TF-IDF representation, we can proceed with getting the most important words per category. 

Let's write a small helper function first to get the vocabulary in the right format.

In [None]:
# get the vocab from CountVectorizer, which is of the format {"word": idx, ...}
vocab_word_2_idx = vec.vocabulary_

In [None]:
# write a function to convert this vocab to the format {idx: "word", ...}

def reverse_vocab(vocab_word_to_index):
    """
    Converts a vocabulary dictionary with words as keys and indices as values to a 
        new dictionary with indices as keys and words as values
    
    Args:
    vocab_word_to_index: vocabulary dict of the format {"word": 0, "hello": 1, ...}
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
vocab_idx_2_word = reverse_vocab(vocab_word_2_idx)
assert len(vocab_idx_2_word) == 12613
assert vocab_idx_2_word[11714] == "two"
assert vocab_idx_2_word[2847] == "boston"
assert vocab_idx_2_word[8762] == "palmer"

### Q4.c)

Finally, write a function to return the list of N most important words in a single category according to TF-IDF.

In [None]:
def top_tfidf_words_for_category(tfidf_df, labels, filter_label, vocabulary, top_n=10):
    """
    Returns the top n most important words for the given label, with the given vocabulary
    and corresponding tfidf representation of some text data
    
    Args:
    tfidf_df: a dataframe of the tfidf representation of some text (columns=words, rows=documents)
    labels: categories corresponding to documents in tfidf_df
    filter_label: the label to filter the data on before getting top n words
    vocabulary: a dict of the format {idx: "word", ...}
    top_n: top n words to return
    """
    # First, filter tfidf to desired category
    # tfidf_filt = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Get the top n words of the current label, according to tfidf
    # There are several ways to do this, but here are some hints
    # 1) Sum the filtered df to get the total value per word
    # 2) Sort
    # 3) Replace indices with words and return the top_n
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
top_15_hockey = top_tfidf_words_for_category(tfidf_df, y_train, 1, vocab_idx_2_word, top_n=15)
assert top_15_hockey == ['game',
     'team',
     'hockey',
     'play',
     'would',
     'player',
     'playoff',
     'go',
     'one',
     'year',
     'nhl',
     'get',
     'espn',
     'like',
     'goal']
top_20_baseball = top_tfidf_words_for_category(tfidf_df, y_train, 0, vocab_idx_2_word, top_n=20)
assert top_20_baseball == ['game',
     'year',
     'basebal',
     'pitch',
     'hit',
     'run',
     'would',
     'think',
     'pitcher',
     'one',
     'know',
     'cub',
     'day',
     'like',
     'first',
     'time',
     'anyon',
     'go',
     'team',
     'get']

As you can see, these top words per category make a lot more sense than the ones from just BoW. Maybe this will help us make better predictions!

### Q4.d)

Now, we will put everything together. Rewrite the train_and_validate function from Q3.b but using sklearn's `TfIdfTransformer`. Also, add kwargs for CountVectorizer's `max_df` and `min_df`.

In [None]:
def train_and_validate_with_tfidf(X_train, X_dev, y_train, y_dev, ngram_range=(1,1),
                                  max_features=None, max_df=1.0, min_df=1):
    """
    Train a model using sklearn's Pipeline and return it along with the predictions and the
    current accuracy in the validation set. Print the classification report as well.
    Assume the documents are already preprocessed
    
    Args:
    X_train - preprocessed articles in training data
    X_dev - preprocessed articles in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    max_df = max_df for CountVectorizer (int or float)
    min_df = min_df for CountVectorizer (int or float)
    """
    
    # Build a pipeline containing the countvectorizer, tfidftransformer and the multinomial NB classifier
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # y_dev_pred = (...)
    # print the classification report
    # acc = (...)
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
    print(classification_report(y_dev, y_dev_pred))

    # return text_clf, y_dev_pred, acc
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
clf, y_dev_pred, acc = train_and_validate_with_tfidf(X_train_pre, X_dev_pre, y_train, y_dev)

assert hashlib.sha256(" ".join([str(x) for x in list(y_dev_pred)]).encode()).hexdigest() == \
    "1beda6d3a1226853b58518db5f7fcba362722f1f97b53f0af5c71c53fc28f686"
assert_allclose(acc, 0.93367, rtol=1e-3)

In [None]:
# As before, we should also look at some misclassified examples
for text, pred, true in zip(X_dev_pre[:50], y_dev_pred[:50], y_dev[:50]):
    if pred != true:
        print(f"Sentence: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

Since "outfield" is clearly a baseball word, let's keep going and see if we can do even better.

#### Q4.e)

Use the `train_and_validate_with_tfidf` function you created before to train with different hyperparameters and get an accuracy score above 94% on the validation dataset. (This threshold is the same as what we got for plain CountVectorizer)


In [None]:
# clf, _, acc = train_and_validate_with_tfidf(...)
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
assert(acc > 0.94)

Now evaluate your model on the test set!

In [None]:
X_test_vec = clf['tfidf'].transform(clf['vect'].transform(X_test_pre))
y_test_pred = clf['clf'].predict(X_test_vec)
print(classification_report(y_test, y_test_pred))

We ended up not being able to beat our baseline of BoW with TF-IDF, maybe because the dataset is small and very easy, and so a simple algorithm was enough. Still, in general, it's good to try TF-IDF for text classification tasks and have an understanding of how your results change with different hyperparameters!