In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
df = pd.read_csv('data/NYT_data_cleaned.csv')

In [3]:
del df['Unnamed: 0']

In [4]:
y_vals = df.section

In [5]:
x_df = df[['id', 'headline', 'snippet']]
x_df.head()

Unnamed: 0,id,headline,snippet
0,59f89f9895d0e0246f213289,Cynthia Nixon to Host the National Book Awards,Cynthia Nixon's new role will be on the bookis...
1,59f89eb195d0e0246f213282,Borghese Gallery Gathers a Full House of Berni...,The most comprehensive exhibition of the Baroq...
2,59f89c5795d0e0246f213277,‘Stranger Things 2’: Pixar’s Andrew Stanton on...,The director of “Finding Nemo” and “Wall-E” di...
3,59f8972e95d0e0246f213265,Judge Accidentally Reveals Winner of Hot Bakin...,"A judge on the Great British Bake Off, a widel..."
4,59f893bb95d0e0246f21325a,2 Best Friends in a Charming Aussie Comedy Abo...,It's time to make two new friends: Celia Pacqu...


### Clean up data: Remove all punctutation
##### Note: This goes a little too far and makes "it's" become "its". Couldn't think of a good workaround

In [6]:
def remove_punctuation(string):
    if not (type(string) is str):
        return ''
    string = string.lower()
    exclude = set('!@#$%^&*()-=_+1234567890{}[]|\:;"<,>./?`~\'’')
    return ''.join(ch for ch in string if ch not in exclude)
headline = list(map(remove_punctuation, df.headline))
snippet = list(map(remove_punctuation, df.snippet))

### Combining 'headline' and 'snippet' data

In [7]:
tokenized_words = map(lambda x, y: set(x.split() + y.split()), 
                                  headline, snippet)
tokenized_words = list(tokenized_words)

In [8]:
print(tokenized_words[4])

{'about', 'in', 'luke', 'friendship', 'two', 'a', 'its', 'charming', 'to', 'new', 'pacquola', 'comedy', 'time', 'celia', 'mcgregor', 'best', 'make', 'and', 'aussie', 'friends'}


In [9]:
x_df['tokenized_words'] = tokenized_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
x_df.head()

Unnamed: 0,id,headline,snippet,tokenized_words
0,59f89f9895d0e0246f213289,Cynthia Nixon to Host the National Book Awards,Cynthia Nixon's new role will be on the bookis...,"{on, the, side, nixons, role, awards, to, cynt..."
1,59f89eb195d0e0246f213282,Borghese Gallery Gathers a Full House of Berni...,The most comprehensive exhibition of the Baroq...,"{exhibition, around, borghese, of, maestro, mo..."
2,59f89c5795d0e0246f213277,‘Stranger Things 2’: Pixar’s Andrew Stanton on...,The director of “Finding Nemo” and “Wall-E” di...,"{project, pixars, discussed, ‘stranger, being,..."
3,59f8972e95d0e0246f213265,Judge Accidentally Reveals Winner of Hot Bakin...,"A judge on the Great British Bake Off, a widel...","{forced, after, episode, revealing, competitio..."
4,59f893bb95d0e0246f21325a,2 Best Friends in a Charming Aussie Comedy Abo...,It's time to make two new friends: Celia Pacqu...,"{about, in, luke, friendship, two, a, its, cha..."


### Mapping each word to a number and then using that number to be an entry in an NxM matrix
Where N is the number of observations and M is the number of times that word is present in that column

In [83]:
unique_words = set()
for words in x_df.tokenized_words:
    unique_words.update(words)

In [84]:
word_to_index_map = {}
index_to_word_map = []
for i, word in enumerate(unique_words):
    word_to_index_map[word] = i
    index_to_word_map.append(word)

In [85]:
x_df.shape

(16990, 4)

In [86]:
x_arr = np.zeros((x_df.shape[0], len(unique_words)))

In [87]:
for row, tokens in enumerate(x_df.tokenized_words):
    for token in tokens:
        col = word_to_index_map[token]
        x_arr[row, col] += 1

In [88]:
y_vec = np.zeros((len(y_vals)))

In [17]:
categories = ['Arts', 'Business', 'Obituaries', 'Sports', 'World']
def cat_to_number(cat):
    return categories.index(cat)
y_vec = np.array(list(map(cat_to_number, y_vals)) ) 

In [18]:
y_vec

array([0, 0, 0, ..., 4, 4, 4])

In [19]:
x_train = np.random.choice(x_df.shape[0], x_df.shape[0] // 2, replace = False)

In [20]:
x_test = [x for x in range(x_df.shape[0]) if x not in x_train]

In [21]:
print(x_arr.shape)
print(x_df.shape)
word_count_by_class = np.zeros((5, x_arr.shape[1]))
print(word_count_by_class.shape)
print(len(x_train))

(16990, 30926)
(16990, 4)
(5, 30926)
8495


In [70]:
train_set=x_train
alpha=3
beta=3

num_features = x_arr.shape[1]
num_obs = len(train_set)
num_categories = 5

obs_count_by_class = np.zeros((num_categories, 1))
for i in train_set:
    obs_count_by_class[y_vec[i], 0] += 1
print(obs_count_by_class)
theta_c = [count / len(train_set) for count in obs_count_by_class[:,0]]

obs_count_by_class = obs_count_by_class + alpha + beta - 2

word_count_by_class = np.zeros((num_categories, num_features))
print(word_count_by_class.shape)
for ix, i in enumerate(train_set):
    word_count_by_class[y_vec[i], :] += x_arr[i, :]
word_count_by_class = word_count_by_class + alpha - 1
        
theta_jc = np.zeros(word_count_by_class.shape)
obs_count_by_class = np.tile(obs_count_by_class, (1, num_features))
theta_jc = np.divide(word_count_by_class, obs_count_by_class)


[[ 1455.]
 [ 1745.]
 [ 2017.]
 [ 1816.]
 [ 1462.]]
(5, 30926)


In [71]:
print(theta_c)

[0.17127722189523248, 0.20541494997057091, 0.23743378457916423, 0.21377280753384345, 0.17210123602118893]


In [73]:
w_jc = np.zeros(theta_jc.shape)
w_zeroc = np.zeros([1,num_categories])
for i in range(num_categories):
    for j in range(num_features):
        w_jc[i, j] = math.log((theta_jc[i, j] * (1 - theta_jc[0, j])) / (theta_jc[0, j]*(1 - theta_jc[i, j])))
        w_zeroc[0, i] += math.log((1 - theta_jc[i, j]) / (1 - 
                        theta_jc[0,j])) + math.log(theta_c[i] / theta_c[0])



In [81]:
print(w_jc.shape)
print(w_zeroc)

(5, 30926)
[[     0.           5622.1002104   10106.96920755   6859.9363636
     142.39831563]]


In [77]:
def predict(w_jc, w_zeroc, x_arr, y_vec, test_set):
    num_categories = 5
    num_features = x_arr.shape[1]
    predictions = np.zeros((num_categories, len(test_set)))
    for i,obs in enumerate(test_set):
        for cat in range(num_categories):
            predictions[cat, i] += np.sum(w_jc[cat, :]*x_arr[obs, :])
            predictions[cat,i] += w_zeroc[0, cat]
    return predictions

predictions = predict(w_jc,w_zeroc, x_arr, y_vec, x_test)

correct = 0
s = set()
for i, test_obs in enumerate(x_test):
    s.add(np.argmax(predictions[: , i]))
    correct += np.argmax(predictions[: , i]) == y_vec[test_obs]
print(s)

{2}


In [78]:
print(predictions[])

[[     0.              0.              0.         ...,      0.              0.
       0.        ]
 [  5612.29681142   5618.14707852   5612.67352068 ...,   5611.27105936
    5638.56216011   5623.14906878]
 [ 10097.55072354  10097.78839271  10105.33155527 ...,  10099.30836788
   10106.03710634  10113.27915004]
 [  6849.98924013   6854.39310685   6854.49894121 ...,   6847.99172512
    6862.6889999    6856.65044487]
 [   134.16022741    146.02413237    133.94002363 ...,    143.57414038
     179.69790061    168.44941801]]
