In [299]:
import csv
import pandas as pd
import random
import spacy
import nltk

crawledDataSetNames = ['dataset-reddit-business.csv', 'dataset-reddit-entertainment.csv', 
'dataset-reddit-parenting.csv', 'dataset-reddit-politics.csv', 
'dataset-reddit-sports.csv', 'dataset-reddit-travel.csv']

categories = ['business', 'entertainment', 'parenting', 'politics', 'sports', 'travel']


read csv and concat title and content

In [300]:
dataset = pd.DataFrame(columns = {'text','category'})
for crawledDataSetName in crawledDataSetNames:
    df = pd.read_csv('./dataset/%s'%crawledDataSetName)
    
    #replace Nan to empty string
    df = df.fillna('')

    #'titleAndContent' Column is concat of 'title' and 'content'
    df['text']=df['title']+' '+df['content']
    dataset = pd.concat([dataset, df[['text', 'category']]])

#shuffle row
dataset=dataset.sample(frac=1).reset_index(drop=True)
print(dataset)

                                                   text       category
0     Fox News is wrong, Star Trek has always been "...  entertainment
1     Chris Cuomo fired after CNN learned of alleged...  entertainment
2      Judge Terminates Britney Spears Conservatorship   entertainment
3     Poll finds ‘Jeopardy!’ fans overwhelmingly wan...  entertainment
4     More Republicans have died of covid-19. Does t...       politics
...                                                 ...            ...
2035  Republicans warn Justice Department probe of T...       politics
2036  South Park Mocks Vladimir Putin and Addresses ...  entertainment
2037  Ryan Reynolds Taking A Break From Acting After...  entertainment
2038  Quentin Tarantino pitches Rambo: First Blood r...  entertainment
2039  Why Can’t Democrats Make GOP Extremism a Campa...       politics

[2040 rows x 2 columns]


preprocessing

In [301]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

#make all characters to lower case
dataset['text']=dataset['text'].apply(lambda x: str.lower(x))

#word tokenization
dataset['text'] = dataset['text'].apply(lambda x: word_tokenize(x))

#delete stopwords and punctuation
stopwordList = set(stopwords.words('english') + list(string.punctuation))
dataset['text'] = dataset['text'].apply(lambda x: list(word for word in x if word.isalpha()))

print(dataset)

                                                   text       category
0     [fox, news, is, wrong, star, trek, has, always...  entertainment
1     [chris, cuomo, fired, after, cnn, learned, of,...  entertainment
2     [judge, terminates, britney, spears, conservat...  entertainment
3     [poll, finds, jeopardy, fans, overwhelmingly, ...  entertainment
4     [more, republicans, have, died, of, does, that...       politics
...                                                 ...            ...
2035  [republicans, warn, justice, department, probe...       politics
2036  [south, park, mocks, vladimir, putin, and, add...  entertainment
2037  [ryan, reynolds, taking, a, break, from, actin...  entertainment
2038  [quentin, tarantino, pitches, rambo, first, bl...  entertainment
2039  [why, can, t, democrats, make, gop, extremism,...       politics

[2040 rows x 2 columns]


make bag of words column

In [302]:
from collections import defaultdict
from nltk.probability import FreqDist

#the number of words to use for training
numOfFeatureWords = 1000

allTextList = sum(dataset['text'].tolist(),[])
fdist = FreqDist(allTextList)
wordDict = list(word for word, freq in fdist.most_common(numOfFeatureWords))
def bagOfWords(tokens):
    d = defaultdict(int,{ word:0 for word in wordDict })
    for token in tokens:
        d[token]+=1
    ret = []
    for key, val in d.items():
        ret.append(val)
    return ret[:numOfFeatureWords]

dataset['bagOfWords'] = dataset['text'].apply(lambda x: bagOfWords(x))

print(dataset['bagOfWords'])


0       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
2035    [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
2036    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2037    [2, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, ...
2038    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
2039    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: bagOfWords, Length: 2040, dtype: object


add column of one hot encoding for category.

In [303]:
dataset['labels'] = dataset['category'].apply(lambda x: dict(defaultdict(bool,{ category: category == x for category in categories })))
print(dataset['labels'])

0       {'business': False, 'entertainment': True, 'pa...
1       {'business': False, 'entertainment': True, 'pa...
2       {'business': False, 'entertainment': True, 'pa...
3       {'business': False, 'entertainment': True, 'pa...
4       {'business': False, 'entertainment': False, 'p...
                              ...                        
2035    {'business': False, 'entertainment': False, 'p...
2036    {'business': False, 'entertainment': True, 'pa...
2037    {'business': False, 'entertainment': True, 'pa...
2038    {'business': False, 'entertainment': True, 'pa...
2039    {'business': False, 'entertainment': False, 'p...
Name: labels, Length: 2040, dtype: object


divide trainset and testset

In [304]:
trainSplit = 0.7
testSplit = 0.3

datasetSize = dataset.shape[0]

trainSize = int(datasetSize * trainSplit)
testSize = int(datasetSize * testSplit)

print('trainSize: ', trainSize)
print('testSize: ', testSize)

trainSet = dataset.iloc[:trainSize, :]
testSet = dataset.iloc[trainSize:, :]

print(trainSet.shape[0])
print(testSet.shape[0])

trainSize:  1428
testSize:  612
1428
612


build a Bag of Words model

In [305]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
classifier = GaussianNB()
trainX = np.array(trainSet['bagOfWords'].tolist())
trainY = np.array(trainSet['category'].tolist())
print(trainX.shape, trainY.shape)
classifier.fit(trainX, trainY)

(1428, 1000) (1428,)


GaussianNB()

In [307]:
from sklearn.metrics import accuracy_score

testX = np.array(testSet['bagOfWords'].tolist())
testY = np.array(testSet['category'].tolist())
print(testX.shape, testY.shape)
predY = classifier.predict(testX)
accuracy = accuracy_score(testY, predY)
print(accuracy)

(612, 1000) (612,)
['politics' 'politics' 'politics' 'politics' 'entertainment' 'politics'
 'politics' 'entertainment' 'politics' 'politics' 'politics'
 'entertainment' 'politics' 'politics' 'politics' 'entertainment'
 'entertainment' 'politics' 'politics' 'entertainment' 'politics'
 'entertainment' 'politics' 'entertainment' 'entertainment' 'politics'
 'politics' 'politics' 'politics' 'entertainment' 'politics'
 'entertainment' 'entertainment' 'entertainment' 'politics' 'politics'
 'politics' 'entertainment' 'politics' 'parenting' 'politics'
 'entertainment' 'politics' 'politics' 'entertainment' 'entertainment'
 'politics' 'politics' 'politics' 'politics' 'politics' 'politics'
 'politics' 'politics' 'politics' 'entertainment' 'politics'
 'entertainment' 'entertainment' 'entertainment' 'entertainment'
 'politics' 'entertainment' 'entertainment' 'politics' 'politics'
 'entertainment' 'entertainment' 'entertainment' 'entertainment'
 'entertainment' 'entertainment' 'entertainment' 'politi