In [33]:
import csv
import pandas as pd
import random
import spacy
import nltk

crawledDataSetNames = ['dataset-reddit-business.csv', 'dataset-reddit-entertainment.csv', 
'dataset-reddit-parenting.csv', 'dataset-reddit-politics.csv', 
'dataset-reddit-sports.csv', 'dataset-reddit-travel.csv']

categories = ['business', 'entertainment', 'parenting', 'politics', 'sports', 'travel']


read csv and concat title and content

In [34]:
dataset = pd.DataFrame(columns = {'text','category'})
for crawledDataSetName in crawledDataSetNames:
    df = pd.read_csv('./dataset/%s'%crawledDataSetName)
    
    #replace Nan to empty string
    df = df.fillna('')

    #'titleAndContent' Column is concat of 'title' and 'content'
    df['text']=df['title']+' '+df['content']
    dataset = pd.concat([dataset, df[['text', 'category']]])

#shuffle row
dataset=dataset.sample(frac=1).reset_index(drop=True)
print(dataset)

                                                   text   category
0       This week in Bidenomics: The rich will survive    business
1     Today my son asked my wife to legally adopt hi...  parenting
2     Fiji has won gold in Rugby Sevens - Men’s at T...     sports
3     Trump Ordered Staff to 'Bust Some Heads' of Bl...   politics
4                    Fell in love with Porto, Portugal      travel
...                                                 ...        ...
5936  Lawmakers Talk Next Steps For Marijuana Bankin...   business
5937  Norwegian women's beach-handball team forced t...     sports
5938  Noam Chomsky: ‘Republican Party has drifted of...   politics
5939  How employers steal billions of dollars from w...   business
5940  Don’t hold back because of kids.. take them on...     travel

[5941 rows x 2 columns]


preprocessing

In [35]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

#make all characters to lower case
dataset['text']=dataset['text'].apply(lambda x: str.lower(x))

#word tokenization
dataset['text'] = dataset['text'].apply(lambda x: word_tokenize(x))

#delete stopwords and punctuation
stopwordList = set(stopwords.words('english') + list(string.punctuation))
dataset['text'] = dataset['text'].apply(lambda x: list(word for word in x if word.isalpha()))

print(dataset)

                                                   text   category
0     [this, week, in, bidenomics, the, rich, will, ...   business
1     [today, my, son, asked, my, wife, to, legally,...  parenting
2     [fiji, has, won, gold, in, rugby, sevens, men,...     sports
3     [trump, ordered, staff, to, some, heads, of, b...   politics
4               [fell, in, love, with, porto, portugal]     travel
...                                                 ...        ...
5936  [lawmakers, talk, next, steps, for, marijuana,...   business
5937  [norwegian, women, team, forced, to, pay, fine...     sports
5938  [noam, chomsky, republican, party, has, drifte...   politics
5939  [how, employers, steal, billions, of, dollars,...   business
5940  [don, t, hold, back, because, of, kids, take, ...     travel

[5941 rows x 2 columns]


make bag of words column

In [48]:
from collections import defaultdict
from nltk.probability import FreqDist

allTextList = sum(dataset['text'].tolist(),[])
numOfdict = len(set(allTextList))
print('the number of word dict: ',numOfdict)


the number of word dict:  17395
0       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
1       [5, 3, 0, 1, 0, 7, 2, 0, 3, 2, 1, 1, 3, 0, 1, ...
2       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
5936    [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
5937    [0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
5938    [0, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...
5939    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
5940    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: bagOfWords, Length: 5941, dtype: object


In [73]:
#the number of words to use for training
numOfFeatureWords = numOfdict-4000


fdist = FreqDist(allTextList)
wordDict = list(word for word, freq in fdist.most_common(numOfFeatureWords))
def bagOfWords(tokens):
    d = defaultdict(int,{ word:0 for word in wordDict })
    for token in tokens:
        d[token]+=1
    ret = []
    for key, val in d.items():
        ret.append(val)
    return ret[:numOfFeatureWords]

dataset['bagOfWords'] = dataset['text'].apply(lambda x: bagOfWords(x))

print(dataset['bagOfWords'])

0       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
1       [5, 3, 0, 1, 0, 7, 2, 0, 3, 2, 1, 1, 3, 0, 1, ...
2       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
5936    [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
5937    [0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
5938    [0, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...
5939    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
5940    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: bagOfWords, Length: 5941, dtype: object


add column of one hot encoding for category.

In [74]:
dataset['labels'] = dataset['category'].apply(lambda x: dict(defaultdict(bool,{ category: category == x for category in categories })))
print(dataset['labels'])

0       {'business': True, 'entertainment': False, 'pa...
1       {'business': False, 'entertainment': False, 'p...
2       {'business': False, 'entertainment': False, 'p...
3       {'business': False, 'entertainment': False, 'p...
4       {'business': False, 'entertainment': False, 'p...
                              ...                        
5936    {'business': True, 'entertainment': False, 'pa...
5937    {'business': False, 'entertainment': False, 'p...
5938    {'business': False, 'entertainment': False, 'p...
5939    {'business': True, 'entertainment': False, 'pa...
5940    {'business': False, 'entertainment': False, 'p...
Name: labels, Length: 5941, dtype: object


divide trainset and testset

In [75]:
trainSplit = 0.7
testSplit = 0.3

datasetSize = dataset.shape[0]

trainSize = int(datasetSize * trainSplit)
testSize = int(datasetSize * testSplit)

print('trainSize: ', trainSize)
print('testSize: ', testSize)

trainSet = dataset.iloc[:trainSize, :]
testSet = dataset.iloc[trainSize:, :]

print(trainSet.shape[0])
print(testSet.shape[0])

trainSize:  4158
testSize:  1782
4158
1783


build a Bag of Words model

In [76]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
classifier = GaussianNB()
trainX = np.array(trainSet['bagOfWords'].tolist())
trainY = np.array(trainSet['category'].tolist())
print(trainX.shape, trainY.shape)
classifier.fit(trainX, trainY)

(4158, 13395) (4158,)


GaussianNB()

In [77]:
from sklearn.metrics import accuracy_score

testX = np.array(testSet['bagOfWords'].tolist())
testY = np.array(testSet['category'].tolist())
print(testX.shape, testY.shape)
predY = classifier.predict(testX)
accuracy = accuracy_score(testY, predY)
print('accuracy :', accuracy)

(1783, 13395) (1783,)
accuracy : 0.7885586090858104
