In [122]:
import csv
import pandas as pd
import random
import spacy
import nltk

crawledDataSetNames = ['dataset-reddit-business.csv', 'dataset-reddit-entertainment.csv', 
'dataset-reddit-parenting.csv', 'dataset-reddit-politics.csv', 
'dataset-reddit-sports.csv', 'dataset-reddit-travel.csv']

categories = ['business', 'entertainment', 'parenting', 'politics', 'sports', 'travel']


read csv and concat title and content

In [123]:
dataset = pd.DataFrame(columns = {'text','category'})
for crawledDataSetName in crawledDataSetNames:
    df = pd.read_csv('./dataset/%s'%crawledDataSetName)
    
    #replace Nan to empty string
    df = df.fillna('')

    #'titleAndContent' Column is concat of 'title' and 'content'
    df['text']=df['title']+' '+df['content']
    dataset = pd.concat([dataset, df[['text', 'category']]])

#shuffle row
dataset=dataset.sample(frac=1).reset_index(drop=True)
print(dataset)

                                                   text       category
0     Bills receiver Cole Beasley has been fined mul...         sports
1     Kaishu Hirano goes sky high during the Men’s S...         sports
2     Will Smith Did a Bad, Bad Thing. Slapping Chri...  entertainment
3     Russian Stocks Tumble Most Since Crimea Annexa...       business
4     Trump Funding Network Paid $4.3 Million To Peo...       politics
...                                                 ...            ...
5936                           Solo Trip to Lebanon 🇱🇧          travel
5937  'Fire DeJoy' Demand Intensifies as 10-Year Pla...       politics
5938  California grocery workers vote to authorize s...       business
5939  Historic Leak of Swiss Banking Records Reveals...       business
5940  Jonathan Groff Calls Keanu Reeves the 'Greates...  entertainment

[5941 rows x 2 columns]


preprocessing

In [124]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import string

#make all characters to lower case
dataset['text']=dataset['text'].apply(lambda x: str.lower(x))

#word tokenization
dataset['text'] = dataset['text'].apply(lambda x: wordpunct_tokenize(x))

#delete stopwords and punctuation
stopwordList = set(stopwords.words('english') + list(string.punctuation))
dataset['text'] = dataset['text'].apply(lambda x: list(word for word in x if word.isalpha()))

print(dataset)

                                                   text       category
0     [bills, receiver, cole, beasley, has, been, fi...         sports
1     [kaishu, hirano, goes, sky, high, during, the,...         sports
2     [will, smith, did, a, bad, bad, thing, slappin...  entertainment
3     [russian, stocks, tumble, most, since, crimea,...       business
4     [trump, funding, network, paid, million, to, p...       politics
...                                                 ...            ...
5936                          [solo, trip, to, lebanon]         travel
5937  [fire, dejoy, demand, intensifies, as, year, p...       politics
5938  [california, grocery, workers, vote, to, autho...       business
5939  [historic, leak, of, swiss, banking, records, ...       business
5940  [jonathan, groff, calls, keanu, reeves, the, g...  entertainment

[5941 rows x 2 columns]


make bag of words column

In [125]:
from collections import defaultdict
from nltk.probability import FreqDist

allTextList = sum(dataset['text'].tolist(),[])
numOfdict = len(set(allTextList))
print('the number of word dict: ',numOfdict)


the number of word dict:  17677


In [126]:
#the number of words to use for training
numOfFeatureWords = numOfdict


fdist = FreqDist(allTextList)
wordDict = list(word for word, freq in fdist.most_common(numOfFeatureWords))
def bagOfWords(tokens):
    d = defaultdict(int,{ word:0 for word in wordDict })
    for token in tokens:
        d[token]+=1
    ret = []
    for key, val in d.items():
        ret.append(val)
    return ret[:numOfFeatureWords]

dataset['bagOfWords'] = dataset['text'].apply(lambda x: bagOfWords(x))

print(dataset['bagOfWords'])

0       [0, 1, 3, 1, 2, 0, 1, 3, 0, 0, 2, 0, 0, 0, 0, ...
1       [0, 0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...
2       [0, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
5936    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5937    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5938    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5939    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
5940    [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
Name: bagOfWords, Length: 5941, dtype: object


divide trainset and testset

In [127]:
trainSplit = 0.8
testSplit = 0.2

datasetSize = dataset.shape[0]

trainSize = int(datasetSize * trainSplit)
testSize = int(datasetSize * testSplit)

print('trainSize: ', trainSize)
print('testSize: ', testSize)

trainSet = dataset.iloc[:trainSize, :]
testSet = dataset.iloc[trainSize:, :]

print(trainSet.shape[0])
print(testSet.shape[0])

trainSize:  4752
testSize:  1188
4752
1189


build a Bag of Words model

In [128]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
classifier = GaussianNB()
trainX = np.array(trainSet['bagOfWords'].tolist())
trainY = np.array(trainSet['category'].tolist())
print(trainX.shape, trainY.shape)
classifier.fit(trainX, trainY)

(4752, 17677) (4752,)


GaussianNB()

In [129]:
from sklearn.metrics import accuracy_score

testX = np.array(testSet['bagOfWords'].tolist())
testY = np.array(testSet['category'].tolist())
print(testX.shape, testY.shape)
predY = classifier.predict(testX)
accuracy = accuracy_score(testY, predY)
print('accuracy :', accuracy)

(1189, 17677) (1189,)
accuracy : 0.7981497056349874
