In [131]:
import csv
import pandas as pd
import random
import spacy
import nltk

crawledDataSetNames = ['dataset-reddit-business.csv', 'dataset-reddit-entertainment.csv', 
'dataset-reddit-parenting.csv', 'dataset-reddit-politics.csv', 
'dataset-reddit-sports.csv', 'dataset-reddit-travel.csv']

categories = ['business', 'entertainment', 'parenting', 'politics', 'sports', 'travel']


read csv and concat title and content

In [132]:
dataset = pd.DataFrame(columns = {'text','category'})
for crawledDataSetName in crawledDataSetNames:
    df = pd.read_csv('./dataset/%s'%crawledDataSetName)
    
    #replace Nan to empty string
    df = df.fillna('')

    #'titleAndContent' Column is concat of 'title' and 'content'
    df['text']=df['title']+' '+df['content']
    dataset = pd.concat([dataset, df[['text', 'category']]])

#shuffle row
dataset=dataset.sample(frac=1).reset_index(drop=True)
print(dataset)

                                                   text       category
0     Kyoto with no Tourist feels so strange but odd...         travel
1     Israeli soccer fans subjected to antisemitic a...         sports
2     Requiring Vaccination to See New Baby My husba...      parenting
3     ABBA’s greatest hits album, “ABBA Gold: Greate...  entertainment
4     Jason Momoa already wants a 'Villeneuve Cut' o...  entertainment
...                                                 ...            ...
5936  Another cool-down post - Photos from my four c...         travel
5937  My transatlantic flight today was the emptiest...         travel
5938                         Edinburgh, Scotland. 2022          travel
5939  Cristiano Ronaldo scores in his first Champion...         sports
5940  George Bush delights Democrats, infuriates MAG...       politics

[5941 rows x 2 columns]


preprocessing

In [133]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import string

#make all characters to lower case
dataset['text']=dataset['text'].apply(lambda x: str.lower(x))

#word tokenization
dataset['text'] = dataset['text'].apply(lambda x: wordpunct_tokenize(x))

#delete stopwords and punctuation
stopwordList = set(stopwords.words('english') + list(string.punctuation))
dataset['text'] = dataset['text'].apply(lambda x: list(word for word in x if word.isalpha()))

print(dataset)

                                                   text       category
0     [kyoto, with, no, tourist, feels, so, strange,...         travel
1     [israeli, soccer, fans, subjected, to, antisem...         sports
2     [requiring, vaccination, to, see, new, baby, m...      parenting
3     [abba, s, greatest, hits, album, abba, gold, g...  entertainment
4     [jason, momoa, already, wants, a, villeneuve, ...  entertainment
...                                                 ...            ...
5936  [another, cool, down, post, photos, from, my, ...         travel
5937  [my, transatlantic, flight, today, was, the, e...         travel
5938                              [edinburgh, scotland]         travel
5939  [cristiano, ronaldo, scores, in, his, first, c...         sports
5940  [george, bush, delights, democrats, infuriates...       politics

[5941 rows x 2 columns]


make bag of words column

In [134]:
from collections import defaultdict
from nltk.probability import FreqDist

allTextList = sum(dataset['text'].tolist(),[])
numOfdict = len(set(allTextList))
print('the number of word dict: ',numOfdict)


the number of word dict:  17677


In [173]:
#the number of words to use for training
numOfFeatureWords = int(numOfdict*0.95)

fdist = FreqDist(allTextList)

wordDict = list(word for word, freq in fdist.most_common(numOfFeatureWords))
def bagOfWords(tokens):
    d = defaultdict(int,{ word:0 for word in wordDict })
    for token in tokens:
        d[token]+=1
    ret = []
    for key, val in d.items():
        ret.append(val)
    return ret[:numOfFeatureWords]

dataset['bagOfWords'] = dataset['text'].apply(lambda x: bagOfWords(x))

print(dataset['bagOfWords'])

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
2       [3, 8, 4, 5, 2, 5, 3, 1, 3, 0, 1, 3, 2, 2, 0, ...
3       [0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
4       [2, 4, 1, 0, 2, 0, 2, 0, 1, 1, 0, 0, 0, 1, 0, ...
                              ...                        
5936    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5937    [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5938    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5939    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
5940    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: bagOfWords, Length: 5941, dtype: object


divide trainset and testset

In [174]:
trainSplit = 0.7
testSplit = 0.3

datasetSize = dataset.shape[0]

trainSize = int(datasetSize * trainSplit)
testSize = int(datasetSize * testSplit)

print('trainSize: ', trainSize)
print('testSize: ', testSize)

trainSet = dataset.iloc[:trainSize, :]
testSet = dataset.iloc[trainSize:, :]

print(trainSet.shape[0])
print(testSet.shape[0])

trainSize:  4158
testSize:  1782
4158
1783


build a Bag of Words model

In [175]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
classifier = MultinomialNB()
trainX = np.array(trainSet['bagOfWords'].tolist())
trainY = np.array(trainSet['category'].tolist())
print(trainX.shape, trainY.shape)
classifier.fit(trainX, trainY)

(4158, 16793) (4158,)


MultinomialNB()

In [176]:
from sklearn.metrics import accuracy_score

testX = np.array(testSet['bagOfWords'].tolist())
testY = np.array(testSet['category'].tolist())
print(testX.shape, testY.shape)
predY = classifier.predict(testX)
predYProb = classifier.predict_proba(testX)
accuracy = accuracy_score(testY, predY)
print('accuracy :', accuracy)
print(predYProb)

(1783, 16793) (1783,)
accuracy : 0.8485698261357263
[[1.82887945e-01 4.47672959e-02 1.19445014e-04 1.04053667e-01
  8.88350292e-02 5.79336617e-01]
 [2.04454400e-18 1.69851805e-16 6.88674533e-06 3.32998752e-14
  4.87969458e-15 9.99993113e-01]
 [5.53408307e-05 1.11941058e-04 1.84667887e-06 1.79680225e-05
  9.99768537e-01 4.43662885e-05]
 ...
 [1.27074265e-01 5.76118036e-02 1.16244992e-03 5.41898684e-02
  2.28557406e-01 5.31404206e-01]
 [1.29070918e-09 1.37034487e-09 4.65720951e-14 2.26428064e-11
  9.99999997e-01 1.50221736e-10]
 [2.21910238e-04 3.20877405e-04 1.71759406e-09 9.98559876e-01
  8.18865347e-04 7.84692956e-05]]
