In [2]:
import csv
import pandas as pd
import random
import spacy
import nltk

crawledDataSetNames = ['dataset-reddit-business.csv', 'dataset-reddit-entertainment.csv', 
'dataset-reddit-parenting.csv', 'dataset-reddit-politics.csv', 
'dataset-reddit-sports.csv', 'dataset-reddit-travel.csv']

categories = ['business', 'entertainment', 'parenting', 'politics', 'sports', 'travel']




read csv and concat title and content

In [3]:
dataset = pd.DataFrame(columns = {'text','category'})
for crawledDataSetName in crawledDataSetNames:
    df = pd.read_csv('./dataset/%s'%crawledDataSetName)
    
    #replace Nan to empty string
    df = df.fillna('')

    #'titleAndContent' Column is concat of 'title' and 'content'
    df['text']=df['title']+' '+df['content']
    dataset = pd.concat([dataset, df[['text', 'category']]])

#shuffle row
dataset=dataset.sample(frac=1).reset_index(drop=True)
print(dataset)

       category                                               text
0      politics  It’s clear capitalism isn’t working when US po...
1      business  Nestle to suspend many products in Russia incl...
2      politics  Capitol rioters called Nancy Pelosi's office l...
3      business  Johnson & Johnson Plans to Split Into Two Publ...
4        travel                                 My trip to Egypt! 
...         ...                                                ...
5936   business  Peloton insiders sold nearly $500 million in s...
5937     travel                      Cotswolds! Quite picturesque 
5938     travel  Just came back from a trip to Amsterdam and it...
5939  parenting  is it ok to check the baby monitor at night? m...
5940   business  Self-driving Waymo trucks to haul loads betwee...

[5941 rows x 2 columns]


preprocessing

In [4]:
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import string
def preprocessing(data):
    #make all characters to lower case
    data['text']=data['text'].apply(lambda x: str.lower(x))

    #word tokenization
    data['text'] = data['text'].apply(lambda x: wordpunct_tokenize(x))

    #delete stopwords and punctuation
    stopwordList = set(stopwords.words('english') + list(string.punctuation))
    data['text'] = data['text'].apply(lambda x: list(word for word in x if word.isalpha()))

    print(data)
    return data

dataset = preprocessing(dataset)

       category                                               text
0      politics  [it, s, clear, capitalism, isn, t, working, wh...
1      business  [nestle, to, suspend, many, products, in, russ...
2      politics  [capitol, rioters, called, nancy, pelosi, s, o...
3      business  [johnson, johnson, plans, to, split, into, two...
4        travel                              [my, trip, to, egypt]
...         ...                                                ...
5936   business  [peloton, insiders, sold, nearly, million, in,...
5937     travel                    [cotswolds, quite, picturesque]
5938     travel  [just, came, back, from, a, trip, to, amsterda...
5939  parenting  [is, it, ok, to, check, the, baby, monitor, at...
5940   business  [self, driving, waymo, trucks, to, haul, loads...

[5941 rows x 2 columns]


In [5]:
from collections import defaultdict
from nltk.probability import FreqDist

def makingBagOfWordsCol(data):
    allTextList = sum(data['text'].tolist(),[])
    numOfdict = len(set(allTextList))
    print('the number of word dict: ',numOfdict)
    
    #the number of words to use for training
    numOfFeatureWords = int(numOfdict*0.95)

    fdist = FreqDist(allTextList)

    wordDict = list(word for word, freq in fdist.most_common(numOfFeatureWords))

    def bagOfWords(tokens):
        d = defaultdict(int,{ word:0 for word in wordDict })

        for token in tokens:
            d[token]+=1

        ret = []

        for key, val in d.items():
            ret.append(val)

        return ret[:numOfFeatureWords]

    data['bagOfWords'] = data['text'].apply(lambda x: bagOfWords(x))
    return data

dataset = makingBagOfWordsCol(dataset)

print(dataset['bagOfWords'])

the number of word dict:  17677
0       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, ...
1       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...
3       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
5936    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
5937    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5938    [1, 2, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ...
5939    [15, 8, 14, 10, 11, 7, 5, 5, 4, 0, 3, 6, 11, 4...
5940    [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: bagOfWords, Length: 5941, dtype: object


divide trainset and testset

In [6]:
trainSplit = 0.7
testSplit = 0.3

datasetSize = dataset.shape[0]

trainSize = int(datasetSize * trainSplit)
testSize = int(datasetSize * testSplit)

print('trainSize: ', trainSize)
print('testSize: ', testSize)

trainSet = dataset.iloc[:trainSize, :]
testSet = dataset.iloc[trainSize:, :]

print(trainSet.shape[0])
print(testSet.shape[0])

trainSize:  4158
testSize:  1782
4158
1783


build a Bag of Words model

In [7]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
classifier = MultinomialNB()
trainX = np.array(trainSet['bagOfWords'].tolist())
trainY = np.array(trainSet['category'].tolist())
print(trainX.shape, trainY.shape)
classifier.fit(trainX, trainY)

(4158, 16793) (4158,)


MultinomialNB()

In [8]:
from sklearn.metrics import accuracy_score

testX = np.array(testSet['bagOfWords'].tolist())
testY = np.array(testSet['category'].tolist())
print(testX.shape, testY.shape)
predY = classifier.predict(testX)
predYProb = classifier.predict_proba(testX)
accuracy = accuracy_score(testY, predY)
print('accuracy :', accuracy)
print(predYProb)

(1783, 16793) (1783,)
accuracy : 0.8536174985978687
[[2.30842083e-032 1.29014272e-030 9.99999970e-001 5.93177885e-033
  2.81372117e-040 3.01653441e-008]
 [9.66655711e-001 2.38529777e-002 9.49058345e-007 3.21792043e-003
  6.02899141e-003 2.43450098e-004]
 [1.47046059e-006 3.72800759e-003 1.12150299e-008 9.96270068e-001
  3.95971623e-007 4.63220606e-008]
 ...
 [4.06442601e-012 1.23649827e-010 9.17573238e-001 6.66788001e-012
  1.09827772e-012 8.24267617e-002]
 [3.04510346e-300 2.41853358e-259 1.00000000e+000 1.04319799e-262
  9.97469132e-320 2.12884942e-192]
 [8.81282946e-001 1.68604498e-002 3.19169407e-005 2.95384142e-003
  1.12718492e-002 8.75989969e-002]]


read news category dataset and concat headline and short_description

In [9]:
originNewsCategory = ['BUSINESS', 'ENTERTAINMENT', 'PARENTING', 'POLITICS', 'SPORTS', 'TRAVEL']

newsDataset = pd.read_json("./dataset/News_category_Dataset_v2.json", lines=True)
newsDataset = newsDataset[newsDataset['category'].isin(originNewsCategory)]
newsDataset['category'] = newsDataset['category'].apply(lambda x: x.lower())

newsDataset['text'] = newsDataset['headline'] + ' ' + newsDataset['short_description']
newsDataset = newsDataset[['text','category']]
#shuffle row
newsDataset=newsDataset.sample(frac=1).reset_index(drop=True)
print(newsDataset['text'])

0        Who Would Win in a Fight Between Iron Man and ...
1        Here's The 2015 March Madness Schedule For Sat...
2        14 Wineries Where You Can Stay the Night (PHOT...
3        Mike Pence To America: Trump Never Said Those ...
4        Flight Attendant’s Hilarious In-Flight Safety ...
                               ...                        
78177    Getting Our Pennies in Priority for Tax Day We...
78178    Travel Ban Is A Minor Win For Trump And A Majo...
78179    Iceland: 3 Mistakes Many Travelers Make After ...
78180    Republicans Hold On To Arizona House Seat That...
78181    Top GOP Senator Challenges Trump Arms Deals Ov...
Name: text, Length: 78182, dtype: object


preprossesing and make bag of words columns.

In [10]:
newsDataset = preprocessing(newsDataset)

newsDataset = makingBagOfWordsCol(newsDataset)