import needed library and declare list of category column

In [122]:

import csv
import pandas as pd
import random
import nltk
from nltk.tokenize import wordpunct_tokenize
from collections import defaultdict
from nltk.probability import FreqDist
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.metrics import accuracy_score

crawledDataSetNames = ['dataset-reddit-business.csv', 'dataset-reddit-entertainment.csv', 
'dataset-reddit-parenting.csv', 'dataset-reddit-politics.csv', 
'dataset-reddit-sports.csv', 'dataset-reddit-travel.csv']

categories = ['business', 'entertainment', 'parenting', 'politics', 'sports', 'travel']




preprocessing function

In [123]:
def preprocessing(originData):
    data = originData.copy()
    #make all characters to lower case
    data['text']=data['text'].apply(lambda x: str.lower(x))

    #word tokenization
    data['text'] = data['text'].apply(lambda x: wordpunct_tokenize(x))

    #delete non alphabets word
    data['text'] = data['text'].apply(lambda x: list(word for word in x if word.isalpha()))

    print('preprocessing finished. 1 / 6')
    return data

declare making word dictionary function and making Bag Of words Column function.

In [124]:
def makingWordDict(data):
    listedText=data['text'].tolist()
    allTextList = []
    for l in listedText:
        allTextList.extend(l)
    numOfdict = len(set(allTextList))
    
    numOfFeatureWords = int(numOfdict*0.95)

    fdist = FreqDist(allTextList)

    wordDict = list(word for word, freq in fdist.most_common(numOfFeatureWords))

    print('making word dictionary finished. 2 / 6')
    return wordDict

def makingBagOfWordsCol(data, wordDict):

    def bagOfWords(tokens):
        d = defaultdict(int,{ word:0 for word in wordDict })

        for token in tokens:
            d[token]+=1

        return list(d.values())[:len(wordDict)]

    data['bagOfWords'] = data['text'].apply(lambda x: bagOfWords(x))

    print('making bag of words column finished. 3 / 6')
    return data

declare divide trainset and testset function.

In [125]:
def divideTrainAndTest(data):
    dataSize = data.shape[0]

    trainSplit = 0.7
    testSplit = 0.3

    trainSize = int(dataSize * trainSplit)
    testSize = int(dataSize * testSplit)

    print('trainSize: ', trainSize)
    print('testSize: ', testSize)

    trainSet = data.iloc[:trainSize, :]
    testSet = data.iloc[trainSize:, :]

    print('dividing train and test set finished. 4 / 6')
    return trainSet, testSet

declare build a Bag of Words model and fit by using train set.

In [126]:
def buildAndFitModel(trainSet):
    classifier = MultinomialNB()
    trainX = np.array(trainSet['bagOfWords'].tolist())
    trainY = np.array(trainSet['category'].tolist())
    
    classifier.fit(trainX, trainY)

    print('building and fitting the model finished. 5 / 6')
    return classifier

declare evaluate function.

In [127]:
def evaluate(classifier, testSet):
    testX = np.array(testSet['bagOfWords'].tolist())
    testY = np.array(testSet['category'].tolist())
    
    predY = classifier.predict(testX)
    predYProb = classifier.predict_proba(testX)
    accuracy = accuracy_score(testY, predY)

    print('evaluating the model finished. 6 / 6')
    return accuracy, predYProb

1. read the crawled Dataset of reddit.

In [128]:
rawRedditDataset = pd.DataFrame(columns = {'text','category'})
for crawledDataSetName in crawledDataSetNames:
    #read csv files
    df = pd.read_csv('./dataset/%s'%crawledDataSetName)
    
    #replace Nan to empty string
    df = df.fillna('')

    #'titleAndContent' Column is concat of 'title' and 'content'
    df['text']=df['title']+' '+df['content']
    rawRedditDataset = pd.concat([rawRedditDataset, df[['text', 'category']]])

rawRedditDataset=rawRedditDataset[['text', 'category']]
#shuffle row
rawRedditDataset=rawRedditDataset.sample(frac=1).reset_index(drop=True)

print(rawRedditDataset)

                                                   text       category
0     Former Jets, Rams RB Zac Stacy has been arrest...         sports
1     Should i travel the world alone as a woman if ...         travel
2     Netflix eyes foray into video games: the strea...       business
3     Grimes wanted to direct ‘Dune,’ says she was f...  entertainment
4     Jon Voight, Outspoken GOP Supporter, Calls For...  entertainment
...                                                 ...            ...
5936  Alex Jones sat with the Jan. 6 panel and repea...       politics
5937    Armenia, little country full of big monuments.          travel
5938  Amber Heard Rolls Her Eyes After Johnny Depp's...  entertainment
5939  Not Yet Known Whether Charges Will Be Filed In...  entertainment
5940  My parents would rather not meet their first g...      parenting

[5941 rows x 2 columns]


2. read the news category dataset(from kaggle).

In [129]:
originNewsCategory = ['BUSINESS', 'ENTERTAINMENT', 'PARENTING', 'POLITICS', 'SPORTS', 'TRAVEL']

rawNewsDataset = pd.read_json("./dataset/News_category_Dataset_v2.json", lines=True)
rawNewsDataset = rawNewsDataset[rawNewsDataset['category'].isin(originNewsCategory)]
rawNewsDataset = rawNewsDataset.groupby('category').sample(n=2000)
rawNewsDataset['category'] = rawNewsDataset['category'].apply(lambda x: x.lower())

rawNewsDataset['text'] = rawNewsDataset['headline'] + ' ' + rawNewsDataset['short_description']
rawNewsDataset = rawNewsDataset[['text','category']]

#shuffle row
rawNewsDataset=rawNewsDataset.sample(frac=1).reset_index(drop=True)

print(rawNewsDataset)

                                                    text       category
0      Darren Sharper May Have Penis Monitored As Par...         sports
1      Slavic Cool? “Slavic cool” is everywhere these...         travel
2      FIFA Whistleblower Chuck Blazer Dead At 72 He ...         sports
3      5 Things Not to Say to a Pregnant Woman Perhap...      parenting
4      How The Volatile Setting Of Netflix's 'Ozark' ...  entertainment
...                                                  ...            ...
11995  Robin Thicke's Acting Debut Will Make You Crin...  entertainment
11996  Simone Biles Makes History With 4th Consecutiv...         sports
11997  Spilled Milk: Photoshopping My Neck Once upon ...      parenting
11998  Winnie The Pooh Is Trending On Twitter For The...         sports
11999  Top GOP Operative: Mike Pence Once Thought Tru...       politics

[12000 rows x 2 columns]


3. the model only using reddit crawled dataset.

In [130]:
redditDataset = preprocessing(rawRedditDataset)
redditWordDict = makingWordDict(redditDataset)
BOWredditDataset = makingBagOfWordsCol(redditDataset, redditWordDict)

redditTrainSet, redditTestSet = divideTrainAndTest(BOWredditDataset)

redditClassifier = buildAndFitModel(redditTrainSet)

redditAccuracy, redditPredYProb = evaluate(redditClassifier, redditTestSet)

print('only reddit dataset model accuracy: ', redditAccuracy)

preprocessing finished. 1 / 6
making word dictionary finished. 2 / 6
making bag of words column finished. 3 / 6
trainSize:  4158
testSize:  1782
dividing train and test set finished. 4 / 6
building and fitting the model finished. 5 / 6
evaluating the model finished. 6 / 6
only reddit dataset model accuracy:  0.8424004486819966


4. the model only using news dataset.

In [131]:
newsDataset = preprocessing(rawNewsDataset)
newsWordDict = makingWordDict(newsDataset)
BOWnewsDataset = makingBagOfWordsCol(newsDataset, newsWordDict)

newsTrainSet, newsTestSet = divideTrainAndTest(BOWnewsDataset)

newsClassifier = buildAndFitModel(newsTrainSet)

newsAccuracy, newsPredYProb = evaluate(newsClassifier, newsTestSet)

print('only news dataset model accuracy: ', newsAccuracy)

preprocessing finished. 1 / 6
making word dictionary finished. 2 / 6
making bag of words column finished. 3 / 6
trainSize:  8400
testSize:  3600
dividing train and test set finished. 4 / 6
building and fitting the model finished. 5 / 6
evaluating the model finished. 6 / 6
only news dataset model accuracy:  0.7833333333333333


5. the model using both reddit and news datasets.