In [2]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.model_selection import train_test_split ## since we already have splitted dataset, therefore we dont really need this
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

In [3]:
trainPath = 'train.csv'
testPath = 'test.csv'
label = ['World', 'Sports', 'Business', 'Sci/Tech']

In [48]:
realTest = '/Users/kycdia/Downloads'
realArr = np.array([str(dir) for dir in os.listdir(realTest)])

In [4]:
trainDf = pd.read_csv(trainPath)
testDf = pd.read_csv(testPath)
# trainDf['label'] = trainDf['label'].map(label)
# testDf['label'] = label[testDf['label']]

In [50]:
trainDf.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [51]:
testDf.head()

Unnamed: 0,text,label
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3


In [5]:
trainInput = trainDf['text']
trainLabel = trainDf['label']
testInput = testDf['text']
testLabel = testDf['label']

In [53]:
countVec = CountVectorizer()

xTrain = countVec.fit_transform(trainInput)
xTest = countVec.transform(testInput)
yTrain = trainLabel.values
yTest = testLabel.values

In [54]:
model1 = MultinomialNB()
model1.fit(xTrain, yTrain)
print(model1.score(xTrain, yTrain))
print(model1.score(xTest, yTest))

0.9172666666666667
0.900921052631579


In [66]:
countVec2 = CountVectorizer(stop_words='english')
xTrain = countVec2.fit_transform(trainInput)
xTest = countVec2.transform(testInput)
model2 = MultinomialNB()
model2.fit(xTrain, yTrain)
print(model2.score(xTrain, yTrain))
print(model2.score(xTest, yTest))

0.9202166666666667
0.9044736842105263


In [7]:
class lemmaToken():
    def __init__(self) -> None:
        self.wordnetLemma = WordNetLemmatizer()
        self.stopWords = set(stopwords.words('english'))
    def getPos(self, tags:str):
        if tags.startswith('J'):
            return wordnet.ADJ
        elif tags.startswith('V'):
            return wordnet.VERB
        elif tags.startswith('N'):
            return wordnet.NOUN
        elif tags.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    def __call__(self, doc):
        tokens = word_tokenize(doc)
        wordsNtokens = nltk.pos_tag(tokens)
        return [self.wordnetLemma.lemmatize(word, pos=self.getPos(tag)) for word, tag in wordsNtokens if word.lower() not in self.stopWords]

In [85]:
countVec3 = CountVectorizer(stop_words='english', tokenizer=lemmaToken())
xTrain = countVec3.fit_transform(trainInput)
xTest = countVec3.transform(testInput)
model3 = MultinomialNB()
model3.fit(xTrain, yTrain)
print(model3.score(xTrain, yTrain))
print(model3.score(xTest, yTest))



0.91925
0.9030263157894737


In [86]:
trainingScore = model3.score(xTrain, yTrain)
testScore = model3.score(xTest, yTest)
print(f'training score {trainingScore}')
print(f'test score {testScore}')
print(f'score deviation {trainingScore-testScore}')

training score 0.91925
test score 0.9030263157894737
score deviation 0.016223684210526335


In [8]:
tfIdfVec = TfidfVectorizer(tokenizer=lemmaToken())
xTrain = tfIdfVec.fit_transform(trainInput)
xTest = tfIdfVec.transform(testInput)
model4 = MultinomialNB()
model4.fit(xTrain, yTrain)
print(model4.score(xTrain, yTrain))
print(model4.score(xTest, yTest))



NameError: name 'yTrain' is not defined

In [89]:
trainingScore = model4.score(xTrain, yTrain)
testScore = model4.score(xTest, yTest)
print(f'training score {trainingScore}')
print(f'test score {testScore}')
print(f'score deviation {trainingScore-testScore}')

training score 0.9183916666666667
test score 0.9032894736842105
score deviation 0.015102192982456186


In [91]:
joblib.dump(model4, 'NB_lemma_Tfidf.pkl')

['NB_lemma_Tfidf.pkl']

In [93]:
cobaModel = joblib.load('NB_lemma_Tfidf.pkl')
cobaModel.score(xTest, yTest)

0.9032894736842105

In [94]:
realArrTest = tfIdfVec.transform(realArr)

In [96]:
realArrVal = model4.predict(realArrTest)

In [97]:
for i, ttl in enumerate(realArrVal):
    print(f'class {label[ttl]} with the query of {realArr[i]}')

class Sci/Tech with the query of Preview Transkrip.pdf
class World with the query of Birokrasi Complex.mp4
class World with the query of Montgomery_Fleet_Equipment_Inventory_FA_PART_1_END.xlsx
class Sci/Tech with the query of Bukti daftar form ICOICT.pdf
class World with the query of googlechrome.dmg
class Business with the query of Revisi 2 - PROPOSAL PENELITIAN indra (1).pdf
class World with the query of DickyA_Resume.pdf
class World with the query of cleanedData.xlsx
class World with the query of output2_video-3.mp4
class World with the query of Montgomery_Fleet_Equipment_Inventory_FA_PART_2_START.xlsx
class Sci/Tech with the query of WhatsApp Image 2023-11-23 at 10.05.46 PM.jpeg
class World with the query of output2_video-2.mp4
class World with the query of 1507.05717.pdf
class Business with the query of Data Freeways.csv
class World with the query of LAPORAN SERTIJAB.docx
class Sci/Tech with the query of WhatsApp Image 2023-06-10 at 12.57.05.jpeg
class World with the query of CarS

In [110]:
a = 'nama'
b = 'bulan'
c = 'tahun'
print('_'.join([a,b,c]))

nama_bulan_tahun


In [10]:
joblib.dump(tfIdfVec, 'tokenizer.pkl')

['tokenizer.pkl']

In [11]:
cobaTokenizer = joblib.load('tokenizer.pkl')
cobaLagi = cobaTokenizer.transform(testInput)
cobaLagi

<7600x90831 sparse matrix of type '<class 'numpy.float64'>'
	with 196244 stored elements in Compressed Sparse Row format>