In [2]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tuann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tuann\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [14]:
df= pd.read_csv(dtype='2cls_spam_text_cls.csv')

TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

In [5]:
messages= df['Message'].values.tolist()
labels= df['Category'].values.tolist()

In [9]:
# Preprocessing
def lower_case(text):
    return text.lower()

def punctuation_removal(text):
    translator= str.maketrans('','', string.punctuation)
    return text.translate(translator)

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    stop_words= nltk.corpus.stopwords.words('english')
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    stemmer= nltk.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [10]:
def preprocess_text(text):
    text= lower_case(text)
    text= punctuation_removal(text)
    tokens= tokenize(text)
    tokens= remove_stopwords(tokens)
    tokens= stemming(tokens)
    return tokens

In [11]:
messages= [preprocess_text(message) for message in messages]

In [12]:
def create_dictionary(messages):
    dictionary=[]
    for tokens in messages:
        for token in tokens:
            if token not in dict:
                dictionary.append(token)
    return dictionary

In [16]:
dictionary= create_dictionary(messages)

In [13]:
def create_feature(tokens, dictionary):
    features= np.zeros(len(dictionary))
    for token in tokens:
        features[dictionary.index(token)] += 1
    return features

In [17]:
X= np.array([create_feature(tokens,dictionary) for tokens in messages])

In [15]:
# Tien xy ly du lieu nhan
le = LabelEncoder()
y= le.fit_transform(labels)
print(f'Classes:{le.classes_}')
print(f'Encoded labels:{y}')

Classes:['ham' 'spam']
Encoded labels:[0 0 1 ... 0 0 0]


In [18]:
VAL_SIZE= 0.2
TEST_SIZE=0.125
SEED= 0
x_train, x_val,y_train, y_val= train_test_split(X, y,
                                                test_size= VAL_SIZE,
                                                shuffle= True,
                                                random_state= SEED)
x_train, x_test, y_train, y_test= train_test_split(x_train,y_train,
                                                   test_size=TEST_SIZE,
                                                   shuffle=True,
                                                   random_state= SEED)

In [22]:
#train model
model = GaussianNB()
print('Start training...')
model = model.fit(x_train, y_train)
print('Training completed!')

Start training...
Training completed!


In [23]:
# Evaluate Model
y_val_pred= model.predict(x_val)
y_test_pred= model.predict(x_test)
val_accuracy= accuracy_score(y_val, y_val_pred)
test_accuracy= accuracy_score(y_test, y_test_pred)
print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [26]:
# Apply
def predict(text, model, dictionary):
    proccesed_text= preprocess_text(text)
    features= create_feature(proccesed_text, dictionary)
    features= np.array(features).reshape(1, -1)
    prediction= model.predict(features)
    prediction_label= le.inverse_transform(prediction)[0]
    return prediction_label

In [27]:
test_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
