In [1]:
# Import nltk in this cell so the downloads work on a fresh kernel
# (Restart Kernel -> Run All) instead of depending on a later import cell.
import nltk

# nltk.download('punkt'): download the Punkt tokenizer models, which are
# pre-trained models used for tokenization (the process of breaking text
# down into smaller units such as sentences and words).
nltk.download('punkt')

# nltk.download('stopwords'): download the "stopwords" corpus from the NLTK
# (Natural Language Toolkit) library.
# Stopwords are common words in a language that are often considered
# unimportant for natural language processing (NLP) tasks; removing them
# reduces noise in the data and keeps the focus on the more meaningful words.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
'''
-   Download the dataset
'''
# gdown fetches the Google Drive file with the given id; per the recorded
# output it is saved as 2cls_spam_text_cls.csv in the working directory.

!gdown 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: d:\IT_stuffs\Project\Python\Text_Classification\2cls_spam_text_cls.csv

  0%|          | 0.00/486k [00:00<?, ?B/s]
100%|██████████| 486k/486k [00:00<00:00, 3.00MB/s]
100%|██████████| 486k/486k [00:00<00:00, 2.98MB/s]


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import nltk


In [13]:
def lowercase(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered


def punctuation_removal(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    # The third argument of str.maketrans lists the characters to drop.
    return text.translate(str.maketrans('', '', string.punctuation))


def tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return them."""
    tokens = nltk.word_tokenize(text)
    return tokens


def remove_stopword(tokens):
    """Filter English stopwords out of a token sequence.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list holding, in their original order, the tokens that are not
        in NLTK's English stopword list.
    """
    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list once per token. The filtered result is identical.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    return [token for token in tokens if token not in stop_words]


def stemming(tokens):
    """Return a list with the Porter stem of each token, in input order."""
    porter = nltk.PorterStemmer()

    stems = []
    for word in tokens:
        stems.append(porter.stem(word))
    return stems


def preprocess_text(text):
    """Run the full cleaning pipeline on one raw message.

    The steps are applied in this order: lowercase, punctuation removal,
    tokenization, stopword removal, stemming.

    Returns:
        The list of tokens produced by the final stemming step.
    """
    cleaned = punctuation_removal(lowercase(text))
    return stemming(remove_stopword(tokenize(cleaned)))


def create_dictionary(messages):
    """Build the vocabulary of unique tokens across all messages.

    Args:
        messages: Iterable of token lists (one list per message). Tokens must
            be hashable (they are strings in this notebook).

    Returns:
        A list of the distinct tokens, ordered by first appearance.
    """
    # dict preserves insertion order, so dict.fromkeys de-duplicates while
    # keeping first-occurrence order in linear time; the original re-scanned
    # the growing list for every token (quadratic).
    return list(dict.fromkeys(token for tokens in messages for token in tokens))


def create_features(tokens, dictionary):
    """Convert a token sequence into a bag-of-words count vector.

    Args:
        tokens: Iterable of hashable tokens for a single message.
        dictionary: List of vocabulary tokens; position ``i`` of the result
            counts occurrences of ``dictionary[i]``.

    Returns:
        A 1-D float NumPy array of length ``len(dictionary)``. Tokens that
        are not in ``dictionary`` are ignored.
    """
    # Build the token -> position lookup once instead of doing a linear
    # `in` plus a linear `.index()` for every token. setdefault keeps the
    # first position for a repeated entry, matching list.index().
    positions = {}
    for idx, word in enumerate(dictionary):
        positions.setdefault(word, idx)

    features = np.zeros(len(dictionary))

    for token in tokens:
        idx = positions.get(token)
        if idx is not None:
            features[idx] += 1

    return features

In [14]:
# Load the dataset; the code below reads its 'Message' and 'Category' columns.
data_path = '2cls_spam_text_cls.csv'
df = pd.read_csv(data_path)
labels = df['Category'].values.tolist()
messages = df['Message'].values.tolist()

# Replace every raw message with its list of preprocessed tokens.
messages = list(map(preprocess_text, messages))

# Build the vocabulary, then one count vector per message.
dictionary = create_dictionary(messages)
X = np.array([create_features(message_tokens, dictionary)
              for message_tokens in messages])

# Encode the category labels as integer class ids.
le = LabelEncoder()
y = le.fit_transform(labels)

In [15]:
# Split configuration. TEST_SIZE is a fraction of what remains after the
# validation split: 0.125 * (1 - 0.2) = 0.1 of the full dataset.
VAL_size = 0.2
TEST_SIZE = 0.125
SEED = 0

# First carve the validation set out of the full data ...
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=VAL_size,
    shuffle=True,
    random_state=SEED,
)

# ... then carve the test set out of the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=TEST_SIZE,
    shuffle=True,
    random_state=SEED,
)

In [16]:
# Fit a Gaussian Naive Bayes classifier on the training split.
print('Start training ... ')
model = GaussianNB().fit(X_train, y_train)
print('Training completed!')

Start training ... 
Training completed!


In [17]:
# Score the fitted model on the validation split ...
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# ... and on the held-out test split.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [18]:
def predict(text, model, dictionary):
    """Classify a single raw message.

    Args:
        text: Raw message string.
        model: Fitted classifier exposing ``predict`` on a 2-D feature array.
        dictionary: Vocabulary list used to build the count vector.

    Returns:
        The decoded class label for ``text``.

    Note:
        Decoding relies on the module-level ``le`` label encoder.
    """
    processed_text = preprocess_text(text)
    # Vectorise the preprocessed tokens. Passing the raw string here would
    # make create_features iterate over single characters instead.
    features = create_features(processed_text, dictionary)
    # The model expects a 2-D array: one row per sample.
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls

In [19]:
# Classify one hand-written example message with the trained model and
# print the decoded label.
text_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(text_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
