In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import *
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dphng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dphng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dphng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
DATASET_PATH = 'SPAM text message 20170820 - Data.csv'
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [18]:
messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

In [19]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [20]:
# Create a TF-IDF vectorizer and fit it to the messages
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(messages)

# Now X contains the TF-IDF features, and y contains the encoded labels
print(f'Shape of TF-IDF matrix: {X.shape}')

Shape of TF-IDF matrix: (5572, 8709)


In [27]:
row_index = 0  # Replace with the desired row index
row = X.getrow(row_index)

# Print the non-zero elements and their corresponding column indices
print(f'Non-zero elements in row {row_index}: {row}')
print(f'Indices of non-zero elements: {row.indices}')
print(f'Values of non-zero elements: {row.data}')

Non-zero elements in row 0:   (0, 3567)	0.14786475068980162
  (0, 8080)	0.2300034410835773
  (0, 4370)	0.32645117023873077
  (0, 5954)	0.25533539230157154
  (0, 2334)	0.25281395947472957
  (0, 1313)	0.24417482890859654
  (0, 5567)	0.156366230319576
  (0, 4110)	0.10707657674366398
  (0, 1763)	0.2757872678027423
  (0, 3651)	0.1803318063070826
  (0, 8544)	0.22081883351949952
  (0, 4497)	0.2757872678027423
  (0, 1761)	0.31163292870610654
  (0, 2057)	0.2757872678027423
  (0, 7690)	0.15550627816331297
  (0, 3611)	0.15304155020494287
  (0, 1079)	0.32645117023873077
  (0, 8320)	0.18240101628302693
Indices of non-zero elements: [3567 8080 4370 5954 2334 1313 5567 4110 1763 3651 8544 4497 1761 2057
 7690 3611 1079 8320]
Values of non-zero elements: [0.14786475 0.23000344 0.32645117 0.25533539 0.25281396 0.24417483
 0.15636623 0.10707658 0.27578727 0.18033181 0.22081883 0.27578727
 0.31163293 0.27578727 0.15550628 0.15304155 0.32645117 0.18240102]


In [28]:
VAL_SIZE = 0.1
TEST_SIZE = 0.2
SEED = 0

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=TEST_SIZE,
                                                    shuffle=True,
                                                    random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=VAL_SIZE,
                                                  shuffle=True,
                                                  random_state=SEED)

In [29]:
%%time
# model = GaussianNB()
model = BernoulliNB()
print('Start training...')
model = model.fit(X_train, y_train)
print('Training completed!')

Start training...
Training completed!
CPU times: total: 0 ns
Wall time: 11 ms


In [30]:
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.9753363228699552
Test accuracy: 0.979372197309417


In [31]:
def predict_class(sentence, model, vectorizer):
    # Transform the input sentence using the TF-IDF vectorizer
    X_new = vectorizer.transform([sentence])

    # Predict the class using the trained model
    predicted_label = model.predict(X_new)[0]

    # If you need to return the actual class name instead of encoded label
    predicted_class = le.inverse_transform([predicted_label])[0]

    return predicted_class

In [33]:
# Define a new sentence
new_sentence = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

# Predict the class of the new sentence
predicted_class = predict_class(new_sentence, model, tfidf_vectorizer)

# Print the predicted class
print(f'Predicted class: {predicted_class}')

Predicted class: spam
