In [215]:
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations

# Read the CSV file into a DataFrame
df = pd.read_csv("spam.csv")  

# Display the first few rows of the DataFrame to understand its structure
df.head()  

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [216]:
# Get the count of each unique value in the 'Category' column
df.Category.value_counts()  

Category
ham     4825
spam     747
Name: count, dtype: int64

In [217]:
# Create a new column 'spam' where 1 indicates spam and 0 indicates not spam
df["spam"] = df["Category"].apply(lambda x: 1 if x=="spam" else 0)  

# Display the updated DataFrame with the new 'spam' column
df.head() 

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [218]:
from sklearn.model_selection import train_test_split  # Import function to split data into training and test sets

# Split the data into training and test sets, with 20% of the data used for testing
X_train, X_test, Y_train, Y_test = train_test_split(df.Message, df.spam, test_size = 0.2)  

# Check the shape of the training and test datasets
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape  

((4457,), (1115,), (4457,), (1115,))

In [219]:
# Check the shape of the original DataFrame
df.shape 

(5572, 3)

In [220]:
# Display a slice of the training messages
X_train[9:16]  

2151        The table's occupied, I'm waiting by the tree
330     I'm reading the text i just sent you. Its mean...
1276                                 Wot u up 2 u weirdo?
3823                             Just normal only here :)
2026    Yes obviously, but you are the eggs-pert and t...
3315    Oh gei. That happend to me in tron. Maybe ill ...
39      Hello! How's you and how did saturday go? I wa...
Name: Message, dtype: object

In [221]:
# Display a slice of the training labels
Y_train[9:16]  

2151    0
330     0
1276    0
3823    0
2026    0
3315    0
39      0
Name: spam, dtype: int64

In [222]:
# Get the type of X_train.values, which should be a numpy array
type(X_train.values)  

numpy.ndarray

In [223]:
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer for converting text to token counts

v = CountVectorizer()  # Create an instance of CountVectorizer

# Transform the training messages into a document-term matrix
X_train_cv = v.fit_transform(X_train.values)  

# Display the transformed document-term matrix (sparse matrix)
X_train_cv

<4457x7781 sparse matrix of type '<class 'numpy.int64'>'
	with 59054 stored elements in Compressed Sparse Row format>

In [224]:
# Convert the sparse matrix to a dense NumPy array and show the first row of the first two samples
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [225]:
# Display the shape of the document-term matrix
X_train_cv.shape  

(4457, 7781)

In [226]:
# Display feature names from the CountVectorizer, slicing a portion
v.get_feature_names_out()[10:16]  

array(['02073162414', '02085076972', '021', '03', '04', '0430'],
      dtype=object)

In [227]:
# Display the total number of features (vocabulary size)
v.get_feature_names_out().shape

(7781,)

In [228]:
# List all attributes and methods of the CountVectorizer instance
dir(v)  

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [229]:
v.vocabulary_

{'fyi': 3072,
 'back': 1178,
 'in': 3668,
 'my': 4694,
 'parents': 5113,
 'place': 5263,
 'south': 6365,
 'tampa': 6733,
 'so': 6304,
 'might': 4503,
 'need': 4759,
 'to': 6961,
 'do': 2381,
 'the': 6839,
 'deal': 2196,
 'somewhere': 6332,
 'else': 2573,
 'fun': 3058,
 'fact': 2759,
 'although': 918,
 'you': 7742,
 'would': 7649,
 'think': 6870,
 'armand': 1057,
 'eventually': 2675,
 'build': 1548,
 'up': 7214,
 'tolerance': 6978,
 'or': 5004,
 'some': 6318,
 'shit': 6122,
 'considering': 1981,
 'how': 3545,
 'much': 4662,
 'he': 3388,
 'smokes': 6281,
 'gets': 3136,
 'fucked': 3046,
 'like': 4140,
 'hits': 3464,
 'never': 4787,
 'blame': 1367,
 'day': 2188,
 'ur': 7229,
 'life': 4129,
 'good': 3199,
 'days': 2189,
 'give': 3159,
 'happiness': 3355,
 'bad': 1180,
 'experience': 2728,
 'both': 1434,
 'are': 1042,
 'essential': 2656,
 'all': 900,
 'gods': 3182,
 'blessings': 1376,
 'morning': 4616,
 'hey': 3443,
 'elaine': 2563,
 'is': 3774,
 'today': 6967,
 'meeting': 4456,
 'still': 65

In [230]:
# Display the vocabulary dictionary of the CountVectorizer
v.vocabulary_  

{'fyi': 3072,
 'back': 1178,
 'in': 3668,
 'my': 4694,
 'parents': 5113,
 'place': 5263,
 'south': 6365,
 'tampa': 6733,
 'so': 6304,
 'might': 4503,
 'need': 4759,
 'to': 6961,
 'do': 2381,
 'the': 6839,
 'deal': 2196,
 'somewhere': 6332,
 'else': 2573,
 'fun': 3058,
 'fact': 2759,
 'although': 918,
 'you': 7742,
 'would': 7649,
 'think': 6870,
 'armand': 1057,
 'eventually': 2675,
 'build': 1548,
 'up': 7214,
 'tolerance': 6978,
 'or': 5004,
 'some': 6318,
 'shit': 6122,
 'considering': 1981,
 'how': 3545,
 'much': 4662,
 'he': 3388,
 'smokes': 6281,
 'gets': 3136,
 'fucked': 3046,
 'like': 4140,
 'hits': 3464,
 'never': 4787,
 'blame': 1367,
 'day': 2188,
 'ur': 7229,
 'life': 4129,
 'good': 3199,
 'days': 2189,
 'give': 3159,
 'happiness': 3355,
 'bad': 1180,
 'experience': 2728,
 'both': 1434,
 'are': 1042,
 'essential': 2656,
 'all': 900,
 'gods': 3182,
 'blessings': 1376,
 'morning': 4616,
 'hey': 3443,
 'elaine': 2563,
 'is': 3774,
 'today': 6967,
 'meeting': 4456,
 'still': 65

In [231]:
# Convert the sparse matrix to a dense NumPy array for easier manipulation
X_train_np = X_train_cv.toarray()  

# Display the first 4 rows of the dense NumPy array
X_train_np[:4]  

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [232]:
# Find the indices of non-zero entries in the first row of the dense NumPy array
np.where(X_train_np[0]!=0)  

(array([1178, 2196, 2381, 2573, 3072, 3668, 4503, 4694, 4759, 5113, 5263,
        6304, 6332, 6365, 6733, 6839, 6961]),)

In [233]:
from sklearn.naive_bayes import MultinomialNB  # Import MultinomialNB for the classification model

model = MultinomialNB()  # Create an instance of MultinomialNB
model.fit(X_train_cv, Y_train)  # Train the model using the document-term matrix and training labels

# Transform the test messages using the same vectorizer
X_test_cv = v.transform(X_test)  

In [234]:
from sklearn.metrics import classification_report  # Import classification_report to evaluate the model

# Predict the labels for the test set
y_pred = model.predict(X_test_cv)  

# Print the classification report comparing true and predicted labels
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       958
           1       0.95      0.94      0.95       157

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [235]:
# Define a list of new email messages for prediction
email = [
    'hey Bill , can we get together to watch inception  tomorrow?',
    'upto 20% discount on parking exclusive offer, just for you, dont miss this reward'
]

# Transform the new email messages using the same vectorizer
email_count= v.transform(email)

# Predict the labels for the new email messages
model.predict(email_count)  

array([0, 1])

In [236]:
from sklearn.pipeline import Pipeline  # Import Pipeline for creating a workflow combining vectorizer and classifier

# Create a pipeline with CountVectorizer and MultinomialNB
clf = Pipeline([
    ('vectorizer', CountVectorizer()),  # Step 1: Convert text to token counts
    ('nb', MultinomialNB())  # Step 2: Apply MultinomialNB for classification
])

# Fit the pipeline on the training data
clf.fit(X_train, Y_train)  

In [237]:
# Predict the labels for the test set using the pipeline
y_pred = clf.predict(X_test)  

# Print the classification report comparing true and predicted labels from the pipeline
print(classification_report(Y_test, y_pred))  

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       958
           1       0.95      0.94      0.95       157

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115

