In [None]:
import csv

# Convert the tab-separated SMS file into a comma-separated CSV file.

# Input and output file paths.
# NOTE(review): '/content/...' is an absolute, Colab-style path — confirm it
# exists in the environment this notebook is run in.
input_file = '/content/SMSSpamCollection.txt'
output_file = 'output.csv'

# Delimiter used in the input file (assumed to be tab-separated values).
delimiter = '\t'

# Open both files with an explicit encoding so the result does not depend on
# the platform's default locale encoding.
# NOTE(review): UTF-8 is assumed for the input file — confirm.
# newline='' lets the csv module control line endings in the output itself.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)

    for line in infile:
        # Remove only the line terminator. A bare strip() would also eat
        # leading/trailing tabs and spaces, dropping empty first/last fields
        # and altering the message text.
        fields = line.rstrip('\r\n').split(delimiter)

        # csv.writer takes care of quoting fields that contain commas/quotes.
        writer.writerow(fields)

print("Conversion complete. CSV file saved as:", output_file)


Conversion complete. CSV file saved as: output.csv


In [None]:
# Load the dataset

import pandas as pd

# Read the tab-separated file into a two-column frame: label and message text.
# NOTE(review): this reads 'SMSSpamCollection' from the working directory, not
# the '/content/SMSSpamCollection.txt' path used by the conversion cell —
# confirm that both files exist.
#
# quoting=csv.QUOTE_NONE (csv is imported in the first cell): the messages are
# free text, so a double quote is ordinary content. With the default quoting,
# an unbalanced '"' makes the parser merge several lines into one message.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Data cleaning and preprocessing
import re
import nltk
# Download the NLTK resources needed for cleaning: the stop-word lists and the
# WordNet data used by the lemmatizer. nltk.download() returns True on success,
# which is why the cell displays True for the last call.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Build `corpus`: one cleaned, lemmatized, stop-word-free string per message.
wordnet = WordNetLemmatizer()
# Not used in this cell; kept so that any other cell relying on `ps` still works.
ps = PorterStemmer()

# Load the stop-word list once. Calling stopwords.words('english') inside the
# comprehension re-evaluates it for every single word, and a set gives O(1)
# membership tests instead of a linear scan of a list.
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in messages['message']:
    # Replace every non-letter with a space, lower-case, and tokenize on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # Drop stop words and reduce the remaining words to their lemma.
    review = [wordnet.lemmatize(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(review))

In [None]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Keep only the 4000 most frequent terms as features.
cv = CountVectorizer(max_features=4000)
# Learn the vocabulary from the cleaned corpus and count term occurrences,
# then convert the sparse count matrix to a dense NumPy array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# One-hot encode the labels and keep the second indicator column as the target.
# dtype is pinned to uint8: pandas >= 2.0 returns bool columns by default,
# which would silently change the dtype of `y`.
label_dummies = pd.get_dummies(messages['label'], dtype='uint8')
# get_dummies orders columns by sorted label value, so column 1 is the second
# label in sort order — presumably 'spam' (after 'ham'); confirm against the data.
y = label_dummies.iloc[:, 1].to_numpy()

In [None]:
# Show the encoded label vector (NumPy abbreviates the repr of long arrays).
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [None]:
# Train Test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Train the model using a Multinomial Naive Bayes classifier

from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training counts.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predict labels for the held-out test samples.
y_pred = spam_detect_model.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true test labels against the model's predictions.
confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the confusion matrix (rows: true classes, columns: predicted classes).
confusion_m

array([[946,   9],
       [  9, 151]])

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label matches the true label.
accuracy_m = accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
# Display the accuracy on the held-out test set.
accuracy_m

0.9838565022421525