In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Download the stopwords dataset from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the dataset
data = pd.read_csv('/content/SMS_test.csv', encoding='latin-1')

In [4]:
# Function to clean text: remove stopwords, punctuation, and convert to lowercase
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # Remove stopwords
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [5]:
# Apply the preprocessing function to the 'OriginalTweet' column
data['cleaned_data'] = data['Message_body'].apply(preprocess_text)

In [6]:
# Define the features (X) and the target (y)
X = data['cleaned_data']
y = data['Label']


In [7]:
# Vectorize the cleaned data using CountVectorizer and TfidfVectorizer
vectorizer1 = CountVectorizer(binary=True)
vectorizer2 = CountVectorizer(binary=False)
vectorizer3 = TfidfVectorizer()


In [8]:
x1 = vectorizer1.fit_transform(X)
x2 = vectorizer2.fit_transform(X)
x3 = vectorizer3.fit_transform(X)

In [9]:

# Split the data into training and testing sets
xtrain1, xtest1, ytrain, ytest = train_test_split(x1, y, random_state=1, test_size=0.25)
xtrain2, xtest2, ytrain, ytest = train_test_split(x2, y, random_state=1, test_size=0.25)
xtrain3, xtest3, ytrain, ytest = train_test_split(x3, y, random_state=1, test_size=0.25)


In [10]:
# Initialize the models
bnb = BernoulliNB()
mnb = MultinomialNB()
mnb2 = MultinomialNB()

In [11]:
# Train the models
bnb.fit(xtrain1, ytrain)
mnb.fit(xtrain2, ytrain)
mnb2.fit(xtrain3, ytrain)

In [12]:
# Make predictions
pred1 = bnb.predict(xtest1)
pred2 = mnb.predict(xtest2)
pred3 = mnb2.predict(xtest3)

In [13]:
# Check the accuracy scores
accuracy1 = accuracy_score(pred1, ytest)
accuracy2 = accuracy_score(pred2, ytest)
accuracy3 = accuracy_score(pred3, ytest)

In [14]:
print(f"Accuracy with BernoulliNB and binary CountVectorizer: {accuracy1}")
print(f"Accuracy with MultinomialNB and non-binary CountVectorizer: {accuracy2}")
print(f"Accuracy with MultinomialNB and TfidfVectorizer: {accuracy3}")


Accuracy with BernoulliNB and binary CountVectorizer: 0.78125
Accuracy with MultinomialNB and non-binary CountVectorizer: 0.9375
Accuracy with MultinomialNB and TfidfVectorizer: 0.6875


In [None]:
# Hence , the accuracy score with multinomialNB and non - binary CountVectorizer is highest which is 0.93
