In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pickle

In [2]:
#Load the dataset
# Select the two needed columns by name instead of dropping every column that
# contains a missing value: the column-wise dropna would silently discard the
# label or message column itself if a single cell in it were empty.
# (Presumably the CSV carries extra, mostly-empty columns - confirm against the file.)
data = (
    pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
    # Rows without a label or a message cannot be used for training.
    .dropna(subset=['label', 'message'])
)
data.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#Preprocessing Function
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Built once when the cell runs instead of on every call: the original re-read
# the NLTK list for each message and scanned it linearly for each word.
_STOPWORDS = frozenset(
    stopwords.words('english')
    + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
)
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_processing(message):
    """Strip punctuation and stop words from a single message.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        The remaining words, in their original casing and order, joined by
        single spaces. Stop-word matching is case-insensitive.
    """
    # NOTE(review): punctuation is removed before the stop-word lookup, so any
    # NLTK entry containing an apostrophe presumably can never match; the
    # manual 'dont' / 'im' entries look like a partial workaround - confirm
    # whether more contractions should be added.
    nopunctuation = message.translate(_PUNCTUATION_TABLE)
    return ' '.join(
        word for word in nopunctuation.split() if word.lower() not in _STOPWORDS
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BIJAY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Add a column with the cleaned version of every message.
cleaned_messages = data['message'].apply(text_processing)
data['clean_msg'] = cleaned_messages
data


Unnamed: 0,label,message,clean_msg
0,ham,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy Available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry wkly comp win FA Cup final tkts 21s...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah think goes usf lives around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried contact å£750 Pound prize claim...
5568,ham,Will Ì_ b going to esplanade fr home?,Ì b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",Pity mood Soany suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [5]:
#Split the data
# Features are the cleaned messages; targets encode ham as 0 and spam as 1.
label_codes = {'ham': 0, 'spam': 1}
X = data['clean_msg']
Y = data['label'].map(label_codes)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [6]:
from sklearn.model_selection import GridSearchCV

# Three-step text classifier: token counts -> TF-IDF weighting -> Naive Bayes.
pipe = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', MultinomialNB()),
])

# Candidate hyper-parameters, addressed as <step name>__<parameter name>.
param_grid = dict(
    bow__ngram_range=[(1, 1), (1, 2)],  # unigrams or unigrams + bigrams
    bow__max_df=[0.75, 0.85, 1.0],
    bow__min_df=[1, 2, 5],
    bow__max_features=[None, 5000, 10000],
    tfid__use_idf=[True, False],
    model__alpha=[0.1, 0.5, 1.0],
)

# Try every combination in the grid with 5-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)



In [7]:
# Report the winning hyper-parameter combination and its cross-validation score
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Best parameters found:", best_params)
print("Best cross-validation accuracy:", best_cv_score)

# Score the fitted search object on the held-out test split
y_pred = grid_search.predict(X_test)
test_accuracy = metrics.accuracy_score(Y_test, y_pred)
print("Test set accuracy:", test_accuracy)

Best parameters found: {'bow__max_df': 0.75, 'bow__max_features': None, 'bow__min_df': 1, 'bow__ngram_range': (1, 2), 'model__alpha': 0.1, 'tfid__use_idf': True}
Best cross-validation accuracy: 0.9836202080596699
Test set accuracy: 0.9820627802690582


In [8]:
#save the model
# Serialise the whole fitted grid-search object and write the bytes to disk.
with open('data_spam_classifier.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(grid_search))
    

In [9]:
#Load the model
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('data_spam_classifier.pkl', 'rb') as model_file:
    loaded_pipe = pickle.loads(model_file.read())

In [10]:
# Function to predict user input
def predict_spam(message):
    """Classify a single raw message as 'spam' or 'ham'.

    Parameters
    ----------
    message : str
        Raw, unprocessed message text.

    Returns
    -------
    str
        'spam' if the loaded model predicts class 1, otherwise 'ham'.
    """
    # The model was fitted on text_processing() output (the clean_msg column),
    # so the same cleaning must be applied here; feeding the raw text would
    # give the model different tokens from the ones it was trained on.
    cleaned_message = text_processing(message)
    prediction = loaded_pipe.predict([cleaned_message])
    return 'spam' if prediction[0] == 1 else 'ham'

# Demonstrate the classifier on one sample message
user_input = "URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"
print(f"Message: {user_input}")
predicted_label = predict_spam(user_input)
print(f"Prediction: {predicted_label}")

Message: URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18
Prediction: spam
