In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
df = pd.read_csv("spam.csv", encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Drop the unnecessary columns
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.rename(columns={'v1':'label', 'v2':'text'})

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [27]:
# Example data cleanup
df['v2'] = df['v2'].str.lower()  # Convert to lowercase
df['v2'] = df['v2'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation


In [9]:
# Vectorization with TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')


In [28]:
# Create feature vectors
X = vectorizer.fit_transform(df['v2'])

# Labels
y = df['v1']

In [11]:
df['v2']

Unnamed: 0,v2
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in 2 a wkly comp to win fa cup fina...
3,u dun say so early hor u c already then say
4,nah i dont think he goes to usf he lives aroun...
...,...
5567,this is the 2nd time we have tried 2 contact u...
5568,will ì_ b going to esplanade fr home
5569,pity was in mood for that soany other suggest...
5570,the guy did some bitching but i acted like id ...


In [29]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Check the shapes of the resulting sets
print("Training set size:", X_train.shape)
print("Test set size:", y_train.shape)


Training set size: (4457, 9259)
Test set size: (4457,)


In [30]:
# Model training
model = LogisticRegression()
model.fit(X_train, y_train)


In [18]:
# Evaluation
predictions = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, predictions))

Accuracy: 0.9399103139013453


### **Predictions**

In [26]:
# Sample new messages
new_messages = ["Congratulations! You've won a free ticket to the Bahamas.",
                "Can we meet tomorrow at 10 AM for the project discussion?"]

# Preprocess the new messages (same as before)
new_messages_cleaned = [msg.lower().replace(r'[^\w\s]', '') for msg in new_messages]

# Transform the new messages using the same TF-IDF vectorizer
new_messages_vectors = vectorizer.transform(new_messages_cleaned)

# Use the trained model to predict
predictions = model.predict(new_messages_vectors)

# Print the predictions
for msg, pred in zip(new_messages, predictions):
    print(f"Message: {msg} \nPredicted label: {pred}\n")


Message: Congratulations! You've won a free ticket to the Bahamas. 
Predicted label: spam

Message: Can we meet tomorrow at 10 AM for the project discussion? 
Predicted label: ham



In [32]:
import pickle

# Assume 'model' is your trained machine learning model
# Assume 'tokenizer' is your trained TF-IDF vectorizer or any other tokenizer

# Save the model to a file
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the tokenizer (e.g., TF-IDF vectorizer) to a file
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(vectorizer, tokenizer_file)

print("Model and tokenizer saved successfully!")


Model and tokenizer saved successfully!
