In [9]:
# train_spam_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import joblib

# Load the SMS Spam dataset from a local CSV file.
# NOTE: despite its name, `url` holds an absolute, machine-specific filesystem path;
# point it at your own copy of spam.csv before running this cell on another machine.
url = "C:\\Users\\singh\\Downloads\\data_set\\spam.csv"

# The UCI SMS Spam dataset usually has columns: v1 (label), v2 (message)
data = pd.read_csv(url, encoding="latin-1")

# Rename columns for clarity
data = data.rename(columns={"v1": "label", "v2": "message"})

# rename() silently ignores keys that are not present, so confirm the columns the rest
# of the cell relies on actually exist instead of failing later with a bare KeyError.
missing_columns = [col for col in ("label", "message") if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"Expected column(s) {missing_columns} not found in {url!r}; "
        f"available columns: {list(data.columns)}"
    )

# Map labels to binary (ham=0, spam=1)
label_num = data['label'].map({'ham': 0, 'spam': 1})

# Series.map() turns every value missing from the mapping into NaN. Surface those rows
# here with the offending values rather than letting NaN targets reach the estimator.
unmapped = label_num.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a label other than 'ham'/'spam': "
        f"{data.loc[unmapped, 'label'].unique().tolist()}"
    )

data['label_num'] = label_num

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, 'message'], data.loc[:, 'label_num'], test_size=0.2, random_state=42
)

# Two-step pipeline: token counts from CountVectorizer feed a multinomial Naive Bayes
# classifier, so both steps are fitted (and later saved) as a single object.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", MultinomialNB()),
])

# Fit both steps on the training messages and their numeric labels
pipeline.fit(X_train, y_train)

# Write the fitted pipeline to disk in the current working directory
joblib.dump(pipeline, filename="spam_model.pkl")

print("✅ Model trained and saved as spam_model.pkl")


✅ Model trained and saved as spam_model.pkl


In [1]:
import pandas as pd

# Location of the raw SMS spam CSV on the local machine
file_path = r"C:\Users\singh\Downloads\data_set\spam.csv"

# Read the file as latin-1 using read_csv's default comma delimiter.
# If the file turns out to be tab-separated, pass sep='\t' to read_csv instead.
data = pd.read_csv(file_path, encoding='latin-1')

# Preview the first five rows as plain text
print(data.head(n=5))


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
