In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [None]:
# Load the raw SMS dataset; the file is decoded as latin1
# (presumably because it is not valid UTF-8 - TODO confirm).
csv_path = "/content/spam.csv"
df = pd.read_csv(csv_path, encoding="latin1")

In [None]:
# Preview the first rows to see which columns the CSV provides.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# (rows, columns) of the raw frame.
df.shape

(5572, 5)

In [None]:
# Column dtypes and non-null counts; in the recorded output the three
# 'Unnamed: N' columns are almost entirely empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
# Keep only the label and text columns and give them descriptive names.
# Renaming first and then selecting by the *new* names keeps this cell
# re-runnable: on a second run the rename is a no-op and the selection still
# succeeds, instead of raising KeyError on the no-longer-present 'v1'/'v2'.
# .copy() makes df an independent frame rather than a selection of the raw one,
# so later column assignments act on df itself.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']].copy()

In [None]:
# Inspect the label column (still the raw 'ham' / 'spam' strings at this point).
df['label']

Unnamed: 0,label
0,ham
1,ham
2,spam
3,ham
4,ham
...,...
5567,spam
5568,ham
5569,ham
5570,ham


In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on already-encoded labels leaves them as 0/1 instead of
# silently turning the whole column into NaN.
label_codes = {'ham': 0, 'spam': 1}
encoded_label = df['label'].map(lambda value: label_codes.get(value, value))
# Fail loudly on anything outside the expected classes rather than training on it.
assert encoded_label.isin([0, 1]).all(), "unexpected values in the 'label' column"
# assign() returns a new frame instead of mutating a column in place.
df = df.assign(label=encoded_label)

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

403

In [None]:
# Remove repeated rows; drop_duplicates keeps the first occurrence by default.
df = df.drop_duplicates()

In [None]:
# Class distribution of the encoded label (0 = ham, 1 = spam).
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4516
1,653


In [None]:
# Hold out 20% of the messages for evaluation.
# The classes are imbalanced (far fewer spam than ham messages), so the split is
# stratified on the label to keep the same spam ratio in the train and test
# sets; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Convert text to numerical features using TF-IDF, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit the vectorizer on the training messages only and vectorize them.
X_train_tfidf = vectorizer.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test messages (transform, not fit)
# so nothing from the test set influences the learned features.
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# All three ensembles share the same size and seed, so define them once.
ensemble_params = {'n_estimators': 50, 'random_state': 2}
rfc = RandomForestClassifier(**ensemble_params)
abc = AdaBoostClassifier(**ensemble_params)
gbdt = GradientBoostingClassifier(**ensemble_params)

In [None]:
# Display name -> estimator, in the order the models are evaluated.
clfs = dict(RF=rfc, AdaBoost=abc, GBDT=gbdt)

In [None]:
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and score it on the test data.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels used to fit ``clf``.
    X_test, y_test : features and labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` as returned by ``accuracy_score`` and
        ``precision_score`` for the test-set predictions.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )

In [None]:
# Fit each candidate model on the TF-IDF training features, score it on the
# held-out set, and collect the metrics per model name.
results = {}
for name, clf in clfs.items():
    acc, prec = train_classifier(
        clf,
        X_train_tfidf,
        y_train,
        X_test_tfidf,
        y_test,
    )
    results[name] = dict(Accuracy=acc, Precision=prec)
    print(f"{name} -> Accuracy: {acc * 100:.2f}%, Precision: {prec * 100:.2f}%")

RF -> Accuracy: 97.78%, Precision: 100.00%
AdaBoost -> Accuracy: 89.46%, Precision: 95.00%
GBDT -> Accuracy: 94.68%, Precision: 100.00%


In [None]:
# Persist the fitted vectorizer together with the dict of trained classifiers
# as a single (vectorizer, clfs) tuple.
# NOTE: only unpickle this file from a trusted source - pickle.load can run
# arbitrary code.
artifacts = (vectorizer, clfs)
with open("spam_classifier.pkl", "wb") as file:
    pickle.dump(artifacts, file)

print("Models saved as spam_classifier.pkl")

Models saved as spam_classifier.pkl
