In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
df=pd.read_csv('/content/spam.csv')

In [6]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocessing

In [7]:
#converting 'ham' and 'spam' to numerical labels
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})

In [10]:
#converting text messages to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X = vectorizer.fit_transform(df['Message'])


Splitting df to test&train

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, df['Category'], test_size=0.2, random_state=42)

Logistic Regression

In [12]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9775784753363229


In [14]:
#Example

In [15]:
new_phrase = "HI I miss you :)"
X_new = vectorizer.transform([new_phrase])
prediction = model.predict(X_new)
if prediction[0] == 1:
    print(f"The phrase '{new_phrase}' is predicted as 'spam'.")
else:
    print(f"The phrase '{new_phrase}' is predicted as 'ham'.")

The phrase 'HI I miss you :)' is predicted as 'ham'.


Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9865470852017937


In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [21]:
new_phrase = "Claim your money now :DD!"
X_new = vectorizer.transform([new_phrase])
prediction = model.predict(X_new)
if prediction[0] == 1:
    print(f"The phrase '{new_phrase}' is predicted as 'spam'.")
else:
    print(f"The phrase '{new_phrase}' is predicted as 'ham'.")

The phrase 'Claim your money now :DD!' is predicted as 'spam'.


# Adaboosting classifier

In [22]:
from sklearn.ensemble import AdaBoostClassifier

In [23]:
model = AdaBoostClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9775784753363229


In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.95      0.88      0.91       149

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [27]:
new_phrase = "You have won an IPhone, claim your prize now :DDD!"
X_new = vectorizer.transform([new_phrase])
prediction = model.predict(X_new)
if prediction[0] == 1:
    print(f"The phrase '{new_phrase}' is predicted as 'spam'.")
else:
    print(f"The phrase '{new_phrase}' is predicted as 'ham'.")

The phrase 'You have won an IPhone, claim your prize now :DDD!' is predicted as 'spam'.


# KNN Classifier

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

In [30]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9309417040358744


In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       966
           1       1.00      0.48      0.65       149

    accuracy                           0.93      1115
   macro avg       0.96      0.74      0.81      1115
weighted avg       0.94      0.93      0.92      1115



In [35]:
new_phrase = "Hi I love you!"
X_new = vectorizer.transform([new_phrase])
prediction = model.predict(X_new)
if prediction[0] == 1:
    print(f"The phrase '{new_phrase}' is predicted as 'spam'.")
else:
    print(f"The phrase '{new_phrase}' is predicted as 'ham'.")

The phrase 'Hi I love you!' is predicted as 'ham'.
