# Spam detection using Random Forest

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

## Load the data

In [9]:
# Read the raw SMS file, keep the message column (v2) and the label column (v1),
# and give them descriptive names. The whole step is one chain that builds a new
# frame, so nothing is mutated in place.
# encoding_errors='ignore' silently drops bytes that cannot be decoded.
# NOTE(review): if the file is Latin-1 encoded, encoding='latin-1' would preserve
# those characters instead of discarding them -- confirm against the data file.
df = (
    pd.read_csv('sms_spam.csv', encoding_errors='ignore')
    .loc[:, ['v2', 'v1']]
    .rename(columns={"v2": "text", "v1": "label"})
)
df.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


## Split the dataset into training and testing sets

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

## TF-IDF

### TF-IDF only

In [15]:
# Reference only (intentionally not executed): the bare TF-IDF steps.
# Kept as comments rather than a string literal so the cell produces no output,
# and written against the X_train / X_test names defined by the split above.
#
# # Initialize the TF-IDF vectorizer
# tfidf_vectorizer = TfidfVectorizer()
#
# # Fit and transform the training data
# train_vectors = tfidf_vectorizer.fit_transform(X_train)
#
# # Transform the testing data
# test_vectors = tfidf_vectorizer.transform(X_test)

## TF-IDF + CountVectorizer
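It is common to combine TF-IDF with a count vectorizer. `TfidfVectorizer` does both in one step: it is equivalent to `CountVectorizer` followed by `TfidfTransformer`.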

In [37]:
# Learn the vocabulary and IDF weights from the training texts only, so that
# nothing from the test set leaks into the features.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

# Reuse the already-fitted vocabulary and weights on the test texts
# (transform, not fit_transform).
X_test_vectorized = vectorizer.transform(X_test)

In [38]:
# Display the dimensions of the vectorized training matrix as (rows, columns).
X_train_vectorized.shape

(4457, 7704)

## Training the model

In [39]:
# Create a Random Forest classifier
clf = RandomForestClassifier()

# Train the classifier on the TF-IDF transformed training data and corresponding labels
clf.fit(X_train_vectorized, y_train)

In [41]:
# Label the held-out messages with the trained forest.
y_pred = clf.predict(X_test_vectorized)

# Fraction of test messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9766816143497757


## Building a pipeline
Instead of performing vectorization and training separately, we can combine them in a pipeline.

In [49]:
# Chain vectorization and classification so raw text goes in and labels come out.
# The forest is seeded so repeated runs give the same model.
clf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Fitting the pipeline fits the vectorizer on X_train, then trains the forest
# on the vectorized output.
clf_pipeline.fit(X_train, y_train)

# Predicting applies the already-fitted vectorizer to X_test before classifying.
# (Fixes the misspelled `clf_pipelinelf_pipeline` name, which raised NameError
# and left y_pred holding the predictions from the earlier, non-pipeline model.)
y_pred = clf_pipeline.predict(X_test)

In [50]:
# Per-class precision, recall and F1 for the most recent predictions in y_pred.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

