In [1]:
import nltk
import pandas as pd
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [13]:
import pandas as pd
import nltk

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')

# Set display option for wider text columns
pd.set_option('display.max_colwidth', 100)

# Load the first dataset
messages1 = pd.read_csv('spam.csv', encoding="latin-1")

# Keep only the first two columns and rename
messages1 = messages1.iloc[:, :2]
messages1.columns = ['label', 'text']

# Load the second dataset
messages2 = pd.read_csv('spam_1.csv', encoding="latin-1")

# Keep only the first two columns and rename
messages2 = messages2.iloc[:, :2]
messages2.columns = ['label', 'text']

# Combine both datasets
messages = pd.concat([messages1, messages2], ignore_index=True)

# Check for null values
print('Number of nulls in label:', messages['label'].isnull().sum())
print('Number of nulls in text:', messages['text'].isnull().sum())

# Check the distribution of labels
print(messages['label'].value_counts())

# Define stopwords
stopwords = nltk.corpus.stopwords.words('english')


Number of nulls in label: 0
Number of nulls in text: 0
label
ham     9650
spam    1494
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to C:\Users\Prakash
[nltk_data]     Raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Function to clean text
def cleantext(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\\W+', text)
    text = [word for word in tokens if word.lower() not in stopwords and word]  # Remove empty strings
    return ' '.join(text)

# Vectorize the text
# Disable token_pattern warning by setting it to None (because we are using custom tokenizer)
tfidf = TfidfVectorizer(tokenizer=lambda x: cleantext(x).split(), token_pattern=None)
x_tfidf = tfidf.fit_transform(messages['text'])

# Create DataFrame from TF-IDF features
x_features = pd.DataFrame(x_tfidf.toarray())

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(messages['label'])

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x_features, y_encoded, test_size=0.2, random_state=42)

In [16]:
# Train the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(x_train, y_train)

# Make predictions
y_pred = rf_model.predict(x_test)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, pos_label=1)  
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)
print('Precision: {} / Recall: {} / F1-Score: {}'.format(round(precision, 3), round(recall, 3), round(f1, 3)))


Precision: 1.0 / Recall: 0.972 / F1-Score: 0.986


In [None]:
# Test with a new example message
text = ["Hey, are we still on for dinner tonight? Let me know when you're free."]
text_tfidf = tfidf.transform(text)
x_features = pd.DataFrame(text_tfidf.toarray())
y_pred = rf_model.predict(x_features)
predicted_label = label_encoder.inverse_transform(y_pred)  
print('Prediction for "{}": {}'.format(text[0], predicted_label[0]))

   0     1     2     3     4     5     6     7     8     9     ...  9554  \
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

   9555  9556  9557  9558  9559  9560  9561  9562  9563  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[1 rows x 9564 columns]
Prediction for "Hey, are we still on for dinner tonight? Let me know when you're free.": ham
