In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on; the log below shows each call
# leaves an already-installed package as is.
nltk.download('punkt')      # presumably the tokenizer data behind word_tokenize — confirm for the installed NLTK version
nltk.download('stopwords')  # corpus read later via stopwords.words('english')
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Kaggle Dataset - https://www.kaggle.com/datasets/thedevastator/sms-spam-collection-a-more-diverse-dataset

# Path of the downloaded training CSV; edit this single constant if the file lives elsewhere.
DATA_PATH = '/content/train.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first 15 rows of the raw data.
df.head(n=15)

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [None]:
def lemmatized(text):
  """Return `text` with every token replaced by its WordNet lemma.

  The text is split with `word_tokenize`, each token is passed to
  `WordNetLemmatizer.lemmatize` (no part-of-speech tag is supplied, so the
  lemmatizer's default is used), and the results are joined back together
  with single spaces.
  """
  wordnet = WordNetLemmatizer()
  return ' '.join(wordnet.lemmatize(word) for word in word_tokenize(text))

# Lemmatize every message; the result is stored next to the raw text.
df['Processed_sms'] = df['sms'].map(lemmatized)

In [None]:
# English stop words from the NLTK corpus, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
  """Return `text` with all stop words removed.

  The text is tokenized with `word_tokenize`; a token is dropped when its
  lowercase form is in the module-level `stop_words` set. Surviving tokens
  keep their original casing and are joined with single spaces.
  """
  kept = [word for word in word_tokenize(text) if word.lower() not in stop_words]
  return ' '.join(kept)

# Strip stop words from the lemmatized text to get the final model input.
df['Final_sms'] = df['Processed_sms'].map(remove_stopwords)

In [None]:
# Show the raw, lemmatized and stop-word-filtered text side by side.
df

Unnamed: 0,sms,label,Processed_sms,Final_sms
0,"Go until jurong point, crazy.. Available only ...",0,"Go until jurong point , crazy .. Available onl...","Go jurong point , crazy .. Available bugis n g..."
1,Ok lar... Joking wif u oni...\n,0,Ok lar ... Joking wif u oni ...,Ok lar ... Joking wif u oni ...
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...
3,U dun say so early hor... U c already then say...,0,U dun say so early hor ... U c already then sa...,U dun say early hor ... U c already say ...
4,"Nah I don't think he goes to usf, he lives aro...",0,"Nah I do n't think he go to usf , he life arou...","Nah n't think go usf , life around though"
...,...,...,...,...
5569,This is the 2nd time we have tried 2 contact u...,1,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u. U £750 Pound prize...
5570,Will ü b going to esplanade fr home?\n,0,Will ü b going to esplanade fr home ?,ü b going esplanade fr home ?
5571,"Pity, * was in mood for that. So...any other s...",0,"Pity , * wa in mood for that . So ... any othe...","Pity , * wa mood . ... suggestion ?"
5572,The guy did some bitching but I acted like i'd...,0,The guy did some bitching but I acted like i '...,guy bitching acted like 'd interested buying s...


In [None]:
# Keep only the label and the fully preprocessed text. Reassigning (instead of
# dropping in place) together with errors='ignore' makes this cell safe to re-run:
# it no longer raises KeyError once the two columns are already gone.
df = df.drop(columns=['sms', 'Processed_sms'], errors='ignore')

In [None]:
# Confirm that only the label and the preprocessed text remain.
df

Unnamed: 0,label,Final_sms
0,0,"Go jurong point , crazy .. Available bugis n g..."
1,0,Ok lar ... Joking wif u oni ...
2,1,Free entry 2 wkly comp win FA Cup final tkts 2...
3,0,U dun say early hor ... U c already say ...
4,0,"Nah n't think go usf , life around though"
...,...,...
5569,1,2nd time tried 2 contact u. U £750 Pound prize...
5570,0,ü b going esplanade fr home ?
5571,0,"Pity , * wa mood . ... suggestion ?"
5572,0,guy bitching acted like 'd interested buying s...


In [None]:
# Features are the preprocessed messages; the target is the `label` column.
X, y = df['Final_sms'], df['label']

In [None]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
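# Disabled alternative: the same TF-IDF + random forest steps as the cells below,
# wrapped in a single sklearn Pipeline.
# model = Pipeline([
#     ('tfidf', TfidfVectorizer()),
#     ('Random Forest Classifier', RandomForestClassifier())
# ])

# model.fit(X_train, y_train)
# model.score(X_test, y_test)
# y_pred = model.predict(X_test)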

In [None]:
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split, so the test messages do not influence
# the learned vocabulary or weights.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Seed the forest so the fitted model and the metrics reported below are
# reproducible across runs (same seed value as the train/test split).
rf_clf = RandomForestClassifier(random_state=123)
model = rf_clf.fit(X_train_vectorized, y_train)  # fit() returns the estimator itself

In [None]:
# Predict labels for the held-out messages.
y_pred = model.predict(X=X_test_vectorized)

In [None]:
# Compute both evaluation summaries first, then print them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Accuracy:', accuracy)
print('Classification Report:\n', report)

Accuracy: 0.9829596412556054
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       956
           1       1.00      0.88      0.94       159

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

