# Necessary Libraries


In [15]:
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [16]:
 # The file contains characters that fail under the default encoding, so it is
 # read as cp1252; encoding='ISO-8859-1' is an alternative.

csv_path = '/content/spam.csv'
data = pd.read_csv(csv_path, encoding='cp1252')

In [17]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [18]:
# Count the missing values in each column.
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

The columns `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` are almost entirely missing (5522–5566 null values each),

so the best practice is to drop these columns.

In [19]:
# Remove the three unnamed columns from the dataset.
unnamed_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=unnamed_columns)

In [20]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

(5572, 2)

In [21]:
# Rename the columns to descriptive names: 'v1' -> 'target', 'v2' -> 'text'.
# The result is assigned back instead of using inplace=True, so the cell only
# rebinds `data` and never mutates a frame an earlier cell displayed.
data = data.rename(columns={'v1': 'target', 'v2': 'text'})
# Seeded so the preview shows the same five rows on every run.
data.sample(5, random_state=42)

Unnamed: 0,target,text
1499,spam,"SMS. ac JSco: Energy is high, but u may not kn..."
1503,ham,Ill be there on &lt;#&gt; ok.
1265,ham,Im in inperialmusic listening2the weirdest tra...
4217,ham,Actually i'm waiting for 2 weeks when they sta...
4429,ham,Yar lor... How u noe? U used dat route too?


In [22]:
# Encoder used to turn the class labels in the target column into integer codes.
encoder = LabelEncoder()

In [23]:
# Fit the encoder on the label column and overwrite it with the integer codes.
encoded_labels = encoder.fit_transform(data['target'])
data['target'] = encoded_labels

In [24]:
# Number of rows that exactly repeat an earlier row.
duplicate_rows = data.duplicated()
duplicate_rows.sum()

403

In [25]:
# Keep only the first occurrence of each repeated row.
data = data[~data.duplicated(keep='first')]

**The text column holds raw strings, but machine-learning models only understand numeric values, so the text is vectorized to convert each message into numeric features.**


In [26]:
# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])
y = data['target']

In [27]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [28]:
# Train a Decision Tree classifier.
# random_state is fixed because the tree randomly permutes candidate features
# at each split; without a seed, repeated runs can give different trees and
# different scores. 42 matches the seed used for the train/test split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [29]:
# Make predictions: one predicted label per row of the held-out test features.
y_pred = clf.predict(X_test)

**The classification report includes precision, recall, and F1-score.**

In [30]:
# Evaluate the model: build both summaries first, then print them in order.
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(confusion)
print(report)

[[1312   19]
 [  40  180]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1331
           1       0.90      0.82      0.86       220

    accuracy                           0.96      1551
   macro avg       0.94      0.90      0.92      1551
weighted avg       0.96      0.96      0.96      1551

