In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [63]:
df = pd.read_csv('mail_data.csv')

In [64]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
df.shape

(5572, 2)

In [66]:
label = LabelEncoder()

In [68]:
df['mail'] = label.fit_transform(df.Category)

In [69]:
df.head()

Unnamed: 0,Category,Message,mail
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [70]:
df.mail.value_counts()

mail
0    4825
1     747
Name: count, dtype: int64

In [71]:
# label encoding 2
df.loc[df.Category == 'spam', 'Category',] = 0
df.loc[df.Category == 'ham', 'Category',] = 1

In [72]:
df.head()

Unnamed: 0,Category,Message,mail
0,1,"Go until jurong point, crazy.. Available only ...",0
1,1,Ok lar... Joking wif u oni...,0
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,1,U dun say so early hor... U c already then say...,0
4,1,"Nah I don't think he goes to usf, he lives aro...",0


In [73]:
df.mail = df.mail.replace({0:1, 1:0})

In [74]:
df.head()

Unnamed: 0,Category,Message,mail
0,1,"Go until jurong point, crazy.. Available only ...",1
1,1,Ok lar... Joking wif u oni...,1
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,1,U dun say so early hor... U c already then say...,1
4,1,"Nah I don't think he goes to usf, he lives aro...",1


In [75]:
X = df.Message
Y = df.Category

In [76]:
print(X)
print(Y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [77]:
# Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=1)

# Print shapes to verify correct splitting
print("Shapes after splitting - X_train:", X_train.shape, "X_test:", X_test.shape, "Y_train:", Y_train.shape, "Y_test:", Y_test.shape)


Shapes after splitting - X_train: (5014,) X_test: (558,) Y_train: (5014,) Y_test: (558,)


In [34]:
model = LogisticRegression()

In [78]:
# Feature Extraction
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Print shapes to verify correct transformation
print("Feature Shapes - X_train_features:", X_train_features.shape, "X_test_features:", X_test_features.shape)

Feature Shapes - X_train_features: (5014, 7978) X_test_features: (558, 7978)


In [79]:
# Convert Y to integer type
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# Print first few Y values to verify
print("Y_train sample:", Y_train.head())
print("Y_test sample:", Y_test.head())

Y_train sample: 1733    1
1567    1
558     1
3246    1
1406    1
Name: Category, dtype: int32
Y_test sample: 4226    1
2534    1
472     1
4502    1
3066    1
Name: Category, dtype: int32


In [80]:
# Model training
model = LogisticRegression()
model.fit(X_train_features, Y_train)

# Accuracy on training data
X_train_prediction = model.predict(X_train_features)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on training data:', training_data_accuracy)

# Accuracy on test data
X_test_prediction = model.predict(X_test_features)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on test data:', test_data_accuracy)

Accuracy on training data: 0.9696848823294775
Accuracy on test data: 0.967741935483871


In [87]:
# Making a prediction
input_data = ["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]

# Transform input data
input_data_features = feature_extraction.transform(input_data)
prediction = model.predict(input_data_features)

print('Prediction:', prediction)
if prediction[0] == 0:
    print('The message is spam')
else:
    print('The message is ham')


Prediction: [1]
The message is ham
