In [2]:
import numpy as npp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Data collection and pre-processing
# 1. Load the data from the CSV file into a pandas DataFrame.
# The path is written as a raw string: a Windows path contains backslashes,
# and sequences such as '\M' and '\m' are not valid escape sequences in a
# normal string literal (modern Python warns about them and will eventually
# reject them). The resulting path value is unchanged.
df = pd.read_csv(r'D:\Machine Learning Projects\mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Keep every non-null cell as it is and substitute an empty string
# wherever a value is missing.
df = df.where(df.notna(), other='')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Encode the text labels numerically, one label at a time: spam -> 0, ham -> 1.
for category_name, category_code in (('spam', 0), ('ham', 1)):
    df.loc[df['Category'] == category_name, 'Category'] = category_code

In [10]:
# Separate the message text (features) from the category column (labels).
X, Y = df['Message'], df['Category']

In [11]:
# Split the dataset: 80% for training, 20% held out for testing.
# A fixed random_state keeps the split the same on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Feature extraction

# Turn the raw message text into TF-IDF feature vectors that the
# logistic regression model can take as input.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then apply that same
# fitted vectorizer to the test messages.
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)

In [13]:
# Cast the training and test labels to integers.

Y_Train, Y_test = (labels.astype('int') for labels in (y_train, y_test))

In [14]:
# Train the model: fit a logistic regression classifier on the
# vectorized training messages and their integer labels.
model = LogisticRegression()
model.fit(X=X_train_feature, y=Y_Train)

In [15]:
# Evaluate the trained model.
# Predict on the training data and compare against the training labels.
pred = model.predict(X=X_train_feature)
accuracy_on_training_data = accuracy_score(y_true=Y_Train, y_pred=pred)

In [16]:
print("Accuracy for Training : ",accuracy_on_training_data * 100)

# Training accuracy is measured on data the model has already seen, so it
# overstates real performance. Also score the held-out test split, which was
# vectorized earlier but never evaluated.
accuracy_on_test_data = accuracy_score(Y_test, model.predict(X_test_feature))
print("Accuracy for Test : ", accuracy_on_test_data * 100)

Accuracy for Training :  96.61207089970833


In [17]:
# Building a predictive system
# `input_mail` is a one-element list holding the raw text of the message to
# classify. The text (including its line breaks and indentation) is passed to
# the vectorizer exactly as written here.
input_mail = ["""Hi future Data Scientist,
We’re nearing the end of our anniversary sale, and this is your last opportunity to join the transformative 
              SuperDataScience 3.0 at the reduced price of $207 for an entire year. After today, the price 
              is back to full price. SuperDataScience 3.0 isn't just about learning; it's about becoming part 
              of a bigger movement in the tech world. Experts predict a 40% job growth in AI and ML by 2027. 
              Do you want to miss it?Join hundreds of data scientists who are enhancing their 
              careers with our platform. Tomorrow, the cost to access this world-class education
               will increase, but your chance to excel remains—only more expensive. Make the smart 
              choice; invest in your future now.
Sign up now
Don’t let this moment pass. The future of data science is vibrant and filled with 
              opportunites, but only for those who are prepared.
"""]

# Convert the input text to feature vectors using the already-fitted vectorizer
input_data_feature = feature_extraction.transform(input_mail)

# Predict one label per input message (1 = ham, 0 = spam)
prediction = model.predict(input_data_feature)

print(prediction)

# Report each prediction individually. Comparing the whole array against [1]
# only works while there is exactly one message; with several messages the
# truth value of the element-wise comparison is ambiguous and raises ValueError.
for predicted_label in prediction:
    if predicted_label == 1:
        print("This is the Ham Mail.")
    else:
        print("This is the Spam Mail.")

[1]
This is the Ham Mail.
