In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Data collection and Preprocessing

In [None]:
# Read the raw mail dataset from the CSV file located next to the notebook.
mail = pd.read_csv(filepath_or_buffer='mail_data.csv')
# Preview the first five rows (head()'s default) as the cell output.
mail.head(n=5)

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
mail.isna().sum()

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
mail.shape

In [None]:
# label Encoding on category column
# spam - 0
# ham - 1

In [None]:
# Encode the Category labels as integers: spam -> 0, ham -> 1.
# A per-value lookup is used instead of DataFrame.replace(..., inplace=True):
#   * values without an entry in the lookup (including labels that are already
#     encoded as 0/1) pass through unchanged, so re-running this cell is harmless;
#   * it does not rely on replace() silently downcasting an object column to
#     integers, a behavior that newer pandas releases deprecate.
category_codes = {'spam': 0, 'ham': 1}
mail['Category'] = mail['Category'].map(lambda label: category_codes.get(label, label))

In [None]:
# Preview the first rows again to check the Category column after label encoding.
mail.head()

In [None]:
# separating the data into the text column (Message) and the label column (Category)

In [None]:
# X holds the raw message text (model input); Y holds the category label (target).
X, Y = mail['Message'], mail['Category']


In [None]:
# Inspect the message texts that will be used as model input.
print(X)

In [None]:
# Inspect the category labels that will be used as the prediction target.
print(Y)

In [None]:
# splitting test data and training data

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Print the shapes of the full message set, the training split and the test
# split on one line, in that order.
print(*(messages.shape for messages in (X, X_train, X_test)))

In [None]:
# feature extraction

In [None]:
# transform the text data to feature vectors that can be used as input to the logistic regression
# TF-IDF :- 1) Term Frequency-Inverse Document Frequency
#           2) it scores each word by how often it appears in a document, scaled down for words that appear in many documents

In [None]:
# Configure the TF-IDF vectorizer: lower-case the text, drop English stop
# words, and use a minimum document frequency of 1.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [None]:
# Fit the vectorizer on the training messages only and convert them to
# TF-IDF feature vectors in the same call.
X_train_feature = feature_extraction.fit_transform(X_train)
# The test messages are only transformed with the already-fitted vectorizer.
# It is deliberately not fitted on X_test, so that nothing learned from the
# test data leaks into the features the model is trained on.
X_test_feature = feature_extraction.transform(X_test)

In [None]:
# Inspect the feature matrix produced from the training messages.
print(X_train_feature)

In [None]:
# Inspect the feature matrix produced from the test messages.
print(X_test_feature)

In [None]:
# converting the Y_train and Y_test labels to an integer dtype

In [None]:
# Cast both label splits to an integer dtype before they are given to the classifier.
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)

In [None]:
# Inspect the training labels after the integer cast.
print(Y_train)

In [None]:
# Logistic Regression

In [None]:
# NOTE(review): LogisticRegression is already imported in the first cell of
# this notebook; this repeated import is redundant and could be dropped.
from sklearn.linear_model import LogisticRegression

In [None]:
# Create a logistic regression classifier with scikit-learn's default
# settings (no arguments are passed).
model = LogisticRegression()

In [None]:
# training the LR model with training data

In [None]:
# Fit the classifier on the training feature vectors and their integer labels.
model.fit(X_train_feature, Y_train)

In [None]:
# evaluating the model: accuracy on the training data

In [None]:
# Accuracy on the data the model was fitted on, printed as a percentage.
# For a classifier, model.score(X, y) is the accuracy of model.predict(X)
# against y, so it is computed here with accuracy_score explicitly.
train_accuracy = accuracy_score(Y_train, model.predict(X_train_feature))
print('Accuracy: ', train_accuracy*100)

In [None]:
# evaluating the model: accuracy on the test data

In [None]:
# Accuracy on the held-out test split, printed as a percentage.
# For a classifier, model.score(X, y) is the accuracy of model.predict(X)
# against y, so it is computed here with accuracy_score explicitly.
test_accuracy = accuracy_score(Y_test, model.predict(X_test_feature))
print('Accuracy: ', test_accuracy*100)

In [None]:
# building a predictive system 

In [None]:
# Messages to classify; more entries can be added to this list.
input_mail = ["Nah I don't think he goes to usf, he lives around here though"]

# Convert the raw text to feature vectors with the vectorizer that was fitted
# on the training data (transform only, so the vectorizer is not refitted).
input_data_features = feature_extraction.transform(input_mail)

# Predict one label per message in input_mail.
mail_prediction = model.predict(input_data_features)
print(mail_prediction)

# model.predict returns an array with one entry per input message. Each scalar
# label is checked on its own instead of using the whole array as the `if`
# condition, which raises ValueError as soon as input_mail holds more than one
# message.
for predicted_label in mail_prediction:
    if predicted_label == 1:  # 1 is the code assigned to 'ham' during label encoding
        print("It is an ham mail")
    else:  # any other label is reported as spam (0 in the label encoding)
        print("It is an spam mail")
    