## Importing Dependencies

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Data Loading Module

In [8]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_path)

## Data Preprocessing Module

In [9]:
def preprocess_data(data):
    """Drop incomplete rows and encode the ``Category`` labels as integers.

    Rows containing any null value are removed. In the remaining rows the
    label ``'ham'`` is replaced by ``0`` and ``'spam'`` by ``1``; any other
    label value is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Category`` column. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without null rows and with encoded ``Category`` values.

    Raises
    ------
    KeyError
        If ``data`` has no ``Category`` column.
    """
    # Take an explicit copy: assigning through .loc on the direct result of
    # dropna() can emit SettingWithCopyWarning when rows were actually dropped.
    cleaned = data.dropna().copy()

    labels = cleaned['Category']

    # Compute both masks from the original labels before any value is replaced.
    is_ham = labels == 'ham'
    is_spam = labels == 'spam'

    # Encode on an object-typed copy of the column so the integer codes can be
    # stored next to any remaining text labels regardless of the column's
    # original (possibly string-specific) dtype.
    encoded = labels.astype(object)
    encoded[is_ham] = 0    # Ham mails: 0
    encoded[is_spam] = 1   # Spam mails: 1

    cleaned['Category'] = encoded
    return cleaned

## Data Splitting Module

In [10]:
def split_dataset(data, test_size=0.2, random_state=13):
    """Split the mails into training and test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Message`` column (the texts) and a ``Category`` column
        (the labels). The labels must be convertible to ``int``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 13
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` where the ``X`` parts hold the
        messages and the ``Y`` parts hold the labels cast to ``int``.
    """
    # Separating data as texts and labels
    messages = data['Message']
    labels = data['Category']

    # Splitting data into training set and test set
    X_train, X_test, Y_train, Y_test = train_test_split(
        messages, labels, test_size=test_size, random_state=random_state
    )

    # The labels are cast to integers after the split, as the model expects
    # numeric class labels rather than whatever dtype the column carried.
    return X_train, X_test, Y_train.astype('int'), Y_test.astype('int')

## Feature Extraction Module

In [11]:
def extract_features(train, test):
    """Turn raw texts into TF-IDF feature vectors.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.

    Parameters
    ----------
    train
        Training texts; used to fit the vectorizer and then transformed.
    test
        Test texts; transformed with the already fitted vectorizer.

    Returns
    -------
    tuple
        ``(train_features, test_features, vectorizer)`` — the two transformed
        inputs followed by the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

    train_features = vectorizer.fit_transform(train)
    test_features = vectorizer.transform(test)

    return train_features, test_features, vectorizer

## Model Training Module

In [12]:
def train_logistic_regression(X, Y):
    """Fit a ``LogisticRegression`` with default settings.

    Parameters
    ----------
    X
        Feature matrix passed to ``fit``.
    Y
        Target labels passed to ``fit``.

    Returns
    -------
    LogisticRegression
        The fitted classifier.
    """
    classifier = LogisticRegression()
    classifier.fit(X, Y)
    return classifier

## Model Evaluation Module

In [13]:
def evaluate_model(X, Y, classifier):
    """Compute the accuracy of ``classifier`` on the given data.

    Parameters
    ----------
    X
        Features handed to ``classifier.predict``.
    Y
        True labels the predictions are compared against.
    classifier
        Object exposing a ``predict`` method.

    Returns
    -------
    The value returned by ``accuracy_score(Y, predictions)``.
    """
    predicted_labels = classifier.predict(X)
    return accuracy_score(Y, predicted_labels)

## Outputting Module

In [14]:
def output_prediction(prediction):
    """Print a human-readable verdict for a prediction.

    Only the first element of ``prediction`` is inspected: a value equal to
    ``0`` is reported as not spam, anything else as spam.

    Parameters
    ----------
    prediction
        Indexable sequence whose first element is the predicted label.
    """
    is_ham = prediction[0] == 0
    message = 'The mail was not flagged as spam.' if is_ham else 'The mail was flagged as spam.'
    print(message)

# Building a Predictive System
## - Training and Evaluating Logistic Regression model

In [15]:
# Run the pipeline end to end: load -> clean -> split -> vectorise -> fit.
data = preprocess_data(load_data('mails.csv'))

X_train, X_test, Y_train, Y_test = split_dataset(data)

# From here on X_train / X_test hold the extracted feature vectors rather than
# the raw message texts; `extractor` is kept for transforming new mails later.
X_train, X_test, extractor = extract_features(X_train, X_test)

model = train_logistic_regression(X_train, Y_train)

# Report accuracy on the data the model was fitted on, then on the test split.
train_accuracy = evaluate_model(X_train, Y_train, model)
print('Accuracy on Training Data: ', train_accuracy)

test_accuracy = evaluate_model(X_test, Y_test, model)
print('Accuracy on Test Data: ', test_accuracy)

Accuracy on Training Data:  0.9681400044873233
Accuracy on Test Data:  0.9560538116591928


## - Implementing UI

In [16]:
# Read one mail from the user; it is wrapped in a list because the extractor
# is given a collection of documents rather than a single string.
mail_text = input()
input_mail = [mail_text]

# Vectorise the mail with the extractor fitted during training
input_mail_features = extractor.transform(input_mail)

# Classify the mail and report the verdict
prediction = model.predict(input_mail_features)
output_prediction(prediction)

 call on this number


The mail was not flagged as spam.
