In [20]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [23]:
# Step 2: Read the mail dataset from CSV and replace missing values with empty strings
# NOTE(review): absolute Codespaces path — consider a configurable data directory.
mail_data = pd.read_csv('/workspaces/codespaces-blank/mail_data.csv')
mail_data = mail_data.fillna('')

# Step 4: Encode the labels numerically: 'spam' -> 1, every other value -> 0
mail_data['Category'] = mail_data['Category'].eq('spam').astype(int)

# Step 5: Pull out the message texts (inputs) and the encoded labels (targets)
X, Y = mail_data['Message'], mail_data['Category']

# Step 6: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)


In [24]:
# Step 7: Turn the raw message text into TF-IDF feature vectors
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training messages only, then reuse the fitted vectorizer for the test messages
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [25]:
# Step 8: Fit a Logistic Regression classifier on the training features
# (fit() returns the estimator itself, so construction and training can be chained)
model = LogisticRegression().fit(X_train_features, Y_train)
model  # bare expression so the cell still displays the fitted estimator

In [26]:
# Step 10: Score the trained model on the held-out test split
test_predictions = model.predict(X_test_features)
test_accuracy = accuracy_score(Y_test, test_predictions)
print('Accuracy on test data:', test_accuracy)

Accuracy on test data: 0.9659192825112107


In [27]:
# Step 11: Make a prediction on new input mail
input_mail = ["I've been searching for the right words to thank you for this breather. I promise I won't take your help for granted and will fulfill my promise. You have been wonderful and a blessing at all times"]

# Step 12: Convert text to feature vectors with the vectorizer fitted on the training data
input_data_features = vectorizer.transform(input_mail)

# Step 13: Make prediction (one label per input mail)
prediction = model.predict(input_data_features)

# Step 14: Print the result
# The labels were encoded as spam -> 1 and ham -> 0 when the data was prepared,
# so a predicted 1 means spam and anything else means ham.
if prediction[0] == 1:
    print('Warning! This might be a Spam mail!')
else:
    print('This looks like a Ham (non-spam) mail!')

