In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import re as re
from bs4 import BeautifulSoup


# Function to remove tags
def remove_tags(html):

    # parse html content
    soup = BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        # Remove tags
        data.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

def clean_special_characters(message):
    characters_to_remove = ['-', '<', '$', '*','\n','>']
    for char in characters_to_remove:
        message = message.replace(char, '')
    return message



# Load the datasets
training_data_path = 'Email_Dataset.csv'  
test_data_path = 'Email Test Data.csv'    

emails_df = pd.read_csv(training_data_path)
test_data_df = pd.read_csv(test_data_path)

emails_df['MESSAGE']=emails_df["MESSAGE"].apply(remove_tags).apply(clean_special_characters)
test_data_df['MESSAGE']=test_data_df['MESSAGE'].apply(remove_tags).apply(clean_special_characters)



# Preprocess the email content using TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(emails_df['MESSAGE']).toarray()

# Setup the target variable
y = emails_df['CATEGORY'].apply(lambda x: 1 if x.lower() == 'spam' else 0)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
log_reg = LogisticRegression(max_iter=1000) 
log_reg.fit(X_train, y_train)

# Evaluate the model
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Training Accuracy: {train_accuracy}')
print(f'Testing Accuracy: {test_accuracy}')

# Preprocess the test data
X_test_data = vectorizer.transform(test_data_df['MESSAGE']).toarray()

# Make predictions on the test data
test_data_predictions = log_reg.predict(X_test_data)

# Save the predictions
test_data_df['spam'] = test_data_predictions
#output_path = 'Q7predictspamornot.csv'  # Update this path as needed
#test_data_df.to_csv(output_path, index=False)




  soup = BeautifulSoup(html, "html.parser")
  soup = BeautifulSoup(html, "html.parser")


Training Accuracy: 0.986875
Testing Accuracy: 0.975
