In [1]:
# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# One-time setup: uncomment the lines below to download the NLTK resources ('punkt', 'stopwords') used in this notebook
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
#nltk.download('punkt')
#nltk.download('stopwords')

def preprocess_input(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic are dropped, English stop words are removed,
    and the remaining tokens are stemmed with ``PorterStemmer``.

    Args:
        text: Raw text to clean. Values that are not strings (for example
            ``None`` or a float NaN) and blank strings are treated as empty
            input.

    Returns:
        str: The processed tokens joined by single spaces, or ``""`` when
        there is nothing to process.
    """
    # Guard: ``text.lower()`` would raise AttributeError on None/NaN, and a
    # blank string has no tokens anyway, so short-circuit both cases.
    if not isinstance(text, str) or not text.strip():
        return ''

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # ``isalpha()`` discards punctuation and any token containing digits or
    # symbols; stop words are filtered before stemming, as in the original
    # three-pass version.
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

    return ' '.join(tokens)


In [3]:
# Location of the CSV that is loaded into the DataFrame below
data = "https://raw.githubusercontent.com/AliceLiu17/csc448_final/main/data/preprocessed_dataset/preprocessed_english.csv"

# Fetch the CSV over HTTP and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=data)
# Tip: df.head() previews the first rows

In [4]:
# Vectorizers: `cv` is created for an alternative bag-of-words approach and is
# not used in this cell; the TF-IDF vectorizer is capped at 3000 features.
cv = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer(max_features=3000)

# Text documents and labels. Empty CSV fields are parsed by pandas as NaN,
# which TfidfVectorizer rejects, so treat them as empty documents.
emails = df['processed_email'].fillna('')
y = df['label'].values  # Assign labels to 'y'

# Split the *raw text* first so that the vectorizer never sees the test
# documents while learning its vocabulary and IDF weights (fitting it on the
# full corpus before splitting leaks test-set information into training).
emails_train, emails_test, y_train, y_test = train_test_split(
    emails, y, test_size=0.20, random_state=45
)

# Fit TF-IDF on the training fold only, then apply it to the test fold
X_train = tfidf_vectorizer.fit_transform(emails_train)
X_test = tfidf_vectorizer.transform(emails_test)

# Full-corpus feature matrix, kept under its original name in case other
# cells reference it; it is produced by the train-fitted vectorizer.
X_tfidf = tfidf_vectorizer.transform(emails)

# Random forest with 50 trees and a fixed seed for reproducibility
rfc = RandomForestClassifier(n_estimators=50, random_state=2)

# Train on the training fold and predict the held-out fold
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Evaluate on the held-out fold and report the scores
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")

In [7]:
import ipywidgets as widgets
from IPython.display import display

# Function to predict if input text is spam or not
def predict_spam(sample_text):
    """Classify a piece of raw text as spam or not spam.

    The text is cleaned with ``preprocess_input``, converted to features with
    ``tfidf_vectorizer.transform`` and passed to ``rfc.predict``.

    Args:
        sample_text: Raw text to classify.

    Returns:
        str: "This is a SPAM email." when the predicted label equals 1,
        otherwise "This is NOT a spam email.".
    """
    cleaned = preprocess_input(sample_text)

    # The vectorizer expects an iterable of documents, hence the one-item list
    features = tfidf_vectorizer.transform([cleaned])
    label = rfc.predict(features)[0]

    return "This is a SPAM email." if label == 1 else "This is NOT a spam email."

# Single-line text box where the user types the text to classify
text_input = widgets.Text(
    description='Sample Text:',
    placeholder='Type your sample text here',
    value='',
    disabled=False,
)

# Function to handle user input and display prediction
def on_button_click(b):
    """Handle a click on the Predict button and show the result.

    Reads the current value of ``text_input``, classifies it with
    ``predict_spam`` and prints the message into the ``output`` widget.
    The previous message is cleared first so that only the result for the
    latest click is visible.

    Args:
        b: The object passed by the button's ``on_click`` callback; unused.
    """
    sample_text = text_input.value

    # Blank input carries nothing to classify, so ask for text instead of
    # sending an empty document to the model.
    if sample_text.strip():
        message = predict_spam(sample_text)
    else:
        message = "Please enter some text to classify."

    # wait=True defers the clear until the new message arrives (less flicker)
    output.clear_output(wait=True)
    with output:
        print(message)

# Output area that receives whatever the click handler prints
output = widgets.Output()

# "Predict" button wired to the click handler
button = widgets.Button(description="Predict")
button.on_click(on_button_click)

# Render the text box, the button and the output area together
display(text_input, button, output)

Text(value='', description='Sample Text:', placeholder='Type your sample text here')

Button(description='Predict', style=ButtonStyle())

Output()