In [1]:
# this notebook was taken from Lily's Notebook: Voila Implementation.ipynb
# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# taken from Alice's Notebook: alice/EDA_preprocessing_Alice.ipynb
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources this notebook needs (added by Emily so the tokenizer and
# stopword list are available when running locally).
# NOTE(review): the assignment below replaces the default HTTPS context factory with
# the unverified one, so code relying on that default no longer checks certificates.
# Presumably a workaround for local certificate errors -- confirm it is still needed.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# previously from our preprocessing data function 
def preprocess_input(text):
    """Turn raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text and tokenize it with ``word_tokenize``,
    keep only purely alphabetic tokens, drop English stopwords, stem what is
    left with ``PorterStemmer``, and join the stems with single spaces.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = [
        porter.stem(token)
        for token in lowered_tokens
        # isalpha() rejects punctuation and any token containing non-letters
        if token.isalpha() and token not in english_stopwords
    ]
    return ' '.join(stems)


[nltk_data] Downloading package punkt to /Users/emilykim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emilykim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the preprocessed dataset (CSV hosted in the project's GitHub repository)
data = "https://raw.githubusercontent.com/AliceLiu17/csc448_final/main/data/preprocessed_dataset/preprocessed_english.csv"

df = pd.read_csv(filepath_or_buffer=data)

In [4]:
# Vectorize the text, split into train/test sets and train the classifier.
#
# The split happens BEFORE the TF-IDF vectorizer is fitted: fitting on the whole
# corpus lets the vocabulary and IDF weights see the test emails, which leaks
# information into the features and makes the held-out metrics look better than
# they are. The vectorizer is therefore fitted on the training emails only.
cv = CountVectorizer()  # not used in this cell; kept in case another cell refers to `cv`

# Rows whose text was read back from the CSV as NaN would make the vectorizer raise;
# treat them as empty documents instead.
emails = df['processed_email'].fillna('')
y = df['label'].values

# Same test_size / random_state as before, so the same rows land in each split.
emails_train, emails_test, y_train, y_test = train_test_split(
    emails, y, test_size=0.20, random_state=45
)

tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X_train = tfidf_vectorizer.fit_transform(emails_train)  # learn vocabulary/IDF on train only
X_test = tfidf_vectorizer.transform(emails_test)        # reuse the fitted vocabulary
# Full-corpus matrix under its original name, for any code that still expects it.
X_tfidf = tfidf_vectorizer.transform(emails)

# Train a RandomForestClassifier to predict the labels.
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
rfc.fit(X_train, y_train)

# Held-out metrics (computed here, not displayed).
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

In [5]:
import ipywidgets as widgets
from IPython.display import display

# Classify one piece of text with the trained model
def predict_spam(sample_text):
    """Return a human-readable spam verdict for ``sample_text``.

    The text is cleaned with ``preprocess_input``, vectorized with the fitted
    ``tfidf_vectorizer`` and classified by ``rfc``. A predicted label of 1 is
    reported as spam; any other label is reported as not spam.
    """
    cleaned = preprocess_input(sample_text)
    # transform() expects an iterable of documents, hence the one-element list
    features = tfidf_vectorizer.transform([cleaned])
    label = rfc.predict(features)[0]
    return "This is a SPAM email." if label == 1 else "This is NOT a spam email."

# Text box where the user types the message to classify
text_input = widgets.Text(
    description='Sample Text:',
    placeholder='Type your sample text here',
    value='',
    disabled=False,
)

# Button callback: classify the current text and show the verdict
def on_button_click(b):
    """Handle a click on the Predict button.

    Reads the current value of ``text_input``, classifies it with
    ``predict_spam`` and prints the verdict into the ``output`` widget.
    The widget is cleared first so that only the verdict for the latest
    click is visible instead of an ever-growing list of earlier results.
    Blank input is not sent to the model; a short prompt is shown instead.

    Parameters
    ----------
    b : the object passed in by ``button.on_click``; not used here.
    """
    sample_text = text_input.value

    # Remove the previous verdict so results from earlier clicks don't pile up.
    output.clear_output()

    if not sample_text.strip():
        # Nothing to classify: avoid reporting a verdict for an empty message.
        message = "Please enter some text to classify."
    else:
        message = predict_spam(sample_text)

    # Printing inside the Output context routes the text to the widget.
    with output:
        print(message)

# Output area that the click handler prints the verdict into
output = widgets.Output()

# "Predict" button wired to the click handler
button = widgets.Button(description="Predict")
button.on_click(on_button_click)

# Render the text box, the button and the output area
display(text_input, button, output)

Text(value='', description='Sample Text:', placeholder='Type your sample text here')

Button(description='Predict', style=ButtonStyle())

Output()