In [1]:
# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_input(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Convert to lowercase
    
    # Remove punctuation
    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # Join tokens back into a string
    processed_text = ' '.join(tokens)
    
    return processed_text


[nltk_data] Downloading package punkt to /Users/lilyliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilyliang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = "https://raw.githubusercontent.com/AliceLiu17/csc448_final/main/data/preprocessed_dataset/preprocessed_english.csv"

df = pd.read_csv(data)
df.head()

Unnamed: 0,label,email,processed_email
0,0,"Go until jurong point, crazy.. Available only ...","['go', 'jurong', 'point', 'crazy', '..', 'avai..."
1,1,Free entry in 2 a wkly comp to win FA Cup fina...,"['free', 'entry', '2', 'wkly', 'comp', 'win', ..."
2,0,U dun say so early hor... U c already then say...,"['u', 'dun', 'say', 'early', 'hor', '...', 'u'..."
3,0,"Nah I don't think he goes to usf, he lives aro...","['nah', ""n't"", 'think', 'goes', 'usf', 'lives'..."
4,1,FreeMsg Hey there darling it's been 3 week's n...,"['freemsg', 'hey', 'darling', ""'s"", '3', 'week..."


In [3]:
cv = CountVectorizer()
tfid = TfidfVectorizer(max_features = 3000)

In [4]:
X = tfid.fit_transform(df['processed_email'])
y = df['label'].values

In [10]:
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 45)

In [14]:
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )  

In [15]:
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

In [16]:
accuracy

0.9774086378737542

In [17]:
precision

0.9649122807017544

In [22]:
import ipywidgets as widgets
from IPython.display import display

# Function to predict if input text is spam or not
def predict_spam(sample_text):
    # Assume preprocess_input() is your function to preprocess the input text
    preprocessed_text = preprocess_input(sample_text)
    
    # Assume predict_spam() is your function to predict using your model
    prediction = rfc.predict([preprocessed_text])
    
    if prediction[0] == 1:
        return "This is a SPAM email."
    else:
        return "This is NOT a spam email."

# Creating text input widget for user input
text_input = widgets.Text(
    value='',
    placeholder='Type your sample text here',
    description='Sample Text:',
    disabled=False
)

# Function to handle user input and display prediction
def on_button_click(b):
    sample_text = text_input.value
    prediction = predict_spam(sample_text)
    print(prediction)

# Creating a button for prediction
button = widgets.Button(description="Predict")
button.on_click(on_button_click)

# Displaying the widgets
display(text_input, button)


Text(value='', description='Sample Text:', placeholder='Type your sample text here')

Button(description='Predict', style=ButtonStyle())

ValueError: could not convert string to float: 'sell fruit'