In [25]:
import pandas as pd
import numpy as np
import re
import string

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.linear_model import LogisticRegression

In [26]:
# Both files are tab-separated with no header row, so pandas assigns
# integer column labels.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

train, test = (
    pd.read_csv(tsv_path, sep="\t", header=None)
    for tsv_path in (train_file_path, test_file_path)
)

In [27]:
# Preprocessing
# Compiled once instead of on every call. IGNORECASE makes upper- or
# mixed-case links ("HTTP://...", "WWW....") match as well; the text is only
# lower-cased later, during stop-word removal, so without the flag such
# links would survive cleaning.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)


def remove_URL(text):
    """Return ``text`` with every URL-like token removed.

    A token counts as a URL when it starts with ``http://``, ``https://`` or
    ``www.`` (in any letter case); it is deleted up to the next whitespace
    character. The whitespace around the token is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched URLs replaced by the empty string.
    """
    return _URL_PATTERN.sub("", text)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # A translation table mapping a code point to None deletes that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# English stop-word list from NLTK, held in a set for fast membership tests.
english_stopwords = stopwords.words("english")
stop = set(english_stopwords)

def remove_stopwords(text):
    """Lower-case ``text`` and drop every word found in the ``stop`` set.

    The text is split on whitespace; the surviving words are re-joined with
    single spaces.
    """
    kept_words = []
    for token in text.split():
        lowered = token.lower()
        if lowered in stop:
            continue
        kept_words.append(lowered)
    return " ".join(kept_words)

In [28]:
# Preprocess data
df = pd.concat([train, test], ignore_index=True)

# Clean the text in column 1. Order matters: URLs are stripped first, then
# punctuation, then stop-words (the last step also lower-cases the text).
for cleaning_step in (remove_URL, remove_punct, remove_stopwords):
    df[1] = df[1].map(cleaning_step)

# Encode the labels in column 0 as integers.
le = LabelEncoder()
y = le.fit_transform(df[0])

In [29]:
# Keep the TF-IDF matrix in the sparse form that fit_transform returns.
# Densifying it with .toarray() allocates a full rows x vocabulary float
# array that is almost entirely zeros; train_test_split and
# LogisticRegression both accept the sparse matrix directly.
vectorizer = TfidfVectorizer()
X_vect = vectorizer.fit_transform(df[1])

In [30]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vect,
    y,
    test_size=0.33,
    random_state=42,
)

In [31]:
# Construct the classifier, then fit it on the training split.
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

In [32]:
# Mean accuracy on the held-out split (displayed as the cell's output).
model.score(X=X_test, y=y_test)

0.9467101685698749

In [33]:
def predict_message(pred_text):
  """Classify a single message with the fitted vectorizer and model.

  The message goes through the same cleaning steps as the training text
  (URL removal, punctuation removal, stop-word removal) before it is
  vectorized.

  Returns a two-element list ``[label, name]``: ``label`` is the model's
  predicted class as a Python scalar and ``name`` is ``'ham'`` when
  ``label < 0.5``, otherwise ``'spam'``.
  """
  cleaned = remove_stopwords(remove_punct(remove_URL(pred_text)))
  # transform() expects an iterable of documents, hence the one-item list.
  features = vectorizer.transform([cleaned]).toarray()
  label = model.predict(features).item()
  if label < 0.5:
    return [label, 'ham']
  return [label, 'spam']

# Quick manual check of the classifier on a single message.
pred_text = (
    "our new mobile video service is live. "
    "just install on your phone to start watching."
)
prediction = predict_message(pred_text)
print(prediction)

[1, 'spam']


In [34]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven sample messages and print the verdict.

  Each message is passed to predict_message and element 1 of the returned
  value is compared with the expected 'ham'/'spam' answer. A single mismatch
  marks the whole run as not passed. Nothing is returned; the outcome is
  printed.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, positionally aligned with test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the textual label (index 1) is checked, not the numeric value.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

You passed the challenge. Great job!
