In [2]:
# Bag of Words (BoW) Model

#reading the data
import pandas as pd

data = pd.read_csv("spam.csv",encoding="latin-1")

data = data[['v1', 'v2']]
data.columns = ['label', 'message']

data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Data Preprocessing

import re
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer


corpus=[]
for i in range(0, len(data)):
    review = re.sub('[^a-zA-Z]', ' ', data['message'][i])
    review = review.lower()
    review = review.split()
    
    lemmatizer = WordNetLemmatizer()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    
    review = ' '.join(review)
    corpus.append(review)

print(corpus[:5])


['go jurong point crazy available bugis n great world la e buffet cine got amore wat', 'ok lar joking wif u oni', 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply', 'u dun say early hor u c already say', 'nah think go usf life around though']


In [4]:
# Bow Model Creation

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,ngram_range=(1, 2))

X = cv.fit_transform(corpus).toarray()
y = data['label'].values



In [5]:
cv.vocabulary_

{'go': np.int64(1568),
 'point': np.int64(3344),
 'crazy': np.int64(888),
 'available': np.int64(215),
 'bugis': np.int64(426),
 'great': np.int64(1671),
 'world': np.int64(4877),
 'la': np.int64(2121),
 'cine': np.int64(706),
 'got': np.int64(1643),
 'wat': np.int64(4705),
 'ok': np.int64(2935),
 'lar': np.int64(2137),
 'joking': np.int64(2051),
 'wif': np.int64(4793),
 'oni': np.int64(3002),
 'ok lar': np.int64(2946),
 'free': np.int64(1401),
 'entry': np.int64(1194),
 'wkly': np.int64(4848),
 'comp': np.int64(803),
 'win': np.int64(4803),
 'fa': np.int64(1262),
 'cup': np.int64(906),
 'final': np.int64(1327),
 'tkts': np.int64(4334),
 'st': np.int64(4019),
 'may': np.int64(2447),
 'text': np.int64(4230),
 'receive': np.int64(3541),
 'question': np.int64(3460),
 'std': np.int64(4044),
 'txt': np.int64(4444),
 'rate': np.int64(3491),
 'apply': np.int64(151),
 'free entry': np.int64(1410),
 'entry wkly': np.int64(1197),
 'wkly comp': np.int64(4849),
 'cup final': np.int64(907),
 'std t

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Model Evaluation
y_pred = classifier.predict(X_test)


from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Predictions:", y_pred[:10])
print("Actual:", y_test[:10])

Accuracy: 0.9883408071748879
Predictions: ['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam']
Actual: ['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam']


In [7]:
# taking inpuut form user 

user_input = input("Enter a message: ")
user_input = re.sub('[^a-zA-Z]', ' ', user_input)
user_input = user_input.lower()
user_input = user_input.split()
user_input = [lemmatizer.lemmatize(word) for word in user_input if not word in set(stopwords.words('english'))]
user_input = ' '.join(user_input)

user_input = cv.transform([user_input]).toarray()
pred_result = classifier.predict(user_input)
print(pred_result)

['ham']


In [8]:
#tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000,ngram_range=(1, 2))
X = cv.fit_transform(corpus).toarray()
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
# Model Evaluation for TF-IDF
y_pred = classifier.predict(X_test) 
accuracy = accuracy_score(y_test, y_pred)
print("TF-IDF Accuracy:", accuracy)

print("TF-IDF Predictions:", y_pred[:10])
print("TF-IDF Actual:", y_test[:10])



TF-IDF Accuracy: 0.968609865470852
TF-IDF Predictions: ['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam']
TF-IDF Actual: ['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam']


In [None]:
!pip install gensim
import gensim
from gensim.models import Word2Vec, KeyedVectors
## References: https://stackoverflow.com/questions/46433778/import-googlenews-vectors-negative300-bin

import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

vec_king = wv['king']
vec_king
vec_king.shape
wv['cricket']
wv.most_similar('cricket')
wv.most_similar('happy')
wv.similarity("hockey","sports")
vec=wv['king']-wv['man']+wv['woman']
vec
wv.most_similar([vec])
