In [49]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords

In [50]:
# Load only the first two columns of the CSV; the rendered frame shows them as
# `label` and `text`.
# NOTE(review): the preview contains a U+FFFD replacement character in one message,
# so the file's encoding may not match the reader's default - confirm.
data = pd.read_csv('spam.csv',usecols=[0,1])
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#### Preprocessing

In [51]:
# The English stop-word list ships as a separate NLTK corpus. On an environment
# where it has not been installed yet, the first access raises LookupError, so
# fetch it on demand and retry; when it is already present no network call is made.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
def preprocess(message):
    """Lower-case a message, split it on whitespace and drop stop words.

    Tokens are compared against the module-level ``stop_words`` set exactly as
    they come out of ``str.split``, so punctuation attached to a word
    (for example ``'point,'``) is kept as part of the token.

    Args:
        message: Raw message text (a ``str``).

    Returns:
        list: The tokens that are not stop words, in their original order.
    """
    kept_tokens = []
    for token in message.lower().split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Sanity check: tokenise the text in the first row (row 0, column 1) and display the result.
message = preprocess(data.iat[0, 1])
message

['go',
 'jurong',
 'point,',
 'crazy..',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet...',
 'cine',
 'got',
 'amore',
 'wat...']

#### Loading the Word2Vec model

In [52]:
# Load the pretrained 'word2vec-google-news-300' embeddings through gensim's
# downloader API. The rest of the notebook uses this object via `word in model`,
# `model[word]` and `model.vector_size`.
# NOTE(review): presumably a large download on first run that gensim caches
# locally afterwards - confirm before relying on Restart & Run All.
model = api.load('word2vec-google-news-300')

#### Converting messages to vectors

In [68]:
def avgWord2Vec(message,model):
    """Average the embedding vectors of the tokens the model knows.

    Args:
        message: Iterable of tokens (as produced by ``preprocess``).
        model: Embedding lookup supporting ``token in model``, ``model[token]``
            and a ``vector_size`` attribute.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all tokens found in
        ``model``; a zero vector of length ``model.vector_size`` when none of
        the tokens are found (including an empty message).
    """
    known_vectors = []
    for token in message:
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: fall back to an all-zero vector of the right length.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)
    
# Tokenise every message once. Column 1 is the message text (the same column
# the first-row check above reads with a positional index).
messages = [preprocess(text) for text in data.iloc[:, 1]]

# One row per message. The width is taken from the embedding model instead of a
# hard-coded 300 so it always matches the vectors avgWord2Vec() returns.
X = np.zeros((len(messages), model.vector_size))
for i, tokens in enumerate(messages):
    X[i] = avgWord2Vec(tokens, model)

Y = data['label']
print('\n=== Feature Vector ===\n')
print(X)
print('\n=== Labels ===\n')
print(Y)


=== Feature Vector ===

[[ 0.0038269   0.04334106  0.00832519 ... -0.08195801 -0.07321777
   0.03902588]
 [-0.0387001   0.08392334  0.07502747 ... -0.16259766 -0.06219482
   0.03955078]
 [ 0.00078852 -0.0298785  -0.0717199  ... -0.12769902 -0.09681567
   0.01087705]
 ...
 [ 0.15901184  0.0958252  -0.00756836 ...  0.10864258  0.05224609
  -0.01599121]
 [ 0.09965633  0.02711839 -0.01533156 ... -0.06056565 -0.00980377
  -0.04541016]
 [ 0.13818359  0.10009766  0.08813477 ... -0.04052734 -0.14939117
  -0.05102539]]

=== Labels ===

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object


#### Train Test Split

In [69]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the split is not stratified by label - consider whether the
# ham/spam ratio should be preserved in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

#### Training and Accuracy of the Model

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a default-configured logistic regression on the training vectors, then
# report plain accuracy of its predictions on the held-out rows.
classifier_model = LogisticRegression().fit(X_train, Y_train)
Y_Pred = classifier_model.predict(X_test)
print("Accuracy of the Model : ", accuracy_score(Y_test, Y_Pred))

Accuracy of the Model :  0.9381165919282511


#### Testing on new Inputs

In [73]:
def predict_message_class(model,w2v_model,message):
    """Classify a single raw message.

    The text goes through the same steps as the training data: ``preprocess``
    to tokens, ``avgWord2Vec`` to one averaged vector, then the classifier.

    Args:
        model: Fitted classifier exposing ``predict`` (note: inside this
            function the name refers to the classifier, not the embeddings).
        w2v_model: Embedding lookup passed through to ``avgWord2Vec``.
        message: Raw message text.

    Returns:
        The first (and only) element of ``model.predict`` for this message.
    """
    tokens = preprocess(message)
    features = avgWord2Vec(tokens, w2v_model)
    # predict() is given a 2-D batch, so present the vector as a single row.
    single_row = features.reshape(1, -1)
    predictions = model.predict(single_row)
    return predictions[0]

In [74]:
# Hand-written example of an ordinary message, run through the trained classifier.
message1 = "Hey, are we still on for the meeting tomorrow at 10 AM? Let me know if anything changes. Cheers!"
predict_message_class(classifier_model,model,message1)

'ham'

In [75]:
# Hand-written example of a prize/link style message, run through the trained classifier.
message2 = "Congratulations! You've won a free iPhone. Click here to claim your prize now: http://scam-link.com"
predict_message_class(classifier_model,model,message2)

'spam'