<h1>Word Embedding in Practice</h1>

In [16]:
## Importing libraries

import nltk # Natural Language Toolkit: supplies the stop-word corpus and stemmer used during text cleaning
import pandas as pd # data frames and CSV loading (pd.read_csv)
import numpy as np # numerical arrays
import warnings # standard-library warning control
warnings.filterwarnings('ignore') # NOTE: silences every warning for the rest of the session, deprecation notices included

## Reading the spam collection data into a pandas DataFrame

In [None]:
# Column names applied to the loaded frame: the class label first, then the message text.
columns = ['lable', 'message']

# The file is read as latin-1 because decoding it with the default codec raises UnicodeDecodeError.
df = pd.read_csv('../Assets/spam.csv', encoding='latin-1')
df.columns = columns

# Preview the first five rows.
df.head()


Unnamed: 0,lable,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
## Importing data preprocessing libraries and downloading the stop-word corpus

import re # regular expressions, used to strip unwanted characters from the messages
from nltk.corpus import stopwords # per-language stop-word lists
from nltk.stem import PorterStemmer # Porter stemmer, reduces words to their stem

nltk.download('stopwords') # fetch the stop-word corpus if it is not already present locally

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
corpus = [] # cleaned messages: one space-joined string of stemmed tokens per row of df

# Loop-invariant helpers, built once instead of once per message (stemmer) or once per word (stop-word set).
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Iterate over the column values directly so the loop does not depend on df having a 0..n-1 index.
for message in df['message']:
    # Replace everything that is not an ASCII letter or digit with a space.
    # The upper-case range must be A-Z: "A-z" also spans [ \ ] ^ _ ` (the ASCII codes between 'Z' and 'a'),
    # which would let those special characters survive the cleaning step.
    review = re.sub("[^a-zA-Z0-9]", " ", message)
    review = review.lower().split() # lowercase, then tokenise on whitespace
    review = [ps.stem(word) for word in review if word not in stop_words] # drop stop words, stem the rest
    review = " ".join(review) # back to a single string, as the vectorizers expect
    corpus.append(review)
    

In [22]:
corpus[:5] # preview the first five cleaned messages

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

## Training the model with BoW (Bag of Words)

In [38]:
## Building the Bag-of-Words feature matrix

from sklearn.feature_extraction.text import CountVectorizer

# Binary (present / absent) features, capped at 2,500 vocabulary entries.
# ngram_range=(2, 2) restricts the vocabulary to two-word sequences; single words are not used.
cv = CountVectorizer(ngram_range=(2, 2), binary=True, max_features=2500)

# Learn the vocabulary from the cleaned corpus and convert the sparse result to a dense array in one step.
X = cv.fit_transform(corpus).toarray()

X.shape # (number of messages, number of features)

(5572, 2500)

In [39]:
## Encoding the target variable

# One-hot encode the label column and keep only the second dummy column as a flat array.
# NOTE(review): dummy columns come out in sorted order, so column 1 is presumably 'spam' — confirm.
y = pd.get_dummies(df['lable']).iloc[:, 1].values

y.shape # one target value per message

(5572,)

In [40]:
## Training a Naive Bayes classifier on the current feature matrix X

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Fit the multinomial Naive Bayes model on the training portion only.
spam_detect_model = MultinomialNB().fit(X_train, y_train)


In [41]:
## Evaluating the Model
# Predict a label for every row of the held-out test set with the model fitted above.
y_pred = spam_detect_model.predict(X_test) # predicted labels, aligned row-for-row with y_test

In [None]:
## Accuracy, confusion matrix and classification report

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

confusion_m = confusion_matrix(y_test, y_pred) # rows: true class, columns: predicted class
classification = classification_report(y_test, y_pred) # per-class precision / recall / f1 as text

# Print the three results in a fixed order: accuracy, confusion matrix, report.
for result in (accuracy_score(y_test, y_pred), confusion_m, classification):
    print(result)

0.9829596412556054
[[946   3]
 [ 16 150]]
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       949
        True       0.98      0.90      0.94       166

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



## Training the model with TF-IDF

In [53]:
## Building the TF-IDF feature matrix

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over single words and two-word sequences (ngram_range=(1, 2)), capped at 2,500 entries.
# binary=True clips the term-frequency part to 0/1 before the IDF weighting is applied.
tv = TfidfVectorizer(ngram_range=(1, 2), binary=True, max_features=2500)

# X is reassigned here and no longer holds the Bag-of-Words matrix.
X = tv.fit_transform(corpus).toarray()

X.shape # (number of messages, number of features)

(5572, 2500)

In [54]:
## Training a Naive Bayes classifier on the TF-IDF features

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Same 80/20 split settings as for the Bag-of-Words run (test_size=0.20, random_state=0).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Rebinds spam_detect_model to a fresh model fitted on the TF-IDF training rows.
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [55]:
## Evaluating the Model
# spam_detect_model now refers to the model fitted on the TF-IDF features, so these are its predictions.
y_pred = spam_detect_model.predict(X_test) # predicted labels, aligned row-for-row with y_test

In [56]:
## Accuracy, confusion matrix and classification report

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

confusion_m = confusion_matrix(y_test, y_pred) # rows: true class, columns: predicted class
classification = classification_report(y_test, y_pred) # per-class precision / recall / f1 as text

# Print the three results in a fixed order: accuracy, confusion matrix, report.
for result in (accuracy_score(y_test, y_pred), confusion_m, classification):
    print(result)

0.9757847533632287
[[948   1]
 [ 26 140]]
              precision    recall  f1-score   support

       False       0.97      1.00      0.99       949
        True       0.99      0.84      0.91       166

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.97      1115

