In [60]:
# import libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix,classification_report



In [4]:
# Read the tab-separated SMS dataset and preview its first rows.
# NOTE(review): the path is Colab-specific (/content/...); adjust it when
# running this notebook elsewhere.
data = pd.read_csv(
    "/content/smsspamcollection.tsv",
    sep="\t",
)
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [None]:
# Previously we did classification using the length and punct columns;
# now let's use the message text itself.

In [7]:
# Count the missing values in each column.
data.isna().sum()

Unnamed: 0,0
label,0
message,0
length,0
punct,0


In [9]:
# Count how many messages carry each label.
data["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [13]:
# Feature: the raw message text.
X = data["message"]
X.head()

Unnamed: 0,message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Target: the ham/spam label of each message.
y = data["label"]
y.head()

Unnamed: 0,label
0,ham
1,ham
2,spam
3,ham
4,ham


In [17]:
# Hold out 33% of the messages for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)



In [18]:
# Count vectorization

# Text pre-processing, tokenizing, and optional stop-word filtering are all
# built into CountVectorizer, which builds a dictionary of features
# (the vocabulary) and transforms documents into feature vectors.


In [23]:
# Bag-of-words vectorizer with default settings.
count_vectorizer = CountVectorizer()

In [29]:
# Learn the vocabulary from the training messages and convert them to a
# document-term count matrix in a single step.
#
# fit_transform(X_train) is shorthand for the two-step form:
#   count_vectorizer.fit(X_train)
#   X_train_counts = count_vectorizer.transform(X_train)

X_train_counts=count_vectorizer.fit_transform(X_train)
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [31]:
# Number of training messages (X_train is still the raw text, not the
# sparse count matrix).

X_train.shape

# X_train holds 3733 messages; the count matrix built from them
# (X_train_counts) has one column for each of the 7082 vocabulary terms.




(3733,)

In [32]:
# Shape of the count matrix built from X_train.
X_train_counts.shape

(3733, 7082)

In [None]:
# The next step is to transform the counts into frequencies with TF-IDF.
# Then we combine both steps with TfidfVectorizer, train a classifier, and
# build a pipeline.



In [36]:
# Transformer that re-weights raw term counts as TF-IDF values.
tfidf_transformer = TfidfTransformer()

In [38]:
# Fit the TF-IDF weights on the training counts and apply them.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)



In [39]:
# Shape of the TF-IDF matrix held in X_train_tfidf.
X_train_tfidf.shape

(3733, 7082)

In [41]:
# Single-step TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

In [44]:
# This combines both count vectorization and the TF-IDF transformation.
# Rebinds X_train_tfidf, replacing the matrix produced earlier by
# tfidf_transformer with the one built directly from the raw text.
X_train_tfidf = vectorizer.fit_transform(X_train)



In [45]:
# train a classifier



In [49]:
# Fit a linear SVM on the TF-IDF features of the training messages.
# random_state seeds the solver's pseudo-random number generation (see the
# scikit-learn LinearSVC docs) so repeated runs give the same fit; 42 matches
# the seed already used for the train/test split.
model = LinearSVC(random_state=42)
model.fit(X_train_tfidf, y_train)

In [None]:
# So far only the training set has been vectorized (the vocabulary was
# built from X_train).

# To evaluate on the test set, we would have to repeat the same
# vectorization steps on it by hand before predicting.

# That is tedious and error-prone when the preprocessing is long.

# We can use the Pipeline class, which behaves like a compound classifier:
# it performs both the vectorization and the classification.

# So instead of manually applying the fitted vectorizer to the test data
# (transform only, never fit_transform, on test data), we combine all the
# steps in one single pipeline.



In [53]:
# Chain TF-IDF vectorization and the linear SVM so that raw text can be
# passed straight to fit/predict.
# random_state seeds the LinearSVC solver's pseudo-random number generation
# (see the scikit-learn docs) so repeated runs give the same model; 42 matches
# the seed already used for the train/test split.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("model", LinearSVC(random_state=42)),
])



In [55]:
# Train every pipeline step on the raw training messages and their labels.
pipeline.fit(X_train, y_train)


In [58]:
# Predict labels for the held-out messages; the raw text goes straight into
# the pipeline.
pred = pipeline.predict(X_test)
pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [61]:
# Confusion matrix of the test labels (first argument) against the
# predictions (second argument).
cm = confusion_matrix(y_test, pred)
cm

array([[1586,    7],
       [  12,  234]])

In [69]:
# Per-class and averaged precision/recall/F1, returned as a nested dict
# because output_dict=True.
report = classification_report(y_test, pred, output_dict=True)
report


{'ham': {'precision': 0.9924906132665833,
  'recall': 0.9956057752667922,
  'f1-score': 0.9940457536822312,
  'support': 1593.0},
 'spam': {'precision': 0.970954356846473,
  'recall': 0.9512195121951219,
  'f1-score': 0.9609856262833676,
  'support': 246.0},
 'accuracy': 0.989668297988037,
 'macro avg': {'precision': 0.9817224850565281,
  'recall': 0.973412643730957,
  'f1-score': 0.9775156899827995,
  'support': 1839.0},
 'weighted avg': {'precision': 0.989609743729146,
  'recall': 0.989668297988037,
  'f1-score': 0.9896233549110945,
  'support': 1839.0}}

In [70]:
# Tabulate the metrics with one row per class/average.
# The table is bound to a new name instead of overwriting `report`: this cell
# then always starts from the dict, whereas `report = ...transpose()` would
# flip an already-converted table back when the cell is run a second time.
report_df = pd.DataFrame(report).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
ham,0.992491,0.995606,0.994046,1593.0
spam,0.970954,0.95122,0.960986,246.0
accuracy,0.989668,0.989668,0.989668,0.989668
macro avg,0.981722,0.973413,0.977516,1839.0
weighted avg,0.98961,0.989668,0.989623,1839.0


In [None]:
# The model gave better results with vectorization.

# Accuracy is about 99% (0.9897 on the test set).


In [71]:
# Sanity check on an ordinary conversational message.
pipeline.predict(
    ['My name is malav joshi. How are you?']
)

array(['ham'], dtype=object)

In [75]:
# Sanity check on a lottery-style message.
# Adjacent string literals are joined by Python, so the message no longer
# contains the run of indentation spaces that the backslash continuation
# inside the original literal embedded. Only whitespace between words changes;
# the words themselves are untouched.
pipeline.predict([
    'Please provide your details for lottery. '
    'please text "WINNER" '
])



array(['spam'], dtype=object)