# SMS_SPAM_Classifier

In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

### Reading a tab-separated file using pandas

In [2]:
# Location of the tab-separated SMS dataset. Set the SMS_SPAM_TSV environment
# variable to point at your own copy; when it is unset, the original hard-coded
# path is used so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'SMS_SPAM_TSV',
    'C:/Users/Abhinav/Desktop/NLP_dataset/smsspamcollection.tsv',
)
df = pd.read_csv(DATA_PATH, sep='\t')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Number of rows (messages) in the frame.
df.shape[0]

5572

### Checking whether any null values are present

In [5]:
# Per-column count of missing values.
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

### Search for any blank messages

In [6]:
# Collect the index labels of rows whose message is empty or whitespace-only.
# A vectorised check replaces the row-by-row itertuples() loop, which
#   * missed truly empty strings, because ''.isspace() is False,
#   * raised AttributeError on non-string values such as NaN, and
#   * broke as soon as the column count changed (fixed 5-way tuple unpacking).
# Missing values are not treated as blank here; they are covered by the null check.
is_blank = df['message'].str.strip().eq('')
blank = df.loc[is_blank].index.tolist()

In [7]:
# Echo the collected row labels; an empty list means no blank messages were found.
blank

[]

### If any blank messages are present, drop them

In [8]:
# Remove the rows flagged as blank. `df` is rebound rather than mutated with
# inplace=True, and errors='ignore' skips labels that are already gone, so
# re-running this cell no longer raises KeyError when `blank` is non-empty.
df = df.drop(index=blank, errors='ignore')

In [9]:
# Row count after removing blank messages.
df.shape[0]

5572

In [10]:
# Features are the raw message text; the target is the label column.
X, y = df['message'], df['label']

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [12]:
# Two-step pipeline: TF-IDF vectorisation of the text followed by a linear SVM.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [13]:
# Fit the vectoriser and the classifier together on the training split.
text_clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [14]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

### You can pass raw text directly to the pipeline to predict whether a message is spam or ham

In [15]:
# Label every held-out message, then echo the resulting array.
prediction = text_clf.predict(X=X_test)
prediction

array(['spam', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

### Confusion Matrix

In [22]:
# Pin the class order explicitly instead of relying on confusion_matrix's
# default label ordering, and derive the captions from that same list so the
# row/column names can never drift out of step with the matrix contents.
# Rows are the actual classes, columns the predicted classes.
class_order = ['ham', 'spam']
class_captions = [name.capitalize() for name in class_order]  # ['Ham', 'Spam']
pd.DataFrame(
    confusion_matrix(y_test, prediction, labels=class_order),
    index=class_captions,
    columns=class_captions,
)

Unnamed: 0,Ham,Spam
Ham,1455,2
Spam,29,186


### Classification Report

In [17]:
# The report is a multi-line string, so print() is used to render its line breaks.
print(classification_report(y_true=y_test, y_pred=prediction))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1457
        spam       0.99      0.87      0.92       215

    accuracy                           0.98      1672
   macro avg       0.98      0.93      0.96      1672
weighted avg       0.98      0.98      0.98      1672



### Accuracy of Model

In [21]:
# Accuracy on the held-out split, expressed as a percentage.
accuracy_score(y_true=y_test, y_pred=prediction) * 100

98.14593301435407

### Example of Ham message

In [19]:
# predict() expects an iterable of documents, so the single message is wrapped in a list.
text_clf.predict(
    [
        'Hey, i will meet you after an hour kind of busy right now.',
    ]
)

array(['ham'], dtype=object)

### Example of Spam message

In [20]:
# predict() expects an iterable of documents, so the single message is wrapped in a list.
text_clf.predict(
    [
        'Congratulations! you have been selected as a winner. Text WON to 55321 congratulations free entry to the contest.',
    ]
)

array(['spam'], dtype=object)