In [32]:
###IMPORTING

In [33]:
import numpy as np
import pandas as pd

In [34]:
###DATA-PREPROCESSING

In [35]:
# Load the SMS dataset; the file is tab-separated, hence the explicit separator.
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)

In [36]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [37]:
# Counts how often each distinct (label, message, length, punct) row occurs,
# so the result highlights duplicated messages rather than ham/spam totals.
# NOTE(review): if the class balance was the goal, df['label'].value_counts()
# is the call to use — confirm intent.
df.value_counts()

label  message                                                                                                                                                                                                                                length  punct
ham    Sorry, I'll call later                                                                                                                                                                                                                 22      2        30
       I cant pick the phone right now. Pls send a message                                                                                                                                                                                    51      1        12
       Ok...                                                                                                                                                                                                                            

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Features are the raw message text; the target is the label column.
x, y = df['message'], df['label']

In [40]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [41]:
###TEXT-PROCESSING-AND-MODEL-TRAINING

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [43]:
from sklearn.pipeline import Pipeline
# Two-stage pipeline: TF-IDF vectorisation of the text, then a linear SVM classifier.
text_clf = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ],
    verbose=False,
)

In [44]:
# Fit the vectoriser and the classifier together on the training split.
text_clf.fit(X=x_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [45]:
###PREDICTING

In [46]:
# Predicted labels for the held-out messages.
predictions = text_clf.predict(X=x_test)

In [47]:
###CONFUSION_MATRIX-AND-ACCURACY

In [48]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

In [49]:
# Take the label order from the fitted pipeline instead of hard-coding
# ['ham', 'spam'], and pass the same order to confusion_matrix so the matrix
# rows/columns and the displayed axis labels can never disagree.
# Rows are the true labels, columns are the predicted labels.
class_labels = list(text_clf.classes_)
pd.DataFrame(
    confusion_matrix(y_test, predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

Unnamed: 0,ham,spam
ham,1445,3
spam,10,214


In [50]:
# classification_report returns preformatted text, so print it to keep the layout.
report_text = classification_report(y_test, predictions)
print(report_text)

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [51]:
# Fraction of held-out messages whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

0.9922248803827751

In [52]:
###WORKING
# Hand-written sample messages for a manual check of the trained classifier:
# one ordinary conversational message and one prize-claim message.
text1 = [
    "Hey, how's your Machine Learning journey going on? I wish you luck!",
]
text2 = [
    "Congratulations! You've won 1 Crore Rupees. Text WON to 42425 and claim your reward.",
]

In [54]:
# Classify the first sample message.
text_clf.predict(X=text1)

array(['ham'], dtype=object)

In [55]:
# Classify the second sample message.
text_clf.predict(X=text2)

array(['spam'], dtype=object)