In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated SMS spam collection into a DataFrame.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')
# Preview five random rows. The sample is seeded (same seed as the
# train/test split) so the preview is identical on every fresh run.
df.sample(5, random_state=7)

Unnamed: 0,label,message,length,punct
5346,ham,"My Parents, My Kidz, My Friends n My Colleague...",130,22
2100,spam,"SMS SERVICES. for your inclusive text credits,...",159,15
4824,ham,:-) :-),7,6
1689,ham,Nan sonathaya soladha. Why boss?,32,2
3742,spam,2/2 146tf150p,13,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer

In [4]:
# Features are the raw message text; the target is the ham/spam label.
X, Y = df['message'], df['label']

# Keep 80% of the rows for training and hold out the rest for evaluation.
# The fixed random_state makes the split repeatable.
XTR, XTE, YTR, YTE = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=7,
)

In [5]:
# Two-step form: learn the vocabulary from the training messages only
# (fit returns the fitted vectorizer), then encode those same messages
# as a document-term matrix of raw token counts.
count_vec = CountVectorizer().fit(XTR)
X_train_counts = count_vec.transform(XTR)

In [6]:
# One-call form: fit_transform learns the vocabulary and returns the count
# matrix in a single step. This refits count_vec on XTR and rebinds
# X_train_counts.
X_train_counts = count_vec.fit_transform(XTR)

In [7]:
# Convert the raw token counts to TF-IDF weights: first learn the inverse
# document frequencies from the training counts, then apply them to the
# same matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [8]:
# Shape is (rows, columns): one row per training message, one column per
# vocabulary term learned by count_vec.
X_train_tfidf.shape

(4457, 7789)

In [9]:
# TfidfVectorizer combines tokenizing/counting and TF-IDF weighting in a
# single estimator, so raw text goes in and TF-IDF features come out.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=XTR)

# Display the shape of the resulting feature matrix.
X_train_tfidf.shape

(4457, 7789)

In [10]:
from sklearn.svm import LinearSVC

# Train a linear support-vector classifier on the TF-IDF training features.
# fit() returns the fitted estimator, so it can be bound in one statement.
clf = LinearSVC().fit(X_train_tfidf, YTR)

# Show the fitted estimator as the cell output.
clf

LinearSVC()

In [11]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed straight
# to fit/predict; each step is a (name, estimator) pair.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(pipeline_steps)

# Fit both steps on the raw training messages and their labels.
text_clf.fit(XTR, YTR)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [12]:
# Predict labels for the held-out messages. The pipeline vectorizes the raw
# text itself, so XTE is passed in untransformed.
y_pred = text_clf.predict(XTE)

In [13]:
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
)

# Print the confusion matrix first, then the per-class
# precision/recall/F1 report, both computed on the held-out labels
# against the pipeline's predictions.
for build_report in (confusion_matrix, classification_report):
    print(build_report(YTE, y_pred))

[[967   1]
 [ 12 135]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       968
        spam       0.99      0.92      0.95       147

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [14]:
# Score the pipeline on the held-out set; for a classifier this is the mean
# accuracy, computed by re-predicting XTE and comparing against YTE.
text_clf.score(XTE,YTE)

0.9883408071748879

In [15]:
# Accuracy computed from the already-stored predictions (y_pred) rather than
# by re-running the pipeline on XTE.
accuracy_score(YTE,y_pred)

0.9883408071748879

In [16]:
# Classify an ad-hoc message. predict expects an iterable of documents, so
# the single string is wrapped in a list. 'YOUR TEXT HERE' is a placeholder
# to replace with the message you want to test.
text_clf.predict(['YOUR TEXT HERE'])

array(['spam'], dtype=object)

In [17]:
# Classify a short conversational message.
# NOTE(review): the double quote in 'Let"s' looks like a typo for an
# apostrophe ("Let's") — confirm the intended sample text.
text_clf.predict(['Let"s go!!'])

array(['ham'], dtype=object)