#### Loading Laibraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

#### Load & read the data file

In [2]:
df = pd.read_csv('./SMSSpamCollection', delimiter='\t', header=None)
print(df.head())

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
print('Number of spam messages: %s' % df[df[0] == 'spam'][0].count())
print('Number of ham messages: %s' % df[df[0] == 'ham'][0].count())

Number of spam messages: 747
Number of ham messages: 4825


In [5]:
X = df[1].values
y = df[0].values

In [6]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y) 

##### method for text vectorization, which transforms text data into a numerical representation called TF-IDF (Term Frequency-Inverse Document Frequency) vectors.

In [7]:
#  creates an instance of the TfidfVectorizer class with default settings

vectorizer = TfidfVectorizer()

#  fits the TfidfVectorizer on the training data X_train_raw and then transforms the raw text data
# into a numerical matrix X_train

X_train = vectorizer.fit_transform(X_train_raw) 

# transforms the raw text data in X_test_raw into a numerical matrix X_test using the vocabulary
# learned from the training data.

X_test = vectorizer.transform(X_test_raw)

#### fits a logistic regression classifier to the training data X_train along with
#### their corresponding labels y_train

In [8]:
#  creates an instance of the logistic regression classifier with default settings

classifier = LogisticRegression()

# rains the logistic regression classifier on the training data X_train and their corresponding labels y_train

classifier.fit(X_train, y_train)

In [9]:
# trained logistic regression classifier to make predictions on the test data X_test

predictions = classifier.predict(X_test) 

In [10]:
for i, prediction in enumerate(predictions[:5]): #  loop iterates over the first five elements of the predictions array
    
    #  prints out the information for each test sample in a formatted manner
    print('Predicted: %s, message: %s --- ture label: %s' % (prediction, X_test_raw[i], y_test[i]))

Predicted: ham, message: Ok. But i finish at 6. --- ture label: ham
Predicted: ham, message: I need you to be in my strong arms... --- ture label: ham
Predicted: ham, message: No dear i do have free messages without any recharge. Hi hi hi --- ture label: ham
Predicted: ham, message: House-Maid is the murderer, coz the man was murdered on  &lt;#&gt; th January.. As public holiday all govt.instituitions are closed,including post office..understand? --- ture label: ham
Predicted: spam, message: FreeMSG You have been awarded a FREE mini DIGITAL CAMERA, just reply SNAP to collect your prize! (quizclub Opt out? Stop 80122300p/wk SP:RWM Ph:08704050406) --- ture label: spam


In [11]:
# gives us the TF-IDF score of the 2150th term in the second test document

X_test[1, 2150] 

0.0

In [None]:
#

np.where(X_test[1].todense() != 0) 