In [11]:
# Chapter 4: From Linear Regression to Logistic Regression

# Spam filtering example using logistic regression
import pandas as pd
# Load the SMS Spam Collection: a tab-separated file with no header row, so
# pandas assigns integer column names (0 = label, 1 = message text).
# NOTE(review): the class counts recorded further down total 5572 rows; the
# dataset's own description reportedly lists 5574 messages. pandas' default
# quote handling may be merging lines that contain an unbalanced '"' --
# consider quoting=csv.QUOTE_NONE and confirm the row count.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
# Peek at the first five rows to sanity-check the parse.
print(df.head())

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [16]:
# Tally messages per class from the label column (column 0); summing a
# boolean mask counts the rows where the comparison holds.
labels = df[0]
print('Number of spam messages:', (labels == 'spam').sum())
print('Number of ham messages:', (labels == 'ham').sum())

Number of spam messages: 747
Number of ham messages: 4825


In [63]:
# Make some predictions
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Use the public import path: `sklearn.linear_model.logistic` is a private
# module and is not importable in current scikit-learn releases.
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:  # very old scikit-learn only ships the legacy module
    from sklearn.cross_validation import train_test_split, cross_val_score

# Fixed seed so the train/test split -- and therefore every prediction shown
# below -- is the same on each Restart & Run All.
RANDOM_STATE = 42

# Column 1 holds the raw message text, column 0 the 'ham'/'spam' label.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[1], df[0], random_state=RANDOM_STATE)

# Learn the TF-IDF vocabulary/weights from the training messages only, then
# apply that same fitted transform to the test messages (no refit on test).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# Fit a logistic regression classifier with default hyperparameters; the
# fitted estimator is the cell's last expression, so its repr is displayed.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
# Predict a class label for every row of the vectorized test matrix X_test.
predictions = classifier.predict(X_test)

In [73]:
# Print the first ten test messages next to the label predicted for each;
# zip pairs the two sequences position by position.
for label, message in zip(predictions[:10], X_test_raw[:10]):
    print('Prediction: {} Message: {}'.format(label, message))

Prediction: ham Message: Not course. Only maths one day one chapter with in one month we can finish.
Prediction: ham Message: 4 tacos + 1 rajas burrito, right?
Prediction: ham Message: I'm leaving my house now.
Prediction: ham Message: And you! Will expect you whenever you text! Hope all goes well tomo 
Prediction: ham Message: Mah b, I'll pick it up tomorrow
Prediction: ham Message: Sitting ard nothing to do lor. U leh busy w work?
Prediction: ham Message: Thanx 4 the time weÂ’ve spent 2geva, its bin mint! Ur my Baby and all I want is u!xxxx
Prediction: ham Message: Pls come quick cant bare this.
Prediction: ham Message: I don,t think so. You don't need to be going out that late on a school night. ESPECIALLY when the one class you have is the one you missed last wednesday and probably failed a test in on friday
Prediction: ham Message: Sorry pa, i dont knw who ru pa?
