In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

### Perform Imports and Load Data
For this exercise we'll be using the **SMSSpamCollection** dataset from [UCI datasets](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection) that contains more than 5 thousand SMS phone messages.<br>You can check out the [**sms_readme**](../TextFiles/sms_readme.txt) file for more info.

The file is a [tab-separated-values](https://en.wikipedia.org/wiki/Tab-separated_values) (tsv) file with four columns:
> **label** - every message is labeled as either ***ham*** or ***spam***<br>
> **message** - the message itself<br>
> **length** - the number of characters in each message<br>
> **punct** - the number of punctuation characters in each message

In [2]:


# Read the tab-separated SMS corpus into a DataFrame and preview the first rows.
df = pd.read_csv(
    '../TextFiles/smsspamcollection.tsv',
    sep='\t',
)
df.head(5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


### Perform EDA

Checking for missing values

In [3]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [4]:
# List the distinct class labels present in the data.
df.loc[:, 'label'].unique()

array(['ham', 'spam'], dtype=object)

In [10]:
# Class balance: number of messages per label, most frequent first.
df['label'].value_counts(sort=True, ascending=False)

ham     4825
spam     747
Name: label, dtype: int64

In [11]:
# Summary statistics for the `length` column.
df.loc[:, 'length'].describe()

count    5572.000000
mean       80.489950
std        59.942907
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: length, dtype: float64

In [12]:
# Average `length` within each label group.
df['length'].groupby(df['label']).mean()

label
ham      71.482487
spam    138.670683
Name: length, dtype: float64

In [13]:
# Summary statistics for the `punct` column.
df.loc[:, 'punct'].describe()

count    5572.000000
mean        4.177495
std         4.623919
min         0.000000
25%         2.000000
50%         3.000000
75%         6.000000
max       133.000000
Name: punct, dtype: float64

In [14]:
# Average `punct` within each label group.
df.groupby('label')['punct'].agg('mean')

label
ham     3.939896
spam    5.712182
Name: punct, dtype: float64

### Perform Train Test Split

In [22]:
# Features: the two numeric columns used as model inputs.
X = df[['length', 'punct']]
# Target: select the label column as a 1-D Series. A one-column DataFrame
# (df[['label']]) makes scikit-learn emit a DataConversionWarning
# ("y = column_or_1d(y, warn=True)") every time an estimator is fitted.
Y = df['label']

In [23]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)

In [24]:
# Report the (rows, columns) shape of each feature partition.
for caption, features in (('Training Data Shape:', X_train),
                          ('Testing Data Shape: ', X_test)):
    print(caption, features.shape)

Training Data Shape: (3733, 2)
Testing Data Shape:  (1839, 2)


In [21]:
# Use the labels produced by train_test_split. The earlier version of this
# cell read them from `train_data` / `test_data`, which are not defined
# anywhere in the visible notebook and raise NameError on a fresh kernel.
# np.ravel flattens the labels to 1-D arrays, the shape scikit-learn expects
# for `y`; re-running the cell is safe because ravel leaves 1-D input unchanged.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

### Train a Logistic Regression classifier


In [26]:
# Build a default logistic-regression classifier and fit it on the training
# split; fit() returns the estimator itself, so it can be chained.
lr_model = LogisticRegression().fit(X_train, y_train)
# Display the fitted estimator and its parameters.
lr_model

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
# Predict a label for every row of the held-out test features.
y_predicted = lr_model.predict(X=X_test)

In [36]:
# You can make the confusion matrix less confusing by adding labels.
# The table gets its own name so the SMS dataset held in `df` is not
# overwritten and earlier cells can still be re-run. Passing `labels` fixes
# the row/column order so it always matches the captions below.
lr_confusion = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_predicted, labels=['ham', 'spam']),
    index=['ham_actual','spam_actual'],
    columns=['ham_predicted','spam_predicted'])
lr_confusion

Unnamed: 0,ham_predicted,spam_predicted
ham_actual,1551,42
spam_actual,241,5


<font color=green>These results are terrible! More spam messages were confused as ham (241) than correctly identified as spam (5), although a relatively small number of ham messages (42) were confused as spam.</font>

In [38]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
print(
    metrics.classification_report(y_true=y_test, y_pred=y_predicted)
)

              precision    recall  f1-score   support

         ham       0.87      0.97      0.92      1593
        spam       0.11      0.02      0.03       246

    accuracy                           0.85      1839
   macro avg       0.49      0.50      0.48      1839
weighted avg       0.76      0.85      0.80      1839



In [40]:
# Fraction of test messages the logistic-regression model labelled correctly.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=y_predicted)
)

0.8461120174007613


<font color=green>This model performed *worse* (84.6% accuracy) than a classifier that assigned all messages as "ham" would have: 1593 of the 1839 test messages are ham, so that baseline scores 86.6%!</font>

### Train a naïve Bayes classifier:
One of the most common - and successful - classifiers is [naïve Bayes](http://scikit-learn.org/stable/modules/naive_bayes.html#naive-bayes).

In [51]:
# Fit a multinomial naive Bayes classifier on the training split
# (fit() returns the estimator, so construction and fitting are chained).
model_nb = MultinomialNB().fit(X_train, y_train)

# Predict a label for every row of the held-out test features.
y_prediction_nb = model_nb.predict(X=X_test)

  y = column_or_1d(y, warn=True)


In [53]:
# You can make the confusion matrix less confusing by adding labels.
# The table gets its own name so the SMS dataset held in `df` is not
# overwritten and earlier cells can still be re-run. Passing `labels` fixes
# the row/column order so it always matches the captions below.
nb_confusion = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_prediction_nb, labels=['ham', 'spam']),
    index=['ham_actual','spam_actual'],
    columns=['ham_predicted','spam_predicted'])
nb_confusion

Unnamed: 0,ham_predicted,spam_predicted
ham_actual,1583,10
spam_actual,246,0


In [54]:
# Per-class precision, recall and F1 for the naive Bayes predictions.
print(
    metrics.classification_report(y_true=y_test, y_pred=y_prediction_nb)
)

              precision    recall  f1-score   support

         ham       0.87      0.99      0.93      1593
        spam       0.00      0.00      0.00       246

    accuracy                           0.86      1839
   macro avg       0.43      0.50      0.46      1839
weighted avg       0.75      0.86      0.80      1839



In [56]:
# Fraction of test messages the naive Bayes model labelled correctly.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=y_prediction_nb)
)

0.8607939097335509


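<font color=green>Better than logistic regression, but at 86.1% still less accurate than the 86.6% obtained by simply labelling every test message as "ham" - and not a single spam message was identified correctly.</font>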