In [8]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split as split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

In [9]:
spam = pd.read_csv( "spambase.csv", index_col = 0)

# Display the dataframe
spam.head()

Unnamed: 0_level_0,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
word_freq_make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,0.21,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,0.38,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,0.31,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [10]:
print(spam.shape)

(4601, 57)


Our data includes 4601 e-mails (rows) and 57 features (columns). Its features are characterized as follows:

* **word_freq_address** - percentage of words in the e-mail that match ADDRESS.
* **char_freq_#**  - percentage of characters in the e-mail that match the symbol '#'.
* **capital_run_length_average** - average lenth of uninterrupted sequences of capital letters.
* **capital_run_length_longest** - length of longest uninterrupted sequence of catipal letters.
* **capital_run_length_total** - total number of capital letters in the email.

In [11]:
# show information about the data
spam.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 4601 entries, 0.0 to 0.0
Data columns (total 57 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_address           4601 non-null   float64
 1   word_freq_all               4601 non-null   float64
 2   word_freq_3d                4601 non-null   float64
 3   word_freq_our               4601 non-null   float64
 4   word_freq_over              4601 non-null   float64
 5   word_freq_remove            4601 non-null   float64
 6   word_freq_internet          4601 non-null   float64
 7   word_freq_order             4601 non-null   float64
 8   word_freq_mail              4601 non-null   float64
 9   word_freq_receive           4601 non-null   float64
 10  word_freq_will              4601 non-null   float64
 11  word_freq_people            4601 non-null   float64
 12  word_freq_report            4601 non-null   float64
 13  word_freq_addresses         46

The data has 1 categorical, and 56 continuous variables.    

The data doesn't have missing values.

📌 Email **spam:** yes=1, no=0.

In [12]:
# Split dataset into training (70%) and testing (30%)
spam_train, spam_test = split(spam, train_size = 0.7, random_state = 1313) 
spam_train.head()

Unnamed: 0_level_0,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
word_freq_make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,1.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.55,0.0,0.0,0.0,0.0,1.333,5,28,0
0.0,0.0,0.23,0.0,0.46,0.0,0.0,0.0,0.23,0.0,0.0,...,0.0,0.113,0.0,0.09,0.0,0.203,2.43,121,666,0
0.0,0.0,0.36,0.0,0.36,0.0,0.0,0.0,0.0,0.0,0.0,...,0.279,0.767,0.139,0.0,0.0,0.0,3.722,20,268,0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.25,2,5,0
0.0,0.0,0.56,0.0,0.08,0.16,0.0,0.0,0.0,0.16,0.0,...,0.164,0.505,0.0,0.01,0.021,0.0,2.729,55,1122,0


In [13]:
# Create target
X = spam_train.drop(['spam'], axis = 1)
y = spam_train['spam']

X_test = spam_test.drop(['spam'], axis = 1)
y_test = spam_test['spam']

### Default metric (Euclidean)

In [14]:
# Create KNN Classifier model for k = 5
k = 5
spam_clf = KNeighborsClassifier(n_neighbors = k)

# Train the model using the training sets
spam_clf.fit(X, y)

print(spam_clf.score(X, y))

0.8680124223602484


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Evaluation Model

In [15]:
# Predict the response for train dataset
train_pred_1 = spam_clf.predict(X)

# Confusion matrix
cm = confusion_matrix(y_true = y, y_pred = train_pred_1)

print(pd.DataFrame(cm, index = ['Not-spam', 'Spam'], columns = ['Not-spam', 'Spam']))
print('-----------------------------------------------------')
print(classification_report(y_true = y, y_pred = train_pred_1)) 

          Not-spam  Spam
Not-spam      1763   194
Spam           231  1032
-----------------------------------------------------
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      1957
           1       0.84      0.82      0.83      1263

    accuracy                           0.87      3220
   macro avg       0.86      0.86      0.86      3220
weighted avg       0.87      0.87      0.87      3220



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [16]:
# Create Logistic Regression Classifier model
spam_clf = LogisticRegression()

# Fit the model using the training set
spam_clf.fit(X, y)

print(spam_clf.score(X, y)) 

0.9226708074534161


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
