In [11]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsRegressor
from HW2_kd538_knnclass import *

In [12]:
# Column setup for the spam data set: 57 predictor names, grouped as 48 word frequencies,
# 6 character frequencies and 3 capital-run-length statistics. The response column is not listed here.
col_names = (
    ['word_freq_' + token for token in (
        'make address all 3d our over remove internet order mail receive will '
        'people report addresses free business email you credit your font 000 money '
        'hp hpl george 650 lab labs telnet 857 data 415 85 technology 1999 parts '
        'pm direct cs meeting original project re edu table conference'
    ).split()]
    + ['char_freq_' + symbol for symbol in '; ( [ ! $ #'.split()]
    + ['capital_run_length_' + stat for stat in ('average', 'longest', 'total')]
)
# Predictor that contains missing values and is filled in by the imputation step.
CRLA = 'capital_run_length_average'
# Every predictor except CRLA; these are the features used to impute it.
imp_cols = [col for col in col_names if col != CRLA]

def knn_imputer(df, n_neighbors=15):
    """Standardize the predictors of ``df`` in place and kNN-impute missing CRLA values.

    Steps, all applied to ``df`` itself:
      1. Standardize every column in ``imp_cols`` (all predictors except CRLA).
      2. Fit a ``KNeighborsRegressor`` on the rows where CRLA is present and use it
         to predict CRLA for the rows where it is missing.
      3. Standardize the now-complete CRLA column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns in ``imp_cols`` and the ``CRLA`` column.
        It is modified in place.
    n_neighbors : int, default 15
        Number of neighbours used by the regressor (default 'uniform' weighting).

    Returns
    -------
    None

    Notes
    -----
    The frame is standardized with its own statistics, so calling this separately
    on a training and a test frame scales each one independently.
    """
    missing = df[CRLA].isna()

    # Assign whole columns rather than writing through ``.loc[:, cols]``: the scaled
    # values are floats, and replacing the columns avoids an in-place dtype upcast
    # of any integer-typed predictor columns.
    df[imp_cols] = scale(df[imp_cols])

    # Only fit/predict when there is something to impute; predicting on zero rows
    # is rejected by scikit-learn.
    if missing.any():
        known = df.loc[~missing]
        knn_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
        knn_reg.fit(known[imp_cols], known[CRLA])
        df.loc[missing, CRLA] = knn_reg.predict(df.loc[missing, imp_cols])

    # CRLA is scaled last, once its missing values have been filled in.
    df[CRLA] = scale(df[CRLA])
    
# Read the training and test sets, then standardize and impute each frame in place.
df_train = pd.read_csv('spam_train.csv')
df_test = pd.read_csv('spam_test.csv')
knn_imputer(df_train), knn_imputer(df_test)

(None, None)

In [14]:
# kNN predictions on the test set, once without CRLA (knn_pred1) and once with
# every predictor including the imputed CRLA (knn_pred2).
train_labels = df_train['spam'].values
knn_pred1 = knnclass(df_train[imp_cols].values, df_test[imp_cols].values, train_labels)
knn_pred2 = knnclass(df_train[col_names].values, df_test[col_names].values, train_labels)

Finished validating kNN with k = 1 using 15.85199499130249 s
Finished validating kNN with k = 2 using 14.638240098953247 s
Finished validating kNN with k = 3 using 15.665126085281372 s
Finished validating kNN with k = 4 using 14.387913942337036 s
Finished validating kNN with k = 5 using 14.571433067321777 s
Finished validating kNN with k = 6 using 14.677398204803467 s
Finished validating kNN with k = 7 using 13.794611930847168 s
Finished validating kNN with k = 8 using 14.232501029968262 s
Finished validating kNN with k = 9 using 13.881855010986328 s
Finished validating kNN with k = 2 using 44.041000843048096 s
Finished validating kNN with k = 1 using 14.431267023086548 s
Finished validating kNN with k = 2 using 14.893342018127441 s
Finished validating kNN with k = 3 using 14.412198066711426 s
Finished validating kNN with k = 4 using 15.320875883102417 s
Finished validating kNN with k = 5 using 14.919965028762817 s
Finished validating kNN with k = 6 using 14.661801815032959 s
Finished 

In [13]:
# Logistic regression predictions on the test set: first without CRLA, then with all predictors.
lr = LogisticRegression()

lr.fit(df_train[imp_cols], df_train['spam'])
logm_pred1 = lr.predict(df_test[imp_cols])

# The same estimator is refitted, so lr.coef_ below belongs to the all-predictor model.
lr.fit(df_train[col_names], df_train['spam'])
logm_pred2 = lr.predict(df_test[col_names])

# One row per predictor, ordered from the most negative to the most positive coefficient.
coef_table = DataFrame({'Predictors': col_names, 'Coef': lr.coef_[0]})
coef_table = coef_table.sort_values(by='Coef', axis=0)
print(coef_table)

# Judging by coefficient magnitude in the coefficients DataFrame displayed below, the most significant variables are
# 'word_freq_george', 'word_freq_hp', 'capital_run_length_longest', 'word_freq_meeting', 'word_freq_edu' and 'word_freq_cs'.

# It is interesting that 'word_freq_george' has such a strong impact on whether an email is classified as spam,
# while the frequencies of words with technical connotations (hp, cs, edu, conference) also help to distinguish the classes;
# information about capital letters is also highly significant, in line with basic knowledge from sentiment analysis.

        Coef                  Predictors
26 -3.485758            word_freq_george
24 -2.215901                word_freq_hp
41 -1.340466           word_freq_meeting
45 -1.185862               word_freq_edu
40 -1.153077                word_freq_cs
28 -0.963020               word_freq_lab
47 -0.925161        word_freq_conference
25 -0.905587               word_freq_hpl
44 -0.831717                word_freq_re
43 -0.805362           word_freq_project
34 -0.619084                word_freq_85
30 -0.569292            word_freq_telnet
38 -0.458678                word_freq_pm
32 -0.424300              word_freq_data
48 -0.297504                 char_freq_;
1  -0.229850           word_freq_address
39 -0.227290            word_freq_direct
31 -0.220676               word_freq_857
42 -0.151667          word_freq_original
50 -0.138896                 char_freq_[
29 -0.124300              word_freq_labs
37 -0.116096             word_freq_parts
33 -0.111995               word_freq_415
11 -0.089411    

In [16]:
# Combine the imputed CRLA column with the four prediction vectors and export them to CSV.
results_table = DataFrame({
    'capital_run_length_average': df_test[CRLA],
    'knn_pred1': knn_pred1,
    'knn_pred2': knn_pred2,
    'logm_pred1': logm_pred1,
    'logm_pred2': logm_pred2,
})
results_table.to_csv('HW2_kd538_results.csv')