### Use TF-IDF technique to determine the spam messages

In [20]:
import pandas as pd

def load_data(filepath):
    data = pd.read_csv(filepath, usecols = [0,1], encoding='latin-1')
    data.columns = ['label', 'content']
    return data

file = r'C:\\Users\USER\\Desktop\\Text-Mining\\spam_genuine_classification/spam.csv'
data = load_data(file)


In [21]:
# change the label for convenience
for i in range(data.shape[0]):
    if data.iloc[i].label == 'ham':
        data.iloc[i].label = 'genuine'

In [22]:
# split training / test set
import numpy as np

def split_data(data):
    n = int(data.shape[0])
    tmp_train = (np.random.rand(n) >= 0.5)
    return data.iloc[np.where(tmp_train == True)[0]], data.iloc[np.where(tmp_train == False)[0]]

train, test = split_data(data)

#### calculate which terms are important from training set

In [23]:
import re

# size_table: feature matrix size
# ignore: length of the characters less than it will be ignored

def gen_key_list(data, size_table = 200, ignore = 3):
    spam_raw = dict()
    genuine_raw = dict()
    IDF = dict()
    
    for i in range(data.shape[0]):
        finds = re.findall('[A-Za-z]+', data.iloc[i].content)
        if data.iloc[i].label == 'spam':
            for find in finds:
                if len(find) < ignore:
                    continue
                find = find.lower()
                try:
                    spam_raw[find] = spam_raw[find] + 1
                except:
                    spam_raw[find] = spam_raw.get(find, 1)
                    genuine_raw[find] = genuine_raw.get(find, 0)
        else:
            for find in finds:
                if len(finds) < ignore:
                    continue
                find = find.lower()
                try:
                    genuine_raw[find] = genuine_raw[find] + 1
                except:
                    genuine_raw[find] = genuine_raw.get(find, 1)
                    spam_raw[find] = spam_raw.get(find, 0)
        
        word_set = set()
        for find in finds:
            if len(find) < ignore: continue
            find = find.lower()
            if not(find in word_set):
                try:
                    IDF[find] = IDF[find] + 1
                except:
                    IDF[find] = IDF.get(find, 1)
            word_set.add(find)
            
    word = pd.DataFrame(list(zip(genuine_raw.keys(), genuine_raw.values(), spam_raw.values(), IDF.values())))
    word.columns = ['keyword','genuine','spam','IDF']
    word['genuine'] = word['genuine'].astype('float')/train[train['label']=='genuine'].shape[0]
    word['spam'] = word['spam'].astype('float')/train[train['label']=='spam'].shape[0]
    word['IDF'] = np.log10(word.shape[0]/word['IDF'].astype('float'))
    word['genuine_IDF'] = word['genuine']* word['IDF']
    word['spam_IDF'] = word['spam']* word['IDF']
    word['diff'] = word['spam_IDF'] - word['genuine_IDF']
    selected_spam_key = word.sort_values('diff', ascending= False)
    keyword = dict()
    i = 0
    for word in selected_spam_key.head(size_table).keyword:
        keyword.update({word.strip():i})
        i += 1
    return keyword
        

In [24]:
# build a tabu list based on the training data
size_table = 300      # how many features are used to classify spam
word_len_ignored = 3  # ignore those words shorter than this variable
keyword = gen_key_list(data, size_table, word_len_ignored)

#### Turn the training set and test set to feature vector

In [29]:
def convert_content(content, keyword):
    m = len(keyword)
    res = np.int_(np.zeros(m))
    finds = re.findall('[A-Za-z]+', content)
    for find in finds:
        find = find.lower()
        try:
            i = keyword[find]
            res[i] = 1
        except:
            continue
    return res

In [30]:
def raw2feature(train, test, keyword):
    n_train = train.shape[0]
    n_test = test.shape[0]
    m = len(keyword)
    x_train = np.zeros((n_train, m))
    x_test = np.zeros((n_test, m))
    y_train = np.int_(train.label == 'spam')
    y_test = np.int_(test.label == 'spam')
    
    for i in range(n_train):
        x_train[i,:] = convert_content(train.iloc[i].content, keyword)
    
    for i in range(n_test):
        x_test[i,:] = convert_content(test.iloc[i].content, keyword)
    
    return [x_train,y_train],[x_test,y_test]

train, test = raw2feature(train, test, keyword)

#### Train classifier by feature data

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB 

def learn(train):
    model_NB = BernoulliNB()
    model_NB.fit(train[0], train[1])
    Y_hat_NB = model_NB.predict(train[0])
    
    model_RF = RandomForestClassifier(n_estimators = 10, max_depth = None, min_samples_split = 2, random_state = 0)
    model_RF.fit(train[0], train[1])
    Y_hat_RF = model_RF.predict(train[0])
    
    n = np.size(train[1])
    print('Training Accuarcy NBclassifier : {:.2f}％'.format(sum(np.int_(Y_hat_NB==Train[1]))*100./n))
    print('Training Accuarcy RF: {:.2f}％'.format(sum(np.int_(Y_hat_RF==Train[1]))*100./n))
    return model_NB,model_RF


In [33]:
model_NB,model_RF = learn(train)

NameError: name 'Train' is not defined