# Spam 메일 분류 - 머신러닝 기법
- CountVectorizer + NaiveBayes
- TfidfVectorizer + LogisticRegression

In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the whole session. Note that this also hides
# deprecation / FutureWarning messages emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Read the spam dataset (the file name suggests it was already preprocessed)
# and preview the first three rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column - presumably a row
# index that was saved with the CSV; confirm and consider index_col=0.
csv_path = 'data/spam_전처리완료.csv'
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
# Replace every character that is not an ASCII letter or digit with a space,
# then trim surrounding whitespace and lower-case the text.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
# defaults to regex=False, which would treat the pattern as a literal string
# and silently leave all punctuation in place.
df['v2'] = (
    df['v2']
    .str.replace('[^A-Za-z0-9]', ' ', regex=True)
    .str.strip()
    .str.lower()
)

- Train/Test dataset으로 분리

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set. The split is stratified on the
# label column (v1) and uses a fixed seed so it is reproducible.
messages, labels = df['v2'], df['v1']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=2022,
)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

- Case 1) CountVectorizer + NaiveBayes

In [6]:
# Case 1 model: token counts (English stop words removed) fed into a
# multinomial Naive Bayes classifier.
count_step = ('cvect', CountVectorizer(stop_words='english'))
nb_step = ('nb', MultinomialNB())
pipeline1 = Pipeline([count_step, nb_step])

# Hyper-parameter grid for the search; keys follow the
# '<step name>__<parameter>' convention of scikit-learn pipelines.
# Note: the name `params` is reassigned later for the Case 2 pipeline.
params = dict(
    cvect__max_df=[0.9, 0.95],
    cvect__ngram_range=[(1, 1), (1, 2)],
)

In [7]:
# Exhaustive search over `params` with 3-fold cross-validation, ranked by
# accuracy; n_jobs=-1 lets scikit-learn run the fits in parallel.
grid_pipe1 = GridSearchCV(
    estimator=pipeline1,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
# Fit on the training split only; the fitted search object is the cell output.
grid_pipe1.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('cvect',
                                        CountVectorizer(stop_words='english')),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cvect__max_df': [0.9, 0.95],
                         'cvect__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy')

In [8]:
# Parameter combination with the best mean cross-validated accuracy (Case 1).
grid_pipe1.best_params_ 

{'cvect__max_df': 0.9, 'cvect__ngram_range': (1, 2)}

In [9]:
# Score the best Case 1 pipeline found by the search on the held-out test split.
grid_pipe1.best_estimator_.score(X_test, y_test)

0.9854932301740812

- Case 2) TfidfVectorizer + LogisticRegression

In [10]:
# Case 2 model: TF-IDF features (English stop words removed) fed into a
# logistic regression classifier with a fixed random seed.
tfidf_step = ('tvect', TfidfVectorizer(stop_words='english'))
lr_step = ('lr', LogisticRegression(random_state=2022))
pipeline2 = Pipeline([tfidf_step, lr_step])

# Hyper-parameter grid for the Case 2 search; this rebinds `params`, which
# previously held the Case 1 grid.
params = dict(
    tvect__max_df=[0.9, 0.95],
    tvect__ngram_range=[(1, 1), (1, 2)],
    lr__C=[1, 10],
)

In [11]:
grid_pipe2 = GridSearchCV(
    pipeline2, params, scoring='accuracy', cv=3, n_jobs=-1
)
%time grid_pipe2.fit(X_train, y_train)

Wall time: 2.42 s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('lr',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10], 'tvect__max_df': [0.9, 0.95],
                         'tvect__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy')

In [12]:
# Parameter combination with the best mean cross-validated accuracy (Case 2).
grid_pipe2.best_params_

{'lr__C': 10, 'tvect__max_df': 0.9, 'tvect__ngram_range': (1, 2)}

In [13]:
# Score the best Case 2 pipeline found by the search on the held-out test split.
grid_pipe2.best_estimator_.score(X_test, y_test)

0.9777562862669246