### `Spam Or Ham`

#### Data Loading

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import urllib.request
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv',encoding='latin1')

In [3]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])   #Ham 이 0, spam 이 1
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


### Data preprocessing

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5572 non-null   object
 1   isSpam  5572 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [5]:
#결측치
data.isnu=ll().values.any()

False

In [6]:
#중복되는것 제거
data.drop_duplicates(subset=['text'], inplace=True)

In [7]:
data

Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will Ì_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [8]:
import re

def preprocess(string: str, *args, **kwargs) -> str:
    from nltk.stem.porter import PorterStemmer #어간 추출
    from nltk.corpus import stopwords

    string = data.text
    string = string.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress') #이메일 한 번에 처리
    string = string.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress') #웹주소 한 번에 처리
    string = string.str.replace(r'£|\$', 'moneysymb') #이런건 왜 나올까
    string = string.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr') #휴대폰 번호 한 번에 처리
    string = string.str.replace('[^a-zA-Z]', ' ') #영어 말고다 버려
    string= string.str.lower() #소문자
    
    ##불용어 제거
    stop_words = set(stopwords.words('english'))
    string = string.apply(lambda x: ' '.join(word for word 
                                             in x.split() if word not in stop_words))
    ###어간만!              
    ps = PorterStemmer()
    final_processed = string.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
    
    print(final_processed)
    return final_processed

In [9]:
final_processed = preprocess(data.text)
final_processed[0] #check

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nd time tri contact u u moneysymb pound prize ...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: text, Length: 5169, dtype: object


'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

### Tokenizing

In [10]:
X_data = data.text
y_data = data.isSpam

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_data) # 5169개의 행을 가진 X의 각 행에 토큰화를 수행
sequences = tokenizer.texts_to_sequences(X_data) # 단어를 숫자값, 인덱스로 변환하여 저장

In [12]:
sequences[0]

[47,
 433,
 4013,
 780,
 705,
 662,
 64,
 8,
 1202,
 94,
 121,
 434,
 1203,
 142,
 2712,
 1204,
 68,
 57,
 4014,
 137]

## Padding

In [13]:
len(sequences[0]) == len(sequences[1])

False

In [14]:
lst = []
for i in range(data.shape[0]):
    x = len(sequences[i])
    lst.append(x)
    

In [15]:
max(lst)

189

In [16]:
X_data

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5169, dtype: object

In [17]:
# 전체 데이터셋의 길이는 max_len으로 맞춥니다.
seq = pad_sequences(sequences, maxlen = 189)

In [18]:
seq.shape

(5169, 189)

In [19]:
seq

array([[   0,    0,    0, ...,   57, 4014,  137],
       [   0,    0,    0, ...,  435,    6, 1767],
       [   0,    0,    0, ...,  707,  437, 4015],
       ...,
       [   0,    0,    0, ...,   99,  240, 8919],
       [   0,    0,    0, ...,  196,   12,   53],
       [   0,    0,    0, ...,    2,   63,  283]], dtype=int32)

## `Vocab Size`

In [27]:
word_to_index = tokenizer.word_index

In [21]:
vocab_size = len(word_to_index) + 1 #항상 0번 단어 있다는 것 생각해야함. 
print('단어 집합의 크기: {}'.format((vocab_size)))

단어 집합의 크기: 8921


## Train Test Split

In [22]:
from sklearn.model_selection import train_test_split
X, y = seq, data['isSpam']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=0.1)
    
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    

(4652, 189) (4652,) (517, 189) (517,)


### RNN

In [23]:
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Sequential

In [24]:
model = Sequential()
model.add(Embedding(vocab_size, 32)) # 임베딩 벡터의 차원은 32
model.add(SimpleRNN(32)) # RNN 셀의 hidden_size는 32
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=4, batch_size=64, validation_split=0.2)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
