<a href="https://colab.research.google.com/github/uknowsj/Capstone_team2/blob/master/model_class%EC%98%A4%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import pandas as pd
from pandas import DataFrame as df

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

#Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model #모델 저장

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Utility
import re
import numpy as np
import time #수행시간 측정
from google.colab import files #colab에 모델 save,load

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# =============== Settings =============== #

# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]  # column names handed to pd.read_csv
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8  # fraction of rows kept for training (test_size is 1 - TRAIN_SIZE)
MAX_LEB = 50  # NOTE(review): name looks like a typo of MAX_LEN; left unchanged because other cells may reference it
VOCAB_SIZE = 400000

# TEXT CLEANING
# Raw string: "\S" is a regex escape, not a Python string escape, and a non-raw
# literal triggers an invalid-escape warning. The pattern value itself is unchanged.
# Alternatives matched: @mentions, http(s) links, and runs of non-alphanumeric characters.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Preprocessing helpers
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

# KERAS
SEQUENCE_LENGTH = 300  # NOTE(review): the padding cell uses a length of 50, not this value - confirm which is intended
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)  # presumably score cut-offs between the three sentiments; usage not visible here

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read
from google.colab import drive
drive.mount('/content/gdrive') # add force_remount=True to force a remount
my_path='/content/gdrive/My Drive/Colab Notebooks/'  # base directory prepended to data file names

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# 모델 학습에 사용될 데이터 처리

In [None]:
# Load the training data (the CSV has no header row, so column names are supplied)
dataset = pd.read_csv(
    f"{my_path}train.csv",
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
print(dataset.shape)  # 1600000,6

(1600000, 6)


In [None]:
# Numeric label -> sentiment name
decode_map = {
    0: NEGATIVE,
    2: NEUTRAL,
    4: POSITIVE,
}


def decode_sentiment(label):
    """Return the sentiment name for a numeric label.

    The label is coerced with int() before the lookup; a value that is not a
    key of decode_map raises KeyError.
    """
    key = int(label)
    return decode_map[key]

In [None]:
%%time
dataset.target = dataset.target.apply(lambda x: decode_sentiment(x))

CPU times: user 526 ms, sys: 4.6 ms, total: 531 ms
Wall time: 537 ms


In [None]:
# Text preprocessing for the training data
_STOP_WORD_SET = None  # set copy of stop_words, built on the first preprocess() call


def preprocess(text, stem=False):
    """Clean one text and drop English stop words.

    The text is converted to str, lower-cased, every match of
    TEXT_CLEANING_RE is replaced by a space, and the result is split on
    whitespace. Tokens found in stop_words are discarded.

    Args:
        text: value to clean; it is passed through str() first.
        stem: when True, each kept token is stemmed with the module-level stemmer.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    global _STOP_WORD_SET
    if _STOP_WORD_SET is None:
        # Membership tests on the stop_words list scan it linearly for every
        # token; build a set once and reuse it. Re-running this cell resets it.
        _STOP_WORD_SET = frozenset(stop_words)

    # Remove link,user and special characters
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = [token for token in cleaned.split() if token not in _STOP_WORD_SET]
    if stem:
        kept = [stemmer.stem(token) for token in kept]
    return " ".join(kept)

In [None]:
# Run the text preprocessing over every row
dataset["text"] = dataset["text"].apply(preprocess)

In [None]:
# Hold out (1 - TRAIN_SIZE) of the rows for testing; the fixed seed keeps the split repeatable
train, test = train_test_split(
    dataset,
    random_state=42,
    test_size=1-TRAIN_SIZE,
)
print("TRAIN size:", len(train))
print("TEST size:", len(test))

TRAIN size: 1280000
TEST size: 320000


In [None]:
# Fit the tokenizer on the training texts only, then turn both splits into
# lists of integer token ids.
vocab_size = VOCAB_SIZE  # reuse the settings constant instead of repeating the literal 400000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(train.text)
x_train = tk.texts_to_sequences(train.text)
x_test = tk.texts_to_sequences(test.text)

In [None]:
# Sentiment names found in the training split, with NEUTRAL appended
labels = [*train.target.unique().tolist(), NEUTRAL]
print(labels)

# LabelEncoder maps each sentiment string to an integer class id
encoder = LabelEncoder()
encoder.fit(train.target.tolist())

# Encode both splits and reshape each result into a single column (n, 1)
y_train = encoder.transform(train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(test.target.tolist()).reshape(-1, 1)

['POSITIVE', 'NEGATIVE', 'NEUTRAL']


In [None]:
# Bring every sequence to exactly MAX_LEB (= 50) token ids; shorter sequences
# get padding appended at the end (padding='post'). pad_sequences already
# returns a NumPy array, so no extra np.array() wrapper is needed.
X_train = pad_sequences(x_train, maxlen=MAX_LEB, padding='post')
# The test split must be padded the same way before it can be fed to the model;
# no padded test array is created in the cells visible here.
X_test = pad_sequences(x_test, maxlen=MAX_LEB, padding='post')
print(X_train.shape, y_train.shape)

(1280000, 50) (1280000, 1)


# Build Model

In [None]:
import tensorflow as tf

In [None]:
import keras

In [None]:
class Convolution1D(keras.Model):
  def __init__(self, *args, **kwargs):
    super(Convolution1D, self).__init__(args, kwargs)
    self.embedding_layer = keras.layers.Embedding(input_dim=400000,output_dim=32,input_length=50)
    self.conv1 = keras.layers.Conv1D(filters=128, kernel_size=5, padding='same', activation='relu')
    self.pool1 = keras.layers.MaxPooling1D(pool_size=2)
    self.drop1 = keras.layers.Dropout(0.2)
    self.conv2 = keras.layers.Conv1D(filters=64, kernel_size=6, padding='same', activation='relu')
    self.pool2 = keras.layers.MaxPooling1D(pool_size=2)
    self.drop2 = keras.layers.Dropout(0.2)
    self.conv3 = keras.layers.Conv1D(filters=32, kernel_size=7, padding='same', activation='relu')
    self.pool3 = keras.layers.MaxPooling1D(pool_size=2)
    self.drop3 = keras.layers.Dropout(0.2)    
    self.conv4 = keras.layers.Conv1D(filters=32, kernel_size=8, padding='same', activation='relu')
    self.pool4 = keras.layers.MaxPooling1D(pool_size=2)
    self.drop4 = keras.layers.Dropout(0.2)  
    self.flatten = keras.layers.Flatten()
    self.dense = keras.layers.Dense(1) 

  def call(self,inputs, training=False):
    net = self.embedding_layer(keras.Input(shape=(50)))
    net = self.conv1(inputs)
    net = self.pool1(net)
    net = self.drop1(net)
    net = self.conv2(net)
    net = self.pool2(net)
    net = self.drop2(net)
    net = self.conv3(net)
    net = self.pool3(net)
    net = self.drop3(net)
    net = self.conv4(net)
    net = self.pool4(net)
    net = self.drop4(net)
    net = self.flatten(net)
    net = self.drop4(net)
    net = self.dense(net)
    return net

In [None]:
model = Convolution1D() 
input_tensor = keras.Input(shape=(50,32))
length=keras.Input(shape=(50))
model(input_tensor)
model.summary()
input_tensor[0,]

Model: "convolution1d_56"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_216 (Conv1D)          (None, 50, 128)           20608     
_________________________________________________________________
max_pooling1d_216 (MaxPoolin (None, 25, 128)           0         
_________________________________________________________________
dropout_216 (Dropout)        (None, 25, 128)           0         
_________________________________________________________________
conv1d_217 (Conv1D)          (None, 25, 64)            49216     
_________________________________________________________________
max_pooling1d_217 (MaxPoolin (None, 12, 64)            0         
_________________________________________________________________
dropout_217 (Dropout)        (None, 12, 64)            0         
_________________________________________________________________
conv1d_218 (Conv1D)          (None, 12, 32)       

<tf.Tensor 'strided_slice_24:0' shape=(50, 32) dtype=float32>

In [None]:
# NOTE(review): EPOCHS and BATCH_SIZE from the settings cell are not used here
# (10 epochs, default batch size); confirm which values are intended.
model.compile('SGD','mse',metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, verbose=1)
# Convolution1D is a subclassed keras.Model. Whole-model HDF5 export via
# model.save('model.h5') is only supported for Sequential/functional models,
# so persist the weights instead; restore them with
# Convolution1D() + load_weights('model.h5') after building the model.
model.save_weights('model.h5')

Epoch 1/10


ValueError: ignored