<a href="https://colab.research.google.com/github/DieKim/SBA-Elice_Project_NLP/blob/main/sba_project_emoing/StratifiedKFold_Categorical_Crossentropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bidirectional + StratifiedKFold + Categorical Crossentropy

---

## 3. 딥러닝 모델

### 3-1. 모델 설정

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, SimpleRNN, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

In [None]:
# Hyper-parameters shared by the embedding layer and the padding step.
embedding_dim = 256    # width of each learned word vector
sequence_length = 35   # max(num_token_per_sentence)
# One extra row on top of the tokenizer's vocabulary: the Embedding layer is
# created with mask_zero=True, so index 0 is kept for padding.
vocabulary_size = 1 + len(tokenizer.word_index)

In [None]:
## https://www.tensorflow.org/tutorials/text/text_classification_rnn

from tensorflow.keras.layers import Bidirectional

# Number of output classes, derived from the labels themselves.
# Assumes Y_train holds integer class ids starting at 0 (it is a 1-D array of
# shape (n_samples,)) -- TODO confirm the label encoding used upstream.
num_classes = int(Y_train.max()) + 1

# Stacked bidirectional LSTM classifier over padded token-id sequences.
model = Sequential([
  Embedding(vocabulary_size, embedding_dim, mask_zero=True),  # id 0 = padding, masked out
  Bidirectional(LSTM(64, return_sequences=True)),             # full sequence for the next LSTM
  Bidirectional(LSTM(32)),                                    # final state only
  Dense(64, activation='relu'),
  Dropout(0.5),
  # One softmax unit per class. A single softmax unit always outputs 1.0,
  # which makes the loss 0 and the model unable to learn anything.
  Dense(num_classes, activation='softmax')
])

# Show the model structure (summary() prints by itself and returns None).
model.summary()

# Compile options. The labels are integer ids rather than one-hot vectors,
# so the sparse variant of categorical cross-entropy is the matching loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 256)         2969088   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         164352    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 3,178,881
Trainable params: 3,178,881
Non-trainable params: 0
____________________________________________

### 3-2. 데이터 split 및 학습 수행

In [None]:
# Sanity check: padded inputs and labels must have the same number of rows.
for array in (X_train_pad, Y_train):
    print(array.shape)

(20000, 35)
(20000,)


In [None]:
# Stratified k-fold cross-validation: every fold keeps the class proportions
# of the full training set.
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import clone_model


def _fresh_compiled_copy(template):
  """Return a newly initialised, compiled copy of `template`.

  The copy has the same architecture and loss as `template` but freshly
  initialised weights and a new optimizer, so nothing learned by `template`
  (or by a previous copy) carries over.
  """
  fresh = clone_model(template)
  # Optimizer and metrics mirror the compile call used for `model`.
  fresh.compile(optimizer='adam', loss=template.loss, metrics=['accuracy'])
  return fresh


skf = StratifiedKFold(shuffle=True, random_state=42)  # n_splits defaults to 5

# Per-fold score containers
acc_per_fold = []
loss_per_fold = []

for fold_no, (train_idx, test_idx) in enumerate(skf.split(X_train_pad, Y_train), start=1):
  X_train, X_test = X_train_pad[train_idx], X_train_pad[test_idx]
  y_train, y_test = Y_train[train_idx], Y_train[test_idx]

  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Train a brand-new model for this fold. Re-fitting one shared model would
  # let weights learned on earlier folds leak into later ones, so the held-out
  # samples would already have been seen during training.
  fold_model = _fresh_compiled_copy(model)
  history = fold_model.fit(X_train, y_train, batch_size=64, epochs=10)

  # Generalization metrics on the held-out part of this fold
  scores = fold_model.evaluate(X_test, y_test, verbose=0)
  print(f'Score for fold {fold_no}: {fold_model.metrics_names[0]} of {scores[0]}; {fold_model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

# Cross-validation only estimates performance. Fit the final model on the
# complete training set so that `model` is trained when it is used afterwards.
model = _fresh_compiled_copy(model)
model.fit(X_train_pad, Y_train, batch_size=64, epochs=10)


------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 1: loss of 0.0; accuracy of 13.54999989271164%
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 2: loss of 0.0; accuracy of 13.525000214576721%
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 3: loss of 0.0; accuracy of 13.54999989271164%
------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoc

### 3-3. 예측

In [None]:
# Load the scoring dataset (single text column, renamed to 'Sentence').
df_score = pd.read_csv('/content/drive/MyDrive/sentimental_analysis_project/data/score.csv',header=0, sep=';',names=['Sentence'],encoding='utf-8')

# Turn every sentence into a list of stemmed word tokens:
# tokenize -> drop stop words -> stem.
# NOTE(review): this is meant to mirror the preprocessing applied to the
# training data, which is not visible in this part of the notebook -- confirm.
stemming_sentence_score = [
    [stemmer.stem(word) for word in word_tokenize(sentence) if word not in stop_words]
    for sentence in df_score['Sentence']
]

# Map the tokens to integer ids with the existing tokenizer.
X_score = tokenizer.texts_to_sequences(stemming_sentence_score)

# Pad every sequence to the fixed model input length. sequence_length is used
# instead of a second hard-coded 35 so the two values cannot drift apart.
X_score_pad = pad_sequences(X_score, maxlen=sequence_length, padding='post')

In [None]:
# Predict the label of every scoring sentence.
# model.predict returns one row per sentence; this expects one probability
# column per class, so the predicted class id is the index of the largest value.
score_probabilities = model.predict(X_score_pad)
score_prediction_label = score_probabilities.argmax(axis=-1)
# NOTE(review): these are integer class ids. If the submission needs the
# original emotion names, map them back with the encoder used for Y_train.

# Distribution of predicted classes. A single class covering all rows means
# the model has collapsed and the predictions are not usable.
score_prediction = pd.Series(score_prediction_label)
score_prediction.value_counts()

1.0    1000
dtype: int64

In [None]:
# Save the results: one predicted emotion label per scoring sentence.
# score_prediction_label must already exist in the kernel and hold one label
# per row of the scoring set.
submission_df = pd.DataFrame({"Emotion" : score_prediction_label})
submission_df.to_csv('/content/drive/MyDrive/sentimental_analysis_project/submission.csv', index=False)

# Preview what was written. It has to be the last expression of the cell,
# otherwise the notebook does not render it.
submission_df.head()