In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

import tensorflow as tf
# Force Keras/TensorFlow functions to execute eagerly and put tf.data into
# debug mode for the rest of the notebook.
# NOTE(review): these switches are normally debugging aids and may slow
# training noticeably; confirm they are still required before keeping them.
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

# Load Data

In [2]:
# Read the training and test workbooks into pandas DataFrames.
train0, test0 = (
    pd.read_excel(workbook)
    for workbook in ('Data_Train.xlsx', 'Data_Test.xlsx')
)

# Convert Pandas to Dask

In [3]:
# Wrap both pandas frames as Dask DataFrames with four partitions each.
train1, test1 = (
    dd.from_pandas(frame, npartitions=4)
    for frame in (train0, test0)
)

In [4]:
# Feature frame: the training data without its label column (still a Dask frame).
train2 = train1.drop('SECTION', axis=1).copy()
# Labels: pull the SECTION column out of Dask into an in-memory pandas Series.
target = train1['SECTION'].compute()

# Combine train/test

In [5]:
# Stack the training features on top of the test frame so both go through the
# same text pipeline; later cells split them apart again by row position.
data = dd.concat([train2, test1], axis=0)

# Feature Transformation

In [6]:
# Requires the NLTK 'punkt' and 'stopwords' resources; if they are missing,
# run nltk.download('punkt') and nltk.download('stopwords') once.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")


def preprocess_story(text):
    """Normalise one raw STORY string for the downstream vectorisers.

    Steps, in order: tokenize with NLTK, drop English stop words
    (case-insensitively), lower-case, Snowball-stem each remaining token,
    and join the tokens back into one space-separated string.

    Parameters
    ----------
    text : str
        Raw story text.

    Returns
    -------
    str
        Space-separated, stemmed, lower-cased tokens.
    """
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    kept = [token for token in lowered if token not in stop_words]
    return ' '.join(stemmer.stem(token) for token in kept)


# Apply the whole pipeline in a single pass and write the result to a new
# frame. `data` itself is left untouched, so re-running this cell does not
# tokenize already-processed text.
data_final = data.assign(
    STORY=data['STORY'].apply(preprocess_story, meta=('STORY', 'object'))
).compute()

# Split train/test

In [7]:
# The first len(train0) rows of the combined frame are the training rows;
# everything after them belongs to the test set.
n_train_rows = len(train0)
train_final = data_final.iloc[:n_train_rows]
test_final = data_final.iloc[n_train_rows:]

# Base Model (TFIDF/Random Forest)

In [8]:
# Baseline: TF-IDF features fed into a random forest, scored by 5-fold CV accuracy.
# NOTE: `train_final` must still be the DataFrame here; a later cell rebinds
# the same name to the padded NumPy array, so run this cell before that one.
pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=0),
)

kf = KFold(n_splits=5)
r = cross_val_score(
    pipeline,
    train_final['STORY'],
    target,
    cv=kf,
    scoring='accuracy',
)
results = [r]
print(f'Accuracy: {round(np.mean(results), 2)}')

Accuracy: 0.95


# Data Preprocessing

In [9]:
# Build the vocabulary on the combined (train + test) stories, map each story
# to a list of integer word ids, then pad every sequence to the longest one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_final['STORY'])
sequences = tokenizer.texts_to_sequences(data_final['STORY'])
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Split padded sequences back into train/test

In [10]:
# Cut the padded matrix at the train/test boundary (row len(train0)).
# Note: this rebinds train_final/test_final from DataFrames to NumPy arrays.
train_final, test_final = np.split(padded_sequences, [len(train0)])

In [11]:
# Hold out 20% of the labelled rows for validation/evaluation of the CNN.
X_train, X_test, y_train, y_test = train_test_split(
    train_final,
    target,
    random_state=0,
    test_size=0.2,
)

# Define model (Embedding/CNN)

In [12]:
# Seed TensorFlow's global generator so weight initialisation and other
# TensorFlow random ops are repeatable across runs, in line with the
# random_state=0 used for the scikit-learn steps.
tf.random.set_seed(0)

# Embedding -> 1D convolution -> global max pooling -> dense classifier.
model = Sequential([
    # Vocabulary size is len(word_index) + 1: the extra row covers id 0,
    # which the padded sequences may contain.
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=512,
        input_length=max_sequence_length,
    ),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    # Four output units: one probability per class.
    Dense(4, activation='softmax'),
])

# Model Compilation

In [13]:
# Categorical cross-entropy expects one-hot labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Target Transformation

In [14]:
# Convert integer class labels to one-hot vectors.
# The ndim guard keeps this cell idempotent: if the labels have already been
# converted (2-D), a re-run leaves them alone instead of encoding them again.
num_classes = 4  # must match the width of the model's final Dense layer
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, num_classes)

# Model Training

In [15]:
# Train for five epochs, reporting metrics on the held-out split after each one.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2150d798f08>

# Evaluation

In [16]:
# Score the trained network on the held-out split (the same split that was
# passed as validation_data during fitting).
loss, accuracy = model.evaluate(x=X_test, y=y_test, batch_size=32)
print("Test loss:", loss)
print("Test accuracy:", accuracy)

Test loss: 0.08301597088575363
Test accuracy: 0.9777195453643799


# Prediction

In [17]:
# Class-probability rows for the padded test sequences.
predictions = model.predict(x=test_final)



In [18]:
# Pick the most probable class per row in one vectorised call instead of a
# Python-level loop over the prediction rows.
final_predictions = np.argmax(predictions, axis=1)

In [19]:
# One-column submission frame holding the predicted SECTION for each test row.
submission = pd.Series(final_predictions, name='SECTION').to_frame()
submission

Unnamed: 0,SECTION
0,1
1,2
2,1
3,0
4,1
...,...
2743,1
2744,1
2745,1
2746,0


In [20]:
# Persist the predictions without the index column.
submission.to_csv(path_or_buf='submission_CNN_model.csv', index=False)