In [72]:
import tensorflow as tf
from tensorflow import keras
from keras.losses import sparse_categorical_crossentropy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
import numpy as np

In [73]:
# Read the pre-processed splits (paths are relative to the working directory).
train_data = pd.read_csv("processed_training.csv")
test_data = pd.read_csv("processed_test.csv")

# Columns used by the later cells: raw text for both splits, labels for train.
train_texts, train_labels = train_data["text"], train_data["label"]
test_texts = test_data["text"]

# The word index is fitted on the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Size of the fitted word index; used later to size the embedding table.
vocab_len = len(tokenizer.word_index)

In [74]:
# Bag-of-words counts re-weighted by TF-IDF, both fitted on the training texts.
# NOTE(review): no later cell visible in this notebook reads these matrices.
countVectorizer, tfidfTransformer = CountVectorizer(), TfidfTransformer()
training_count_matrix = countVectorizer.fit_transform(train_texts)
training_tfidf_matrix = tfidfTransformer.fit_transform(training_count_matrix)

In [75]:
# Sequence lengths (in tokens) for the two splits.
# NOTE(review): presumably the longest tokenised sentence in each split — confirm.
max_sentence_length_train, max_sentence_length_test = 61, 28

In [76]:
def onehotmatrix(texts, max_sen_len):
  """Convert texts into a zero-padded matrix of word indices.

  Despite the name, the result is not one-hot encoded: each row holds the
  integer word indices produced by the module-level ``tokenizer`` for one
  text, right-padded with 0 up to ``max_sen_len`` columns.

  Args:
    texts: Iterable of strings to encode.
    max_sen_len: Number of columns in the returned matrix.

  Returns:
    An integer ``numpy.ndarray`` of shape ``(len(texts), max_sen_len)``.
    Texts with more than ``max_sen_len`` tokens keep only their first
    ``max_sen_len`` tokens so that the result is always rectangular.
  """
  sequences = tokenizer.texts_to_sequences(texts)
  # Pre-filling with zeros gives the padding for free and keeps the shape
  # well-defined even when `texts` is empty.
  matrix = np.zeros((len(sequences), max_sen_len), dtype=int)
  for row, sequence in enumerate(sequences):
    kept = sequence[:max_sen_len]
    matrix[row, :len(kept)] = kept
  # Build the result from the sequences computed above, not from any
  # previously created global, so every call encodes its own `texts`.
  return matrix

# Both splits are padded to the training length (61) so they share one width.
train_onehot = onehotmatrix(train_texts, max_sentence_length_train)
test_onehot = onehotmatrix(test_texts, max_sentence_length_train)

In [77]:
# Averaged-embedding classifier, assembled one layer at a time.
model = keras.Sequential()
# Index 0 is the padding value, hence vocab_len + 1 rows; mask_zero makes the
# downstream layers ignore those padded positions.
model.add(keras.layers.Embedding(input_dim=vocab_len + 1, output_dim=64, mask_zero=True))
# Collapse the sequence axis into a single 64-dimensional vector per text.
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(units=16, activation='relu'))
# Six output units with softmax: one probability per class.
model.add(keras.layers.Dense(units=6, activation='softmax'))

In [78]:
# Labels are passed as integers, so the sparse variant of cross-entropy is used.
model.compile(
    optimizer='adam',
    loss=sparse_categorical_crossentropy,
    metrics=['acc'],
)

In [79]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 64)          897344    
                                                                 
 global_average_pooling1d_3   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 16)                1040      
                                                                 
 dense_7 (Dense)             (None, 6)                 102       
                                                                 
Total params: 898,486
Trainable params: 898,486
Non-trainable params: 0
_________________________________________________________________


In [80]:
# The first 1000 rows are held out for validation; the rest are used to train.
model.fit(
    train_onehot[1000:],
    train_labels[1000:],
    validation_data=(train_onehot[:1000], train_labels[:1000]),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f471b0d2610>

In [92]:
# Per-class probabilities for every test row.
prediction_prob = model.predict(test_onehot)

# Predicted class = index of the largest probability in each row (the first
# index wins on ties), kept as a plain list of Python ints.
prediction_labels = np.argmax(prediction_prob, axis=1).tolist()

# Ids are 1-based and derived from the number of predictions instead of a
# hard-coded 2000, so a different test-set size cannot silently drop rows.
submission = pd.DataFrame({
    "id": range(1, len(prediction_labels) + 1),
    "label": prediction_labels,
})
# Written in a separate statement: to_csv returns None when given a path, so
# chaining it onto the constructor would leave `submission` as None.
submission.to_csv("submission.csv", index=False)