In [1]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import sklearn
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.tokenize import word_tokenize,sent_tokenize
import sentencepiece as spm
import keras
from fastai.text import *
import re
import pdb
import fastai

from sklearn.preprocessing import LabelEncoder

In [2]:
# Upper bound on the number of CSV rows to load; set to None to read the whole file.
nRowsRead = 1000
# train.csv may hold more rows than this; only the first nRowsRead are loaded.
df = pd.read_csv('train.csv', sep=',', nrows=nRowsRead)
# Remember which file the frame came from.
df.dataframeName = 'train.csv'
(nRow, nCol) = df.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 383 rows and 2 columns


In [3]:
# Slice the first five rows, then list the column labels.
# Only the last expression of a cell is rendered, so the row slice is not displayed.
df.iloc[:5]
df.columns

Index(['Sloka', 'Class'], dtype='object')

In [4]:
# Upper bound on the number of CSV rows to load; set to None to read the whole file.
nRowsRead = 1000
# valid.csv may hold more rows than this; only the first nRowsRead are loaded.
df2 = pd.read_csv('valid.csv', sep=',', nrows=nRowsRead)
# Remember which file the frame came from.
df2.dataframeName = 'valid.csv'
(nRow, nCol) = df2.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 96 rows and 2 columns


In [5]:
# Show the first five rows of the validation frame.
df2.iloc[:5]

Unnamed: 0,Sloka,Class
0,यो यस्मिन् कर्माणि कुशलस्तं तस्मित्रैव योजयेत्...,sanskrit-slogan
1,अध्ययनेन/अध्ययनं वीना ज्ञानं न भवति ॥,sanskrit-slogan
2,पुष्पं पुष्पं विचिन्वीत मूलच्छेदं न कारयेत् । ...,Vidur Niti Slokas
3,मृजया रक्ष्यते रूपम् ॥,sanskrit-slogan
4,मूर्खश्चिरायुर्जातोऽपि तस्माज्जातमृतो वरः। ...,Chanakya Slokas


In [6]:
# Merge the train and validation frames into a single dataset.
# ignore_index=True renumbers the rows 0..n-1; without it the row labels of both
# inputs are kept and the result carries duplicate index labels.
df1 = pd.concat([df, df2], ignore_index=True)
print(df1.shape)

(479, 2)


In [7]:
# Split the merged frame into the text column and the class-name column.
sloka, labels = df1['Sloka'], df1['Class']

In [8]:
# Learn the class-name -> integer id mapping, then apply it to every label.
encoder = LabelEncoder()
encoder.fit(labels)
encoded_labels = encoder.transform(labels)
print(encoded_labels)

[1 1 0 2 ... 2 2 0 0]


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 10% of the samples for testing, stratified on the encoded class ids.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sloka,
    encoded_labels,
    test_size=0.1,
    stratify=encoded_labels,
    random_state=42,
)

In [11]:
# Hyperparameters of the model

# --- vocabulary ---
vocab_size = 1000      # passed to the tokenizer as num_words and to the embedding layer
# NOTE(review): '<OOK>' looks like a typo of '<OOV>' -- confirm before changing.
oov_tok = '<OOK>'      # placeholder token for out-of-vocabulary words

# --- sequence shaping ---
max_length = 30        # every sequence is padded/truncated to this many tokens
padding_type = 'post'
trunc_type = 'post'

# --- network ---
embedding_dim = 100    # size of each word vector in the embedding layer

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [13]:

# Fit the tokenizer on the training sentences only, so the vocabulary is not
# built from the test split.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Convert the train sentences to integer sequences of length max_length.
# padding/truncating come from the hyperparameter cell; when `truncating` is
# omitted, pad_sequences cuts over-long sequences at the front ('pre'), which
# contradicts trunc_type.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Apply the same conversion to the test sentences.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [14]:
# Layers of the classifier, in order: word embedding -> bidirectional LSTM ->
# small dense layer -> 3-way softmax output.
layer_stack = [
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(units=128)),
    keras.layers.Dense(units=24, activation='relu'),
    keras.layers.Dense(units=3, activation='softmax'),
]
model = keras.Sequential(layer_stack)

# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 100)           100000    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 24)                6168      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 75        
Total params: 340,739
Trainable params: 340,739
Non-trainable params: 0
_________________________________________________________________


In [15]:
# One-hot encode the integer class ids (3 classes) for the categorical_crossentropy loss.
# This cell rebinds train_labels/test_labels, so it is guarded: on a re-run the
# labels are already 2-D one-hot arrays and must not be encoded a second time.
if np.ndim(test_labels) == 1:
    # Keep the integer class ids of the test set for the accuracy check.
    test_labels_one_d = list(test_labels)
    train_labels = keras.utils.to_categorical(train_labels, 3)
    test_labels = keras.utils.to_categorical(test_labels, 3)

In [16]:
# Train the model, with a 0.1 validation split taken from the training data.
num_epochs = 30
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=num_epochs,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [17]:
# Calculate accuracy on Test data

# Get probabilities: one row of per-class softmax scores for each test sample.
prediction = model.predict(test_padded)
print(prediction)

# Predicted class = index of the highest score in each row. np.argmax returns
# the first maximum on ties and works for any number of classes.
predicted_labels = np.argmax(prediction, axis=1).tolist()

# Accuracy : one can use classification_report from sklearn
matches = [
    predicted_label == true_label
    for predicted_label, true_label in zip(predicted_labels, test_labels_one_d)
]
# int() keeps the counters plain Python ints so the printed value is a plain float.
correctly_classified = int(sum(matches))
wrongly_classified = len(matches) - correctly_classified

print(f'accuracy = {(correctly_classified/(correctly_classified + wrongly_classified))*100} %')

[[8.760914e-04 9.991239e-01 2.156829e-08]
 [1.263457e-01 8.736537e-01 5.889838e-07]
 [9.942180e-01 5.781827e-03 7.170393e-08]
 [2.939606e-05 1.180469e-03 9.987902e-01]
 ...
 [3.748699e-04 9.996251e-01 2.398419e-08]
 [8.013907e-04 9.991986e-01 2.812356e-08]
 [9.981793e-01 1.820561e-03 1.304962e-07]
 [2.531907e-02 9.746805e-01 5.127728e-07]]
accuracy = 85.41666666666666 %
