In [2]:
import os
from time import process_time

import cufflinks as cf
import numpy as np
import pandas as pd

from preprocess_text import Preprocess
from data_preprocessing import *

In [3]:
# Validation split is loaded as-is; only the training split has rows with
# missing values removed.
val_data = pd.read_csv('../dataset/original-dataset/marathi-validation-data.csv')
training_data = pd.read_csv(
    '../dataset/original-dataset/marathi-training-data.csv'
).dropna()
training_data.head()

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy


In [4]:
# Cufflinks setup, executed before any chart is drawn.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Number of training records per label, largest class first, as a bar chart.
label_counts = training_data['label'].value_counts().sort_values(ascending=False)
label_counts.iplot(
    kind='bar',
    xTitle='Category',
    yTitle='Number of records',
    title='Training Data Overview',
)

In [5]:
# Draw one random training row and show its label next to its raw text.
sample = training_data.sample()
sample_label = sample['label'].values[0]
sample_text = sample['text'].values[0]
print('Label : \n\n', sample_label, '\n\nText :\n\n', sample_text)

Label : 

 cse 

Text :

 म्हणून , ते प्रामाणिकपणावर किंवा आम्ही काय म्हणतो ते यावर आक्रमण आहे .


In [6]:
# Read the stop-word file and keep its whitespace-separated tokens as a set.
with open('../dataset/marathi_stopwords.txt', 'r', encoding='utf') as st:
    stopword_list = set(st.read().split())

# NOTE(review): the preprocessor is constructed with an empty list, so the
# stop words loaded above are not handed to it -- confirm this is intentional.
pp = Preprocess([])

In [7]:
# Run the cleaner on the sampled record so its effect can be inspected.
raw_sample_text = sample['text'].values[0]
preprocessed_text = pp.clean_text(raw_sample_text)
print(preprocessed_text)

म्हणून ते प्रामाणिकपणावर किंवा आम्ही काय म्हणतो ते यावर आक्रमण आहे


In [8]:
def _count_space_separated(text):
    """Return how many pieces str(text) yields when split on single spaces."""
    return len(str(text).split(' '))


# Store the per-record piece count alongside the text and preview it.
training_data['word count'] = training_data['text'].apply(_count_space_separated)
training_data[['text', 'word count']].head()

Unnamed: 0,text,word count
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",48
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",44
2,- - - - - - - - - - - - - - - - - - - - - - - ...,107
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",9
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",29


In [9]:
# Summary statistics of the per-record word counts.
word_count_column = training_data['word count']
word_count_column.describe()

count    41997.000000
mean        27.040431
std         25.178213
min          2.000000
25%         13.000000
50%         20.000000
75%         32.000000
max        404.000000
Name: word count, dtype: float64

In [10]:
# Histogram of the per-record word counts over the training split.
# The chart title previously misspelled "distribution".
training_data['word count'].iplot(
    kind='hist',
    xTitle="word count",
    yTitle='records',
    title="Histogram depicting distribution of word count across training data",
)

In [11]:
# The maximum number of words to be used (most frequent).
# NOTE(review): not referenced by any other cell visible in this notebook --
# confirm it is still needed.
word_limit = 55000
# Max number of words kept per text record.
# NOTE(review): the padding cell defines and uses its own pad_len instead of
# this value -- confirm which one is intended.
max_word_len = 100
# Length of each word vector.
embedding_dim = 300

In [12]:
# Clean every text with the shared preprocessor and extract the labels,
# producing plain Python lists for both splits.
x_train = training_data['text'].apply(pp.clean_text).tolist()
y_train = training_data['label'].values.tolist()
x_val = val_data['text'].apply(pp.clean_text).tolist()
y_val = val_data['label'].values.tolist()

# Print the four lengths so text/label counts can be compared per split.
for split_values in (x_train, y_train, x_val, y_val):
    print(len(split_values))

41997
41997
3780
3780


In [13]:
# Encode both label lists with the project helper.
# NOTE(review): this rebinds y_train / y_val, so re-running the cell feeds
# already-encoded labels back into the helper -- re-run the previous cell first.
encoded_pair = label_encoder(y_train, y_val)
y_train, y_val = encoded_pair

In [14]:
# Keep one row per record and let reshape infer the remaining axis.
y_train = y_train.reshape(len(y_train), -1)
y_val = y_val.reshape(len(y_val), -1)
for encoded_labels in (y_train, y_val):
    print(encoded_labels.shape)

(41997, 1)
(3780, 1)


In [15]:
# Settings passed to the tokenise-and-pad helper.
pad_len = 50
padding_type = 'post'
truncating_type = 'post'

# The helper returns the tokenizer plus the padded training and validation data.
padded_outputs = tokenizer_and_pad_training(
    x_train, x_val, pad_len, padding_type, truncating_type
)
tokenizer, x_train_padded, x_val_padded = padded_outputs

In [16]:
# Keep the tokenizer's word_index for the embedding lookup, then report the
# padded array shapes followed by the vocabulary size.
vocab = tokenizer.word_index
for padded_split in (x_train_padded, x_val_padded):
    print(padded_split.shape)
print(len(vocab))

(41997, 50)
(3780, 50)
52507


In [17]:
# Location of the pre-trained word vectors. Set the EMBEDDING_PATH environment
# variable to point at a local copy; the fallback is the original
# machine-specific path, kept so existing setups behave as before.
embedding_path = os.environ.get(
    "EMBEDDING_PATH",
    "C:/Users/Amey/Desktop/Project Hub/Machine Learning/NLP/Projects/Technodifacation/Embeddings/DS/DS_fasttext_skipgram_raw_300.vec",
)

# process_time() reports CPU time of this process, not wall-clock time.
start = process_time()
# Use the notebook-level embedding_dim constant rather than repeating the literal.
embedding_matrix = get_embedding_matrix(embedding_path, vocab, embedding_dim=embedding_dim)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix.shape

51273
Total time taken:  12.09375


(52508, 300)

In [18]:
# One-hot encode the training labels and show the resulting shape.
label_values = training_data['label'].values
labels = pd.get_dummies(label_values)
labels.shape

(41997, 4)

In [19]:
from model_architecture import Models

Using TensorFlow backend.


In [20]:
# Models is imported from model_architecture; the instance's myLSTM_1 method
# is what the next cell calls to build the network.
models = Models()

In [21]:
# Build the network on top of the pre-trained embedding matrix.
# Sizes are read from the data instead of being typed in by hand, so the model
# stays consistent if the embedding file or the set of labels changes:
#   - rows / columns of embedding_matrix give num_records / embedding_dim
#   - the number of one-hot columns in labels gives num_labels
vocab_rows, vector_size = embedding_matrix.shape
myLSTM = models.myLSTM_1(embedding_matrix, num_records=vocab_rows,
                         pad_len=pad_len, embedding_dim=vector_size,
                         num_labels=labels.shape[1])

In [22]:
from keras.callbacks import EarlyStopping

In [24]:
 # Early stopping watches val_loss: patience of 3 epochs, min_delta of 0.0001.
 early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
 # Train on the padded sequences against the one-hot labels; validation_split
 # holds out a tenth of the training data for validation.
 history1 = myLSTM.fit(
     x_train_padded,
     labels,
     epochs=6,
     batch_size=64,
     validation_split=0.1,
     callbacks=[early_stopping],
 )

Train on 37797 samples, validate on 4200 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [50]:
# y_val = pd.get_dummies(val_data['label'].values)
# y_test = pd.get_dummies(training_data['label'].values)

In [25]:
# Predicted class per validation record: index of the largest score along the
# last axis of the model output.
val_scores = myLSTM.predict(x_val_padded)
results = np.argmax(val_scores, axis=-1)

In [26]:
from model import classification_report
# Score the argmax predictions against the encoded validation labels using the
# project's classification_report helper, which yields four values here.
acc, precision, recall, f1 = classification_report(y_val, results)
print("Validation Accuracy: ", acc)

# Print each metric followed by its mean. Generating the labels from one name
# keeps them consistent (the recall average was previously printed as
# "Average nRecall").
for metric_name, metric_values in (("Precision", precision),
                                   ("Recall", recall),
                                   ("F1-Score", f1)):
    print("\n" + metric_name + ": ", metric_values)
    print("Average " + metric_name + ": ", np.mean(metric_values))

Validation Accuracy:  0.8902116402116402

Precision:  [0.85213033 0.89973788 0.89373602 0.88761707]
Average Precision:  0.8833053215176092

Recall:  [0.80952381 0.91229236 0.90282486 0.87938144]
Average nRecall:  0.8760056175959569

F1-Score:  [0.83028083 0.90597163 0.89825745 0.88348006]
Average F1-Score:  0.8794974917387983
