In [None]:
import sys
from indicnlp import common
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"D:\6th SEM\Open Lab\indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"D:\6th SEM\Open Lab\indic_nlp_resources"

# Add library to Python path
sys.path.append(r'{}\src'.format(INDIC_NLP_LIB_HOME))

# Set environment variable for resources folder
common.set_resources_path(INDIC_NLP_RESOURCES)

In [1]:
import pandas as pd
news = pd.read_csv('lemmatized_news.csv', encoding='utf-8')

In [None]:
news.isnull().sum(axis = 0)
news.head()

In [None]:
pd.set_option('display.max_colwidth', None)
news.head()

In [2]:
import preProcessing
from tqdm import tqdm

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 154kB [00:00, 51.4MB/s]                    
2022-06-07 19:15:10 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |
| pos       | hdtb    |
| lemma     | hdtb    |

2022-06-07 19:15:10 INFO: Use device: cpu
2022-06-07 19:15:10 INFO: Loading: tokenize
2022-06-07 19:15:11 INFO: Loading: pos
2022-06-07 19:15:11 INFO: Loading: lemma
2022-06-07 19:15:12 INFO: Done loading processors!


In [None]:
df = preProcessing.eachNews(news)

In [None]:
df.head()

In [3]:
path = 'ai4bharat/indic-bert'
## Loading the model
from transformers import AutoModel, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path,output_hidden_states=True)


Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['sop_classifier.classifier.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
## Embeddings 
final = []
for i in tqdm(news.index):
    temp = tokenizer.convert_ids_to_tokens(tokenizer.encode(news['News'][i]))
    input_encoded = tokenizer.encode_plus(news['News'][i], return_tensors="pt")
    with torch.no_grad():
        states = model(**input_encoded).hidden_states
    output = torch.stack([states[i] for i in range(len(states))])
    #output = output.squeeze()
    token_vecs = output[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    sentence_embedding = sentence_embedding.tolist()
    final.append(sentence_embedding)

print("Output shape is {}".format(output.shape))

100%|██████████| 4588/4588 [04:41<00:00, 16.27it/s]

Output shape is torch.Size([13, 1, 39, 768])





In [7]:
import numpy as np
final = np.array(final)

In [8]:
final.shape

(4588, 768)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_rem, y_train, y_rem = train_test_split(final, np.array(news['Label'].tolist()), train_size=0.8)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
import dill

SVM = svm.SVC(C=50, kernel='rbf', gamma='auto')
SVM.fit(X_train,y_train)
dill.dump(SVM, open("SVM_Model.sav","wb"))
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_rem)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",round((accuracy_score(predictions_SVM, y_rem)*100),2),"%")

SVM Accuracy Score ->  97.6 %


In [None]:
from sklearn.metrics import (precision_recall_curve,PrecisionRecallDisplay)

precision, recall, _ = precision_recall_curve(y_rem, predictions_SVM)
disp = PrecisionRecallDisplay(precision=precision, recall=recall)
disp.plot()
plt.show()

In [None]:
from sklearn.metrics import roc_curve

pred = SVM.predict(X_rem).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_rem, pred)

In [None]:
from sklearn.metrics import auc
auc_keras = round((auc(fpr_keras, tpr_keras)*100),2)

In [None]:
import matplotlib.pyplot as plt

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='SVM Classifier AUC = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [12]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.models import Model, Sequential
import keras
from keras import layers
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

In [13]:
cnnModel = Sequential()
cnnModel.add(layers.LSTM(units=50, activation='relu', return_sequences=True, 
                input_shape=(X_train.shape[1], 1)))
cnnModel.add(layers.Dropout(0.3))

cnnModel.add(layers.Convolution1D(32, 4,activation='relu'))
cnnModel.add(layers.Dropout(0.4))

cnnModel.add(layers.AveragePooling1D())

cnnModel.add(layers.Convolution1D(64, 4,activation='relu'))
cnnModel.add(layers.Dropout(0.4))

cnnModel.add(layers.AveragePooling1D())

cnnModel.add(layers.Flatten())
cnnModel.add(layers.Dropout(0.5))

cnnModel.add(layers.Dense(units=1, activation='sigmoid'))

In [14]:
cnnModel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 768, 50)           10400     
                                                                 
 dropout (Dropout)           (None, 768, 50)           0         
                                                                 
 conv1d (Conv1D)             (None, 765, 32)           6432      
                                                                 
 dropout_1 (Dropout)         (None, 765, 32)           0         
                                                                 
 average_pooling1d (AverageP  (None, 382, 32)          0         
 ooling1D)                                                       
                                                                 
 conv1d_1 (Conv1D)           (None, 379, 64)           8256      
                                                        

In [15]:
cnnModel.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), 
                    metrics=[tf.keras.metrics.BinaryAccuracy()])

checkpoint = ModelCheckpoint('modelCaseIndic.h5',verbose=1, monitor='val_binary_accuracy',save_best_only=True, mode='auto')

#checkpoint = ModelCheckpoint('model.h5',verbose=1, monitor='loss',save_best_only=True, mode='auto')


cnnModel.fit(x=X_train, y=y_train, batch_size=256, epochs=25, 
                validation_data=(X_valid,y_valid), callbacks=[checkpoint])

Epoch 1/25
Epoch 1: val_binary_accuracy improved from -inf to 0.48366, saving model to modelCaseIndic.h5
Epoch 2/25
Epoch 2: val_binary_accuracy improved from 0.48366 to 0.77996, saving model to modelCaseIndic.h5
Epoch 3/25
Epoch 3: val_binary_accuracy improved from 0.77996 to 0.82353, saving model to modelCaseIndic.h5
Epoch 4/25
Epoch 4: val_binary_accuracy improved from 0.82353 to 0.82789, saving model to modelCaseIndic.h5
Epoch 5/25
Epoch 5: val_binary_accuracy improved from 0.82789 to 0.92375, saving model to modelCaseIndic.h5
Epoch 6/25
Epoch 6: val_binary_accuracy did not improve from 0.92375
Epoch 7/25
Epoch 7: val_binary_accuracy did not improve from 0.92375
Epoch 8/25
Epoch 8: val_binary_accuracy improved from 0.92375 to 0.93028, saving model to modelCaseIndic.h5
Epoch 9/25
Epoch 9: val_binary_accuracy did not improve from 0.93028
Epoch 10/25
Epoch 10: val_binary_accuracy improved from 0.93028 to 0.93246, saving model to modelCaseIndic.h5
Epoch 11/25
Epoch 11: val_binary_accur

<keras.callbacks.History at 0x1efe1419660>

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import dill
from sklearn.metrics import accuracy_score

sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_rem)

classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

dill.dump(classifier, open('logisticCase.sav', 'wb'))
y_pred = classifier.predict(X_rem)

print (f"Accuracy : {(accuracy_score(y_rem, y_pred)*100):.2f}%")

Accuracy : 98.15%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
