In [None]:
# Installing the necessary files

!pip install transformers
!pip install simpletransformers

In [3]:
# Importing the required libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import re, os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import torch
import pickle

In [None]:
# Unzipping the data file
!unzip '/content/drive/MyDrive/Colab Datasets/Document Tagging.zip'

In [4]:
# Folders of the extracted dataset, all located under one common root
DATA_ROOT = '/content/Document Tagging'

train_doc_path = DATA_ROOT + '/Train_docs'
train_tag_path = DATA_ROOT + '/Train_tags'
test_doc_path = DATA_ROOT + '/Test_docs'

In [5]:
# Reading all the training documnets into train_docs list

train_docs = []

for file in (os.listdir(train_doc_path)):
 train_docs.append(open(os.path.join(train_doc_path, file), mode = 'rb').read().lower())

In [6]:
# Encoding the documnets since it has been read in the bytes format
encoded_docs = []

for docs in train_docs:
  encoded_docs.append(docs.decode("latin-1"))

In [7]:
# Reading all the training tags into tag_docs list

tag_docs = []

for file in (os.listdir(train_tag_path)):
 tag_docs.append(open(os.path.join(train_tag_path, file), mode = 'rb').read().lower())

In [8]:
# Encoding the tags since it has been read in the bytes format
encoded_tags = []

for tags in tag_docs:
  encoded_tags.append(tags.decode("latin-1"))

In [9]:
# Reading all the test docs into test_docs list.
# File names are sorted so the row order of the predictions is reproducible
# (os.listdir() returns entries in arbitrary order).
test_docs = []

for test in sorted(os.listdir(test_doc_path)):
    # `with` closes each file handle as soon as its content has been read
    with open(os.path.join(test_doc_path, test), mode = 'rb') as fh:
        test_docs.append(fh.read().lower())

In [10]:
# Turn every raw-bytes test document into a str using the latin-1 codec
encoded_test = [raw_doc.decode("latin-1") for raw_doc in test_docs]

In [11]:
# Creating a pandas dataframe with Tags and Docs as columns.
# The columns are built by name instead of passing the tags as the index and
# recovering them with reset_index(). The column order (tags first, documents
# second) is unchanged, so the column renaming in the next cell still applies.
train_df = pd.DataFrame({'target_text': encoded_tags, 'input_text': encoded_docs})

In [12]:
# The model takes this dataframe as input and expects the columns to be called
# exactly 'input_text' and 'target_text'; other column names cause an error.
# The first column is the target (tags), the second one the input (documents).

SEQ2SEQ_COLUMNS = ['target_text', 'input_text']
train_df.columns = SEQ2SEQ_COLUMNS

In [13]:
# Single-column dataframe holding the decoded test documents.
# The column name is given to the constructor so this also works for an empty
# list; assigning `.columns` afterwards fails with a length mismatch when the
# frame was built from no rows.
test_df = pd.DataFrame(encoded_test, columns=['input_text'])

In [14]:
from simpletransformers.seq2seq import Seq2SeqModel

In [15]:
# Separating 10% of the training data (frac=0.1) as evaluation data;
# random_state makes the split reproducible
eval_df = train_df.sample(frac=0.1, random_state=42)

# NOTE: this rebinds train_df, so re-running only this cell removes another
# 10% from the training set - re-run from the dataframe-creation cell instead.
train_df = train_df.drop(eval_df.index)

# model parameters
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "max_seq_length": 512,
    "train_batch_size": 1,
    "num_train_epochs": 2,
}

# Check whether a CUDA-capable GPU is available
cuda_available = torch.cuda.is_available()

# Create a Bart-base model.
# use_cuda follows the availability check above (it used to be hard-coded to
# False, which left cuda_available unused and always trained on the CPU).
# NOTE(review): set use_cuda=False again if the GPU runs out of memory.
model = Seq2SeqModel(encoder_decoder_type="bart",
                    encoder_decoder_name="facebook/bart-base",
                    args=model_args, use_cuda=cuda_available)


In [35]:
# Empty the CUDA cache before training to reduce the chance of
# CUDA out-of-memory errors
torch.cuda.empty_cache()

# Train the model.
# The batch size is taken from "train_batch_size" in model_args. It is not
# passed here because train_model() interprets extra keyword arguments as
# evaluation metrics (name -> function), not as training settings.
model.train_model(train_df)

# Evaluate the model on the evaluation split
result = model.eval_model(eval_df)
print(result)

  0%|          | 0/48 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/48 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.846632719039917}


In [36]:
# Saving the model for future use.
# The `with` block closes the file once the dump has finished, so the pickle
# is completely written to disk before it is read back.
with open('document_tagging.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Inspecting the predictions on the evaluation rows.
# eval_df is iterated directly: rebinding test_df to eval_df here would replace
# the real test set that the tag-generation cell further down reads from test_df.

for idx, row in eval_df.iterrows():

    plot = row['input_text']          # document text
    true_title = row['target_text']   # reference tags of that document

    # Predict with trained BART model (one document per call, first result)
    predicted = model.predict([plot])[0]

    print(f'True Title: {true_title}\n')
    print(f'Predicted Title: {predicted}\n')
    print(f'Plot: {plot}\n\n\n')

In [16]:
# Loading the saved model.
# NOTE: pickle.load can execute arbitrary code contained in the file - only
# load a pickle that was written by this notebook or comes from a trusted source.

with open('document_tagging.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Generate the tags for every document of the test dataframe.
# Each document goes through the trained BART model in its own predict() call
# and the first element of the returned list is collected.

predicted_title = [
    model.predict([doc_text])[0]
    for doc_text in test_df['input_text']
]

In [18]:
# Build the result dataframe: one row per test document, holding the document
# text and the tags predicted for it

doc_tag_pairs = list(zip(test_df['input_text'], predicted_title))
output_df = pd.DataFrame(doc_tag_pairs)

In [19]:
# Readable headers for the two columns: document text and predicted tags
output_columns = ['Doc', 'Predicted Title']
output_df.columns = output_columns

In [20]:
# Final doc and predicted tag for it.
# The bare expression on the last line of the cell makes the notebook render
# the dataframe as the cell output.
output_df

Unnamed: 0,Doc,Predicted Title
0,"\r\n\r\nv. gopala gowda, j.\r\n\r\n1. leave gr...","ab, appeal, appeal"
1,"\r\n\r\n1. by order dated 10.12.2013, we had i...","acc, action, action"
2,"\r\n\r\na.k. sikri, j.\r\n\r\n1. in all these ...","ab, appeal, appeal"
3,"\r\n\r\nbrijesh kumar, j.\r\n\r\n1. the centra...","ab, court, court of appeal, court order, court..."
4,\r\n\r\n1. the appeals have been preferred by ...,"ab, court, court of appeal, court order, court..."
...,...,...
96,\r\n\r\n1. this appeal is directed against the...,"ab, appeal, appeal"
97,"\r\n\r\n1. this court on 23.09.2016, while ent...","ab, of course, of interest, of the court, of c..."
98,"\r\n\r\ns.b. sinha, j.\r\n\r\n1. scope and amb...","ab, appeal, appeal"
99,"\r\n\r\nshiva kirti singh, j.\r\n\r\n1. appell...","ab, court, court of appeal, court order, court..."


In [21]:
# Write the documents and their predicted tags to a CSV file
# (relative path, so it is created in the current working directory)
csv_path = 'Predicted tags.csv'
output_df.to_csv(csv_path)