# Loading and Cleaning Datasets

In [None]:
import pandas as pd
import numpy as np
import spacy as sp
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

If you are using an IDE, run
```
python -m spacy download en_core_web_sm
```
in a bash shell to install the English spaCy pipeline.

## Official Datasets

### Data and Exploration

In [None]:
# Read the official training split from JSON into a DataFrame; the bare
# name on the last line makes the notebook render it as the cell output.
df = pd.read_json(path_or_buf="../Datasets/Official/train.json")
df

In [None]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis="columns")]

In [None]:
from collections import Counter
from itertools import chain

# Count how often each label occurs across the `labels` lists of all rows.
# Building the Counter from a flattened iterator replaces the previous
# `df.apply(...)` call that was used purely for its side effect.
c = Counter(chain.from_iterable(df["labels"]))

# Exclude the 'O' label by name rather than by position: slicing off the first
# entry of `most_common()` silently drops the wrong label whenever 'O' is not
# the most frequent one. Order stays most-frequent-first.
c_pii = [(label, count) for label, count in c.most_common() if label != 'O']

# `zip(*[])` yields nothing to unpack, so fall back to empty sequences when
# no non-'O' label is present.
c_key, c_val = zip(*c_pii) if c_pii else ((), ())

# Horizontal bar chart: one bar per remaining label, bar length = count.
plt.barh(c_key, c_val)
plt.show()

### Preprocessing

In [None]:
# Hold out 5% of the rows as a test split; the rest becomes the training split.
# NOTE(review): no random_state is passed, so the split may differ between runs.
# NOTE(review): the split is taken before the cleaning cell below runs — confirm
# whether df_train / df_test are meant to contain the cleaned text.
df_train, df_test = train_test_split(
    df,
    test_size=0.05,
)
# First ten rows of the held-out split, kept under a short name.
dft = df_test.iloc[:10]

In [None]:
# Characters to scrub: the non-breaking space (\xa0) and the private-use code
# point \uf0b7 (presumably a bullet glyph left over from document export).
pattern = '(\xa0|\uf0b7)'

# Replace every occurrence inside the text with a plain space. `.str.replace`
# with regex=True does substring replacement; the previous `Series.replace`
# without regex=True only matched cells equal to the whole pattern string.
df.loc[:, 'full_text'] = df.loc[:, 'full_text'].str.replace(pattern, ' ', regex=True)

# Drop tokens that contain any of the unwanted characters. The earlier code
# referenced an undefined name `pattern1` here, which raised NameError.
# NOTE(review): `labels` is not filtered alongside `tokens`, so the two lists
# can end up with different lengths for affected rows — confirm this is intended.
df.loc[:, 'tokens'] = df.loc[:, 'tokens'].apply(
    lambda line: [tok for tok in line if not re.search(pattern, tok)]
)

# Model


In [None]:
# Pull one row (positional index 16) to inspect its tokens and labels.
sample_row = df.iloc[16]
sample_tokens, sample_labels = sample_row["tokens"], sample_row["labels"]
# The uncommented name below is what the cell displays; swap it with one of
# the commented names to look at a different object.
# sample_row
sample_tokens
# sample_labels

In [None]:
# encoding = tokenizer.encode_plus(
#     sample_text,
#     add_special_tokens=True,
#     max_length=512,
#     return_token_type_ids=False,
#     padding="max_length",
#     return_attention_mask=True,
#     return_tensors='pt',
# )
# 
# encoding["input_ids"].squeeze()[:20]
# encoding["attention_mask"].squeeze()[:20]

In [None]:
import os

from transformers import BertForTokenClassification
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import transformers
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import glob

import datetime

import warnings
warnings.filterwarnings('ignore')

In [None]:
# token_counts = []
# for _, row in df_train.iterrows():
#     token_count = len(tokenizer.encode(
#         row["full_text"],
#         max_length=2048,
#         truncation=True
#     ))
#     token_counts.append(token_count)
# sns.histplot(token_counts)
# # plt.xlim([0, 512]);

In [None]:
# Execution environment switch: determines where the pretrained BERT files and
# the fine-tuned model weights are looked up.
# platform = 'Kaggle'
platform = 'local'
model_name = 'model1_bert_base_cased.bin'

if platform == 'Kaggle':
    # NOTE(review): this points at the *uncased* BERT directory while
    # model_name and the local branch refer to the cased variant — confirm.
    bert_path = '../input/huggingface-bert/bert-base-uncased/'
    # NOTE(review): train_path / test_path are only defined on this branch.
    train_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/train/'
    test_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/test/*'
    model_path = '../input/coleridgemodels/' + model_name
elif platform == 'local':
    bert_path = 'bert-base-cased'
    model_path = '../models/bert_models/' + model_name
else:
    # Fail here with a clear message instead of a later NameError on bert_path.
    raise ValueError(f"Unknown platform {platform!r}; expected 'Kaggle' or 'local'")

# Shared settings used by the rest of the notebook.
config = {
    'MAX_LEN': 512,
    # Tokenizer is loaded from the platform-specific BERT location chosen above.
    'tokenizer': BertTokenizer.from_pretrained(bert_path),
    'batch_size': 5,
    'Epoch': 1,
    # Use the GPU when torch reports one, otherwise fall back to CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'model_name': model_name,
}

In [ ]:
# def make_shorter_sentence(sentence):
#     '''
#     This function is to split the long sentences into chunks of shorter sentences upto the 
#     maximum length of words specified in config['MAX_LEN']
#     '''
#     sent_tokenized = sent_tokenize(sentence)
# 
#     max_length = config['MAX_LEN']
#     overlap = 20
# 
#     final_sentences = []
# 
#     for tokenized_sent in sent_tokenized:
#         sent_tokenized_clean = tokenized_sent.replace('.','').rstrip()
# 
#         tok_sent = sent_tokenized_clean.split(" ")
# 
#         if len(tok_sent)<max_length:
#             final_sentences.append(sent_tokenized_clean)
#         else :
#             #             print("Making shorter sentences")
#             start = 0
#             end = len(tok_sent)
# 
#             for i in range(start, end, max_length-overlap):
#                 temp = tok_sent[i: (i + max_length)]
#                 final_sentences.append(" ".join(i for i in temp))
# 
#     return final_sentences

In [None]:
# def labelling(dataset, data_dict):
#     '''
#     This function is to iterate each of the training data and get it labelled 
#     from the form_labels() function.
#     '''
# 
#     Id_list_ = []
#     sentences_ = []
#     key_ = []
#     labels_ = []
#     un_mat = []
#     un_matched_reviews = 0
# 
#     for i, Id in tqdm(enumerate(dataset.Id), total=len(dataset.Id)):
# 
#         sentence = data_joining(data_dict[Id])
#         labels = train_df.label[train_df.Id == Id].tolist()[0].split("|")
# 
#         s, k, l, un_matched = form_labels(sentence=sentence, labels_list = labels)
# 
#         if len(s) == 0:
#             un_matched_reviews += 1
#             un_mat.append(un_matched)
#         else:
#             sentences_.append(s)
#             key_.append(k)
#             labels_.append(l)
#             Id_list_.append([Id]*len(l))
# 
#     print("Total unmatched keywords:", un_matched_reviews)
#     sentences = [item for sublist in sentences_ for item in sublist]
#     final_labels = [item for sublist in labels_ for item in sublist]
#     keywords = [item for sublist in key_ for item in sublist]
#     Id_list = [item for sublist in Id_list_ for item in sublist]
# 
#     return sentences, final_labels, keywords, Id_list

In [None]:
# Keep only the row(s) whose `document` value equals 7 and display them.
test = df.loc[df['document'].eq(7)]
test

In [None]:
# Turn the label list of the first row in `test` into a Series so each label
# is addressable by its position in the list.
# The previous version read `line.labels[0]`, but `line` is not defined at cell
# level anywhere in the visible notebook (it is only a lambda parameter), so
# the cell raised NameError.
test_s = pd.Series(test['labels'].iloc[0])
test_s

In [None]:
# Index positions in `test_s` whose label is anything other than 'O'.
test_s.loc[test_s.ne('O')].index

In [None]:
# For every row, collect the tokens whose label is not 'O'.
# The previous expression indexed `line['tokens'][0]` and `line.labels[0]`,
# i.e. only the first token and first label of each row (single strings), so
# no position beyond 0 was ever examined. Pairing the two full lists with zip
# inspects every token; each row yields a plain list, so the result is a
# Series of lists.
# NOTE(review): zip stops at the shorter list — assumes `tokens` and `labels`
# are the same length for each row; confirm after the token-cleaning step.
df.apply(
    lambda line: [tok for tok, label in zip(line['tokens'], line['labels']) if label != 'O'],
    axis=1,
)

In [None]:
# Render the DataFrame as the cell output to inspect its current state.
df