# Install packages

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install xformers
!pip install -U sentence-transformers

In [None]:
! pip install jsonlines
! pip install fast_ml --quiet
! pip install transformers
! pip install nltk
! python -m nltk.downloader all
! pip install unidecode

In [None]:
from unidecode import unidecode
import nltk
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
import string
import tensorflow as tf
import transformers
from textblob import TextBlob
import os
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModel, AutoModelForSequenceClassification,AutoTokenizer,pipeline

# Data Loading and transformation

In [None]:
# Load data
# Read the full dataset from a parquet file into a DataFrame.
df_total = pd.read_parquet('/content/drive/MyDrive/Dissertation/Data/df.parquet')
# One-shot sample
# Draw one random row per 'label_cat' group. No random_state is passed to
# sample(), so the selected rows can differ from run to run.
df_label_unique_sample = df_total.groupby('label_cat', group_keys=False).apply(lambda df: df.sample(1))
# Create label mapping
# Build lookup tables in both directions between the 'label' and 'label_cat'
# columns of the sampled rows.
# NOTE(review): `id` shadows the Python builtin of the same name; it is left
# unchanged because other cells may refer to it - confirm before renaming.
label = list(df_label_unique_sample['label'])
id = list(df_label_unique_sample['label_cat'])
label_to_id = dict(zip(label,id))
id_to_label = dict(zip(id,label))

# Define functions

In [None]:
# Load the sequence-classification checkpoint stored under the given path.
model_neo = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Dissertation/GPT_data/GPT_Neo_model/augmented_gptneo_100")
# NOTE(review): the tokenizer is "distilbert-base-uncased" while the checkpoint
# path refers to GPT-Neo - confirm this is the tokenizer the checkpoint was
# trained with.
tokenizer_bert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Create a pipeline
# Text-classification pipeline combining the model and tokenizer above; it is
# the kind of callable that modelling() expects as its `classifier` argument.
classifier_neo = pipeline("text-classification", model=model_neo,tokenizer=tokenizer_bert)

In [None]:
# Define a function that reads a text file and returns its cleaned lines
def open_file(path):
  """Read a text file and return its content lines without line endings.

  Lines that are blank, or whose raw length (including the trailing newline)
  is 15 characters or fewer, are dropped; such short lines are treated as
  section breaks or headings rather than content.

  Args:
    path: Path of the text file to read.

  Returns:
    A list of the remaining lines, each with its trailing newline removed.
  """
  with open(path) as f:
    # The length filter already discards newline-only lines, so one pass is
    # enough. Only the newline itself is stripped: slicing off the last
    # character unconditionally would truncate a final line that does not
    # end with a newline.
    return [line.rstrip('\n') for line in f if len(line) > 15]

In [None]:
# Preprocess the dataset for word embedding
def pre_process(text):
    """Lower-case *text* and drop English stop words and punctuation tokens.

    Args:
        text: Raw input string.

    Returns:
        The surviving word tokens joined by single spaces.
    """
    text = text.lower()
    # NLTK's English stop words plus every character in string.punctuation,
    # held in a set so each token's membership test is O(1).
    stopset = set(stopwords.words('english')) | set(string.punctuation)
    # word_tokenize splits the text into word tokens; keep the ones that are
    # neither a stop word nor a punctuation character.
    return " ".join(tok for tok in word_tokenize(text) if tok not in stopset)

# Lemmatization and spell check
# These two imports repeat names that the notebook's import cell already
# brings in from nltk.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Single module-level lemmatizer instance, used by word_token() below.
lemmatizer = WordNetLemmatizer()

# Word tokenisation & lemmatization
def word_token(sentence):
  """Tokenize *sentence* and lemmatize every token.

  Returns the lemmatized tokens joined by single spaces.
  """
  lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(sentence)]
  return ' '.join(lemmas)

#Pass the processed text into the pipeline
def modelling(lines,classifier):
  """Clean each line, classify it, and collect the annotated predictions.

  Args:
    lines: List of raw text lines.
    classifier: Callable that takes a string and returns a sequence whose
      first element is a dict-like prediction.

  Returns:
    A list with one prediction per line, each extended with a 'Text' entry
    holding the cleaned text that was classified.
  """
  predictions = []
  for raw_line in lines:
    # Stop-word/punctuation removal first, then lemmatization.
    cleaned = word_token(pre_process(raw_line))
    prediction = classifier(cleaned)[0]
    prediction['Text'] = cleaned
    predictions.append(prediction)
  return predictions

In [None]:
# Find consecutive lines of text with the same labelling
def consecutive_check(file):
  """Flag each adjacent pair of records that carry the same label.

  Args:
    file: List of prediction dicts. The label is read from the 'label' key;
      a dict without that key falls back to its first value.

  Returns:
    A list of len(file) - 1 flags (empty for fewer than two records), where
    flag i is 1 if records i and i + 1 share a label and 0 otherwise.
  """
  def label_of(record):
    # Read the label by key so the result does not depend on the order in
    # which the dict's keys were inserted.
    if 'label' in record:
      return record['label']
    return list(record.values())[0]

  return [int(label_of(a) == label_of(b)) for a, b in zip(file, file[1:])]

# Find index of those consecutive elements
def consecutive_check_index(consecutive_check):
  """Return the positions at which *consecutive_check* holds a 1."""
  return [pos for pos, flag in enumerate(consecutive_check) if flag == 1]

# Find the positions at which a run of consecutive indices stops and a new one starts
def non_consecutive_check_index(consecutive_check_index):
  """Locate the breaks between runs of consecutive integers.

  Returns every position p (p >= 1) for which element p is not exactly one
  greater than element p - 1.
  """
  neighbours = zip(consecutive_check_index, consecutive_check_index[1:])
  return [
    pos
    for pos, (prev, curr) in enumerate(neighbours, start=1)
    if curr - prev != 1
  ]


# Insert starting index
def segment(non_consecutive_check_index1,consecutive_check_index):
  """Cut *consecutive_check_index* into runs at the given boundaries.

  Args:
    non_consecutive_check_index1: Slice boundaries into
      consecutive_check_index; each adjacent pair delimits one run.
    consecutive_check_index: List of indices to be sliced.

  Returns:
    One list per adjacent boundary pair: the slice between the two
    boundaries followed by max + 1 and max + 2 of that slice.
  """
  runs = []
  boundaries = non_consecutive_check_index1
  for start, stop in zip(boundaries, boundaries[1:]):
    run = consecutive_check_index[start:stop]
    top = max(run)
    # Pad the run with the two indices that follow its largest member.
    run.extend((top + 1, top + 2))
    runs.append(run)
  return runs

def text_concat(segment_consecutive_check,pre_processed_files):
  """Append one merged record per segment to *pre_processed_files*.

  For each segment, the records in the half-open range
  [min(segment), max(segment)) are merged: their 'Text' values are joined
  with spaces, the 'label' is taken from the first record of the range and
  the 'score' is the highest score found in the range. The list is modified
  in place and nothing is returned.
  """
  for indices in segment_consecutive_check:
    first, last = min(indices), max(indices)
    members = pre_processed_files[first:last]
    merged = {
      'Text': ' '.join(member.get('Text') for member in members),
      'label': pre_processed_files[first]['label'],
      'score': max(member.get('score') for member in members),
    }
    pre_processed_files.append(merged)

def removal(segment_consecutive_check,pre_processed_files):
  """Delete the items covered by each segment from *pre_processed_files*.

  For every segment the half-open range [min(segment), max(segment)) is
  removed in place. Segments are processed from the highest start index to
  the lowest so that a deletion never shifts the positions of the ranges
  that are still to be removed.

  Args:
    segment_consecutive_check: Iterable of non-empty index lists; the ranges
      they describe are expected not to overlap.
    pre_processed_files: List to delete from; modified in place.

  Returns:
    None.
  """
  for indices in sorted(segment_consecutive_check, key=min, reverse=True):
    del pre_processed_files[min(indices):max(indices)]

def concatenate_original(lines,segment_consecutive_check):
  """Merge each segment's lines into one string and drop the originals.

  For every segment, the lines in [min(segment), max(segment)) are joined
  with spaces and the result is appended to *lines*; removal() is then
  called with the same segments to delete those ranges. *lines* is modified
  in place and also returned.
  """
  for indices in segment_consecutive_check:
    chunk = lines[min(indices):max(indices)]
    lines.append(' '.join(map(str, chunk)))
  removal(segment_consecutive_check, lines)
  return lines

In [None]:
# Return the best result for each label and find the unprocessed text in the original file. Everything is stored in one pandas DataFrame
def result_to_table(procssed_list,orig_list):
  # Maximum score of each label
  label = [d.get('label') for d in procssed_list]
  text = [d.get('Text') for d in procssed_list]
  score = [d.get('score') for d in procssed_list]
  data = {'Text': text, 'Label':label , 'score': score}
  # Get the largest score of sample with the same label
  sample = pd.DataFrame(data)
  # Add an index column
  sample['Index_ref'] = sample.index
  max_table = sample[sample.groupby('Label')['score'].transform(max) == sample['score']]
  max_table.sort_values(by=['Index_ref'],inplace=True)
  # Get the +-2 sentences for each label
  index_list = sorted(list(max_table['Index_ref']))
  #return index_list
  text_all= []
  label_all = []
  for i in index_list:
    text_range = orig_list[i-3:i+3]
    text_all.append(text_range)
  max_table['Orig_Text'] = text_all
  max_table['Label_cat'] = max_table['Label'].map(label_to_id)
  max_table.drop(columns = ['Index_ref','Text'],axis=1,inplace=True)
  # Remove Duplicate label
  max_table_final = max_table.drop_duplicates(subset=['Label','Label_cat'], keep='first')