# Import packages

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install xformers
!pip install -U sentence-transformers

In [None]:
! pip install jsonlines
! pip install fast_ml --quiet
! pip install transformers
! pip install nltk
! python -m nltk.downloader all
! pip install unidecode

In [None]:
from unidecode import unidecode
import nltk
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
import string
import tensorflow as tf
import transformers
from textblob import TextBlob
import os
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModel, AutoModelForSequenceClassification,AutoTokenizer,pipeline

# Define functions

In [None]:
# Define a helper function that assists with labelling: premise/use classification (primary key)
def open_file_premise(path):
  """Read a UTF-8 text file and return its first 50 meaningful lines.

  A line is kept only when its raw length (including the trailing newline
  character, if any) is greater than 10; this drops blank lines and short
  lines such as section markers.

  Args:
    path: Path of the text file to read.

  Returns:
    A list of at most 50 strings with the trailing newline removed.
  """
  with open(path, encoding='utf-8') as f:
    raw_lines = f.readlines()
  # The length test also discards bare '\n' lines, so no separate blank-line
  # filter is needed.
  kept = [line for line in raw_lines if len(line) > 10]
  # Strip only the newline: the last line of a file may not end with one, and
  # blindly dropping the final character would truncate real text.
  return [line.rstrip('\n') for line in kept[:50]]


# Find label: premise if the score from text classification is low
def find_premise(filepath):
  """Extract the lines describing the premise from a lease text file.

  The lines returned by ``open_file_premise`` are scanned in order and the
  first line containing one of the anchor phrases decides which neighbouring
  lines are returned:

  * 'relating to' / 'related to' -> the two lines after the anchor
  * 'lr4'                        -> the three lines after the anchor
  * 'property known' / 'premise' -> the anchor line and the line after it

  Args:
    filepath: Path of the text file to inspect.

  Returns:
    A list of lines around the first anchor, or the string 'None' when no
    anchor phrase is found (same sentinel as ``find_use``).
  """
  lines = open_file_premise(filepath)
  for idx, line in enumerate(lines):
    lowered = line.lower()
    if 'relating to' in lowered or 'related to' in lowered:
      return lines[idx+1:idx+3]
    if 'lr4' in lowered:
      return lines[idx+1:idx+4]
    if 'property known' in lowered or 'premise' in lowered:
      return lines[idx:idx+2]
  # No anchor (or no lines at all): return the sentinel instead of failing on
  # an unassigned result variable.
  return 'None'

# Find label: use if the score from text classification is low
def find_use(filepath):
  """Extract the lines describing the permitted use from a lease text file.

  The lines are scanned in order and the first line containing one of the
  anchor phrases decides which neighbouring lines are returned:

  * 'permitted use' / 'use as' -> the anchor line and the two lines after it
  * 'purpose other'            -> the two lines after the anchor

  Args:
    filepath: Path of the text file to inspect.

  Returns:
    A list of lines around the first anchor, or the string 'None' when no
    anchor phrase is found or the file yields no lines.
  """
  # NOTE(review): `open_file` is not defined in the visible part of this
  # notebook; presumably it is a loader similar to `open_file_premise` that
  # returns a list of lines -- confirm it exists before running this cell.
  lines = open_file(filepath)
  for idx, line in enumerate(lines):
    lowered = line.lower()
    if 'permitted use' in lowered or 'use as' in lowered:
      return lines[idx:idx+3]
    if 'purpose other' in lowered:
      return lines[idx+1:idx+3]
  # Reached when nothing matched, including the empty-input case that
  # previously left the result variable unassigned.
  return 'None'

In [None]:
# Trimming
def remove_list_df(df):
  """Convert every cell to text and strip list/quote punctuation from it.

  Each cell is converted with ``astype(str)``. Cells whose text is at least
  15 characters long lose their first two and last two characters (the
  brackets and quotes left by stringifying a one-element list such as
  "['some text']"); shorter values such as 'yes', 'no', 'others' and 'none'
  are left untouched by that step. Afterwards the characters
  ' " “ ” : ~ [ ] are removed from every cell.

  Args:
    df: DataFrame to clean. It is not modified.

  Returns:
    A new DataFrame of cleaned strings with the same shape and labels.
  """
  # Deleting all unwanted characters with one translation table is equivalent
  # to removing them one by one, but needs a single pass over the frame.
  strip_table = str.maketrans('', '', '\'"“”:~[]')

  def clean(cell):
    if len(cell) >= 15:  # Avoid trimming 'yes', 'no', 'others', 'none' values
      cell = cell[2:len(cell)-2]
    return cell.translate(strip_table)

  df = df.astype(str)
  # DataFrame.applymap is deprecated since pandas 2.1 in favour of
  # DataFrame.map; fall back to applymap on older pandas. The check is made on
  # the class so a column that happens to be called 'map' cannot shadow it.
  elementwise = df.map if hasattr(type(df), 'map') else df.applymap
  return elementwise(clean)

In [None]:
# Post-output transformation on text data
def _lowered_matches(items, keywords):
    """Return the lower-cased entries of `items` that contain any of `keywords`."""
    lowered = (item.lower() for item in items)
    return [text for text in lowered if any(key in text for key in keywords)]


def _commencement_dates(commence_items, review_date_items):
    """Collect 'commencement date' clauses, falling back to the review-date entry at the same position."""
    found = []
    for idx, item in enumerate(commence_items):
        text = item.lower()
        # The review-date cell can be shorter than the commencement cell;
        # treat a missing counterpart as empty text instead of failing.
        paired = review_date_items[idx].lower() if idx < len(review_date_items) else ''
        if 'commencement date' in text:
            found.append(text)
        elif 'commencement date' in paired:
            found.append(paired)
    return found or ['None']


def _split_parties(parties):
    """Split party clauses into (landlords, tenants) lists of lower-cased text."""
    landlords, tenants = [], []
    for text in (party.lower() for party in parties):
        if '(1)' in text or 'landlord' in text:
            landlords.append(text)
        elif '(2)' in text or 'tenant' in text:
            tenants.append(text)
    if not tenants:
        tenants.append('CVS Limited') # CVS Leases
    if not landlords:
        landlords.append('None')
    return landlords, tenants


def _premise_snippets(premise_items):
    """Return, per keyword, the first matching entry joined with the two entries after it (duplicates removed)."""
    snippets = []
    for keyword in ('property known', 'relating to', 'premise', 'related to', 'lr4'):
        for idx, item in enumerate(premise_items):
            if keyword in item.lower():
                snippets.append(' '.join(premise_items[idx:idx+3]))
                break
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is reproducible between runs (a set gives an arbitrary order).
    return list(dict.fromkeys(snippets))


def post_output_transformation(dataframe, filepath_collection):
    """Reduce the extracted text of each lease to the clauses of interest.

    Rows are processed by position. For every row the cells of the columns
    'Rent_Review_method', 'Remain_review_date', 'Commence_date',
    'Rent_Commence_date', 'Term_period', 'Parties', 'Use' and 'Premise' are
    iterated element by element (presumably each cell is a list of strings --
    confirm against the upstream extraction step) and filtered by keyword:

    * Rent_Review_method -> 'Open Market' if any element mentions
      'open market', otherwise 'Other'.
    * Remain_review_date / Rent_Commence_date / Term_period -> lower-cased
      elements containing the relevant phrase, or ['None'].
    * Commence_date -> elements mentioning 'commencement date', falling back
      to the review-date element at the same position, or ['None'].
    * Parties -> split into new 'Landlord' and 'Tenant' columns; the
      'Parties' column is dropped. A missing tenant defaults to
      'CVS Limited', a missing landlord to 'None'.
    * Use / Premise -> matching elements, falling back to `find_use` /
      `find_premise` on the row's source file when nothing matches.

    The result is passed through `remove_list_df` before being returned.

    Args:
        dataframe: DataFrame holding the columns listed above. It is copied,
            not modified.
        filepath_collection: Sequence of source file paths, indexed by row
            position, used for the Use/Premise fallbacks.

    Returns:
        The trimmed DataFrame returned by `remove_list_df`.
    """
    df_cleaned_trimmed = dataframe.copy()
    cleaned = {
        'Rent_Review_method': [],
        'Remain_review_date': [],
        'Commence_date': [],
        'Rent_Commence_date': [],
        'Term_period': [],
        'Use': [],
        'Premise': [],
    }
    landlord_column = []
    tenant_column = []

    for pos in range(len(df_cleaned_trimmed)):
        row = {name: df_cleaned_trimmed[name].iloc[pos]
               for name in (*cleaned, 'Parties')}

        # Rent review method: one open-market clause anywhere is enough.
        is_open_market = any('open market' in clause.lower()
                             for clause in row['Rent_Review_method'])
        cleaned['Rent_Review_method'].append('Open Market' if is_open_market else 'Other')

        # Rent review date
        cleaned['Remain_review_date'].append(
            _lowered_matches(row['Remain_review_date'], ('review date',)) or ['None'])

        # Commencement date (cross-checked against the raw review-date cell)
        cleaned['Commence_date'].append(
            _commencement_dates(row['Commence_date'], row['Remain_review_date']))

        # Rent commencement date
        cleaned['Rent_Commence_date'].append(
            _lowered_matches(row['Rent_Commence_date'], ('rent commencement',)) or ['None'])

        # Term period
        cleaned['Term_period'].append(
            _lowered_matches(row['Term_period'], ('years from', 'years commencing')) or ['None'])

        # Parties
        landlords, tenants = _split_parties(row['Parties'])
        landlord_column.append(landlords)
        tenant_column.append(tenants)

        # Use: fall back to scanning the source file when nothing matched.
        uses = _lowered_matches(row['Use'], ('permitted use', 'use as'))
        if not uses:
            uses.append(find_use(filepath_collection[pos]))
        cleaned['Use'].append(uses)

        # Premise: same fallback strategy as for Use.
        premises = _premise_snippets(row['Premise'])
        if not premises:
            premises.append(find_premise(filepath_collection[pos]))
        cleaned['Premise'].append(premises)

    # Assign whole columns at once: storing a list in a single cell via .loc
    # is fragile, whereas a list of per-row values is always accepted.
    for name, values in cleaned.items():
        df_cleaned_trimmed[name] = values
    # Append two columns landlord and tenant, drop original party column
    df_cleaned_trimmed['Landlord'] = landlord_column
    df_cleaned_trimmed['Tenant'] = tenant_column
    df_cleaned_trimmed = df_cleaned_trimmed.drop('Parties', axis=1)
    # Further trimming
    return remove_list_df(df_cleaned_trimmed)