# Drive Mount

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so the project
# files stored there can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/ContractLegitt/

/content/drive/MyDrive/ContractLegitt


# Installing and Importing Libraries


In [None]:
!pip install torch transformers numpy pandas matplotlib seaborn pyarrow -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m800.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.2/124.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [None]:
!pip install -U sentence-transformers -q

In [None]:
!pip install python-docx PyPDF2 -q

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch
import json
import docx
import PyPDF2

# Tokenization

In [None]:
# Load the question-answering model and its tokenizer from the local
# './cuad-models/roberta-large/' directory (relative to the working directory).
# use_fast=False asks for the slow (pure-Python) tokenizer implementation.
model = AutoModelForQuestionAnswering.from_pretrained('./cuad-models/roberta-large/')
tokenizer = AutoTokenizer.from_pretrained('./cuad-models/roberta-large/', use_fast=False)

In [None]:
# Read the CUAD dataset. JSON text is UTF-8 by specification, so name the
# encoding explicitly instead of depending on the platform default.
with open('./cuad-data/CUADv1.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# Demo input: the question at index 2 of the first paragraph of the first
# document in the dataset.
question = data['data'][0]['paragraphs'][0]['qas'][2]['question']
# Keep only the first 100 whitespace-separated words of that paragraph's
# context, re-joined with single spaces.
paragraph = ' '.join(data['data'][0]['paragraphs'][0]['context'].split()[:100])

In [None]:
# Encode the (question, paragraph) pair as a single sequence of token ids and
# keep the matching token strings for labelling the plots below.
encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)
inputs = encoding['input_ids']
tokens = tokenizer.convert_ids_to_tokens(inputs)

# Inference only: run the forward pass without autograd bookkeeping so no
# gradient graph is built for logits that are only inspected, never trained on.
with torch.no_grad():
    outputs = model(input_ids=torch.tensor([inputs]))

# Per-token logits for "answer starts here" / "answer ends here".
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Graphical Representation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
#sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (16,8)

# Pull the scores out of PyTorch Tensors and convert them to 1D numpy arrays.
s_scores = start_scores.detach().numpy().flatten()
e_scores = end_scores.detach().numpy().flatten()

# The tokens become the x-axis labels. Labels must be unique, so the token
# index is appended to each one. convert_tokens_to_string takes a list of
# tokens, hence the single-element list.
token_labels = [
    '{:} - {:>2}'.format(tokenizer.convert_tokens_to_string([token]), i)
    for i, token in enumerate(tokens)
]

# Bar plot of the start score for tokens 80..119. `errorbar=None` replaces the
# deprecated `ci=None` and matches the end-score plot in the next cell.
ax = sns.barplot(x=token_labels[80:120], y=s_scores[80:120], errorbar=None)

# Turn the xlabels vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")

# Turn on the vertical grid to help align words to scores.
ax.grid(True)

plt.title('Start Word Scores')

plt.show()

NameError: ignored

In [None]:
# Bar plot of the end score for tokens 80..119. Reuses `token_labels` and
# `e_scores` computed in the start-score plotting cell, so that cell must have
# been run first.
ax = sns.barplot(x=token_labels[80:120], y=e_scores[80:120], errorbar=None)

# Rotate the x tick labels to vertical so the token text stays readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")

# Turn on the grid to help align words to scores.
ax.grid(True)

plt.title('End Word Scores')

plt.show()


# Making Predictions

In [None]:
# Positions of the highest start logit and the highest end logit.
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)
# Join the tokens from start_index through end_index (inclusive) back into text.
# NOTE(review): nothing here guards against end_index < start_index, which
# would produce an empty slice and therefore an empty answer.
answer = tokenizer.convert_tokens_to_string(tokens[start_index:end_index+1])
# Displayed without surrounding whitespace; `answer` itself is left unstripped.
answer.strip()

'7th day of September, 1999.'

In [None]:
from scripts.predict import run_prediction

In [None]:
def convert_pdf_to_txt(pdf_filename, txt_filename):
    """Dump the extracted text of every page of a PDF into a UTF-8 text file.

    Args:
        pdf_filename: Path of the PDF to read (opened in binary mode).
        txt_filename: Path of the text file to create or overwrite.

    The pages are written in document order, back to back, with nothing
    inserted between them.
    """
    with open(pdf_filename, 'rb') as source:
        # Parse the PDF first; the output file is only opened once that worked.
        reader = PyPDF2.PdfReader(source)

        with open(txt_filename, 'w', encoding='utf-8') as sink:
            for page in reader.pages:
                sink.write(page.extract_text())

# Input PDF and output TXT filenames, both resolved against the current
# working directory.
input_pdf = 'SampleContract.pdf'
output_txt = 'output.txt'

# Convert the PDF; this creates or overwrites `output_txt`.
convert_pdf_to_txt(input_pdf, output_txt)

print(f"PDF file '{input_pdf}' has been converted to TXT file '{output_txt}'.")


PDF file 'SampleContract.pdf' has been converted to TXT file 'output.txt'.


In [None]:
# Read the CUAD dataset (the same file the tokenization demo loads). JSON text
# is UTF-8 by specification, so name the encoding explicitly instead of
# depending on the platform default.
with open('./cuad-data/CUADv1.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# Question text of every QA entry attached to the first paragraph of the first
# document. A comprehension keeps the loop variable local, so the notebook-level
# `question` used by the single-example demo is no longer overwritten here.
questions = [qa['question'] for qa in data['data'][0]['paragraphs'][0]['qas']]
# contract = data['data'][0]['paragraphs'][0]['context']

In [None]:
# with open('output.txt', 'w') as f:
#     f.write(' '.join(contract.split()))

In [None]:
def convert_docx_to_txt(docx_filename, txt_filename):
    """Write the paragraphs of a DOCX document to a UTF-8 text file.

    Args:
        docx_filename: Path of the DOCX document to load.
        txt_filename: Path of the text file to create or overwrite.

    Each entry of the document's `paragraphs` becomes one line of output,
    terminated by a newline.
    """
    document = docx.Document(docx_filename)

    with open(txt_filename, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{para.text}\n" for para in document.paragraphs)

# Input DOCX and output TXT filenames. `output_txt` is the same path the PDF
# conversion writes to, so enabling the call below would overwrite that file.
input_docx = 'input.docx'
output_txt = 'output.txt'

# DOCX conversion is currently disabled; uncomment to use a DOCX source.
# convert_docx_to_txt(input_docx, output_txt)

# print(f"DOCX file '{input_docx}' has been converted to TXT file '{output_txt}'.")


In [None]:
def read_text_file(filename):
    """Return the entire contents of a UTF-8 text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file contents.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Text file to read: the file produced by the PDF/DOCX conversion above.
filename = 'output.txt'

# Load the whole contract text into a single string.
file_content = read_text_file(filename)

# Display the raw string (escape sequences included) as the cell output.
file_content


' \n \n \n \nINTERIOR FITOUT AND MEP WORKS  \nFOR  \nPURE HEALTH OFFICE AT 41ST FLOOR,  \nVISION TOWER, DUBAI , UAE  \n \n \nCONTRACT  DOCUMENTS  \n \n \nVOLUME 1 \n \nTENDER AND CONTRACT REQUIREMENTS   \n \n \n \n \n \n \n \nEmployer:  \nPURE HEALTH MEDICAL SUPPLIES L.L.C  \n3401 , Vision Tower, Business Bay  \nP.O. Box 283572, Dubai, UAE  Contractor:  \nCONCEPT INTERIORS  \nP.O. Box 66020  \nSharjah , UAE  \nTel: 0 6-536 9842  \n \n \nMarch  2022 \n    \nPURE HEALTH OFFICE AT  41ST FLOOR , VISION TOWER, DUBAI - UAE  \n \nINTERIOR FITOUT AND MEP WORKS PACKAGE  \n \n \nVOLUME 1 \n \nTENDER AND CONTRACT REQUIREMENTS  \n \n \nCONTENTS  \n \n \n                                                                                                                                          PAGE  \n \nSECTION 1  INSTRUCTIONS TO TENDERERS  S1/1–S1/7 \n \nSECTION 2  FORM OF T ENDER & APPENDIX  S2/1–S2/5  \n \nSECTION 3  CONTRACT AGREEMENT  & APPENDIX  S3/1–S3/5  \n \nSECTION 4  CONDITIONS OF CONTRACT 

In [None]:
# Ask every CUAD question against the contract text using the model stored in
# 'cuad-models/roberta-large/'. The result is inspected below: an OrderedDict
# keyed by the question index as a string ('0', '1', ...).
predictions = run_prediction(questions, file_content, 'cuad-models/roberta-large/')

convert squad examples to features: 100%|██████████| 41/41 [05:10<00:00,  7.56s/it]
add example index and unique id: 100%|██████████| 41/41 [00:00<00:00, 7188.33it/s]


In [None]:
# Write one "Question / Answer" entry per prediction. The answers contain
# non-ASCII characters (curly quotes, ellipses), so the encoding is fixed to
# UTF-8 like the other text files this notebook writes.
with open('predictions.txt', 'w', encoding='utf-8') as f:
    # Each key is the question's index as a string.
    for i, (key, answer) in enumerate(predictions.items()):
        f.write(f"Question {i+1}: {questions[int(key)]}\nAnswer: {answer}\n\n")

In [None]:
# Plain-dict copy of the predictions (question index string -> answer text).
pred = dict(predictions)

In [None]:
# Display the predicted answers.
pred

{'0': 'CONTRACT AGREEMENT',
 '1': 'PURE HEALTH MEDICAL SUPPLIES LLC 3401 Visio n Tower , Business Bay, P.O. Box 283572 , Dubai , UAE (hereinafter called the “Employer”)',
 '2': '13th January 2022',
 '3': '13th January 2022',
 '4': '',
 '5': 'This Tender Bond is valid for 90 calendar days from ……………….. {date of tender submission} to ……………… and renewable before expiry for a further 30 calendar days if requested in writing by the Employer .',
 '6': 'This Tender Bond is valid for 90 calendar days from ……………….. {date of tender submission} to ……………… and renewable before expiry for a further 30 calendar days if requested in writing by the Employer .',
 '7': 'The Contract shall be governed by the laws of the Emirate of Dubai and the federal laws of the United Arab Emirates.',
 '8': 'The Employer shall not bi nd himself to accept the lowest or any Tender and shall not state a reason for the acceptance or rejection of a Tender.',
 '9': '',
 '10': 'The Contractor shall not have exclusive use of t

# Rating and Ranking

In [None]:
# Inspect the container type returned by run_prediction and list its keys.
print(type(predictions))
predictions.keys()

<class 'collections.OrderedDict'>


odict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40'])

In [None]:
# Plain-dict copy of the predictions used by the rating step (same content as
# `pred` created earlier).
preDict = dict(predictions)

In [None]:
# Load the clause -> keywords mapping. JSON text is UTF-8 by specification, so
# name the encoding explicitly instead of depending on the platform default.
with open('label.json', encoding='utf-8') as json_file:
    labels = json.load(json_file)

In [None]:
# Predicted answer texts, in the order they appear in preDict.
context = [answer for answer in preDict.values()]

In [None]:
# context[2]

In [None]:
# labels

In [None]:
# type(labels.values())

In [None]:
# Clause names: the keys of the label mapping, in file order.
clause = [*labels]
# clause

In [None]:
# Keyword entries: the values of the label mapping, aligned with `clause`.
keywords = [*labels.values()]
# keywords

In [None]:
# Sentence-embedding model used to compare each extracted answer with the
# keywords of its clause.
from sentence_transformers import SentenceTransformer, util
modelSentence = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def cosineExtract(threshold=0.3):
  """Score every clause by how similar its extracted answer is to its keywords.

  Reads the notebook-level `clause`, `keywords` and `context` lists (aligned by
  position) and the `modelSentence` embedding model.

  Args:
    threshold: Only cosine similarities strictly above this value contribute to
      a clause's average. Defaults to 0.3.

  Returns:
    A pair ``(cos_dict, cos_raw)``:
      * cos_dict maps clause name -> mean of that clause's similarities above
        `threshold`, or 0.0 when none of them exceeds it.
      * cos_raw maps clause name -> the full similarity matrix as nested lists.
  """
  cos_dict = {}
  cos_raw = {}
  # zip covers however many clauses are present instead of a hard-coded count.
  for name, clause_keywords, answer in zip(clause, keywords, context):
    answer_emb = modelSentence.encode(answer)
    keyword_emb = modelSentence.encode(clause_keywords)
    cosvallist = util.cos_sim(answer_emb, keyword_emb).tolist()
    # Start from an empty list for every clause: carrying matches over from
    # earlier clauses turns each score into a running mean over all clauses
    # seen so far rather than a score for this clause.
    matched = [score for score in cosvallist[0] if score > threshold]
    # No similarity above the threshold means "no evidence", not a crash.
    cos_dict[name] = sum(matched) / len(matched) if matched else 0.0
    cos_raw[name] = cosvallist
  return cos_dict, cos_raw
cos_value_dict,cos_raw = cosineExtract()


In [None]:
# cos_raw

In [None]:
# cos_value_dict

{'Document Name': 0.580495278040568,
 'Parties': 0.580495278040568,
 'Agreement Date': 0.5058277726173401,
 'Effective Date': 0.4733454372201647,
 'Expiration Date': 0.4733454372201647,
 'Renewal Term': 0.44639161560270524,
 'Notice Period To Terminate Renewal': 0.44639161560270524,
 'Governing Law': 0.4160101115703583,
 'Most Favored Nation': 0.4160101115703583,
 'Non-Compete': 0.40046443144480387,
 'Exclusivity': 0.3959304094314575,
 'No-Solicit Of Customers': 0.3959304094314575,
 'Competitive Restriction Exception': 0.3959304094314575,
 'No-Solicit Of Employees': 0.3959304094314575,
 'Non-Disparagement': 0.3959304094314575,
 'Termination For Convenience': 0.3959304094314575,
 'Rofr/Rofo/Rofn': 0.3959304094314575,
 'Change Of Control': 0.39590704791686115,
 'Anti-Assignment': 0.39565494813417135,
 'Revenue/Profit Sharing': 0.3912798762321472,
 'Price Restriction': 0.3905392418736997,
 'Minimum Commitment': 0.3905392418736997,
 'Volume Restriction': 0.3905392418736997,
 'Ip Ownership 

In [None]:
# Clause names grouped by risk class; the scoring loop weights each class
# differently. Names must match the clause keys exactly, otherwise the clause
# is silently ignored.
# NOTE(review): 'Parties'-style keys that appear in no list (e.g.
# 'Price Restriction', 'Cap-on Liability') never contribute to the score —
# confirm that is intended.

# Low-risk clauses.
lrc = ['Document Name',
  'Parties',
  'Agreement Date',
  'Effective Date',
  'Expiration Date',
  'Renewal Term',
  'Notice Period To Terminate Renewal',
  'Non-Disparagement',  # was 'No-Disparagement', which matches no clause key
  'Warranty Duration',
  'Insurance',
  'Covenant Not To Sue',
  'Third Party Beneficiary']
# Medium-risk clauses.
mrc =  ['Termination For Convenience',
  'Post-Termination Services',
  'Revenue/Profit Sharing',
  'Minimum Commitment',
  'Volume Restriction',
  'Ip Ownership Assignment',
  'Joint Ip Ownership',
  'License Grant',
  'Non-Transferable License',
  'Source Code Escrow']
# High-risk clauses.
hrc =  ['Uncapped Liability',
  'Liquidated Damages',
  'Non-Compete',
  'Exclusivity',
  'No-Solicit Of Customers',
  'No-Solicit Of Employees',
  'Irrevocable Or Perpetual License',
  'Anti-Assignment',
  'Change Of Control',
  'Audit Rights',
  'Most Favored Nation',
  'Competitive Restriction Exception',
  'Rofr/Rofo/Rofn',
  'Affiliate License-Licensor',
  'Affiliate License-Licensee',
  'Governing Law']

In [None]:
# Weighted risk score: every clause whose similarity score clears the threshold
# of its risk class adds (class weight * score) to the total.
riskval = 0
for k,v in cos_value_dict.items():
  if k in lrc and v>0.3:
    riskval = riskval + 0.3*v
  if k in mrc and v>0.4:
    riskval = riskval + 0.6*v
  if k in hrc and v>0.5:
    riskval = riskval + 0.9*v
# Bring totals above the scale back onto 0-10. The two cases are exclusive and
# the second is an interval test: a membership test against range(10, 11) only
# matches exactly 10, which let the >11 cap fall through to 9.5 and left
# totals between 10 and 11 off the scale.
# NOTE(review): mapping (>11 -> 10, 10..11 -> 9.5) inferred from the original
# two checks — confirm the intended values.
if riskval>11:
  riskval = 10
elif 10 <= riskval <= 11:
  riskval = 9.5

In [None]:
# Final risk score rounded to two decimals for reporting.
riskval_fin = round(riskval,2)
def risk_factor(risk):
    """Map a numeric risk score on the 0-10 scale to a category label.

    Bands (upper bound inclusive, so a boundary value belongs to the lower band):
        0..2  -> "NoRisk"
        2..4  -> "LowRisk"
        4..6  -> "AvgRisk"
        6..8  -> "HighRisk"
        8..10 -> "ExtremeRisk"

    Scores from 0 up to 1 are valid low scores (the total starts at 0 and grows
    with every matching clause), so they are reported as "NoRisk".

    Args:
        risk: The numeric risk score.

    Returns:
        The band label, or "InvalidRisk" for anything outside 0..10
        (including NaN).
    """
    if risk < 0:
        return "InvalidRisk"
    if risk <= 2:
        return "NoRisk"
    if risk <= 4:
        return "LowRisk"
    if risk <= 6:
        return "AvgRisk"
    if risk <= 8:
        return "HighRisk"
    if risk <= 10:
        return "ExtremeRisk"
    # Above the scale, or a value (such as NaN) for which every comparison fails.
    return "InvalidRisk"
# Classify the rounded score and report the label together with the number.
risktype = risk_factor(riskval_fin)
print(risktype, riskval_fin)

NoRisk 1.55


# Excel Export

In [None]:
# Open the Excel template whose sheets are filled in below; the result is
# saved under a different name, so the template file itself is not overwritten.
import openpyxl
workbook = openpyxl.load_workbook('CAD reference.xlsx')

In [None]:
# List the sheets available in the template.
workbook.sheetnames

['Cover',
 'Index',
 'Scope',
 'Salient Features',
 'Access To Site',
 'Key Dates',
 'Important Submittals',
 'Submittals Pricing Docs',
 'Submission Format',
 'Insurance',
 'Payment',
 'Price Adjustment',
 'Variation',
 'Taxation',
 'Notices',
 'EOT,COST Clause',
 'Res n Duties-Contractor',
 'Res n Duties-EmployeeEngineer',
 'Default of Contractor Employee',
 'General',
 'Employer Claim',
 'Claim Proced',
 'Dispute Settlement']

In [None]:
# One handle per worksheet that receives extracted clauses, looked up by the
# sheet's exact title in the template workbook.
KeyDate = workbook['Key Dates']
ImpSubmittals = workbook['Important Submittals']
Insurance = workbook['Insurance']
Payment = workbook['Payment']
PriceAdj = workbook['Price Adjustment']
Variation = workbook['Variation']
Taxation = workbook['Taxation']
Notices = workbook['Notices']
RnDofContractor = workbook['Res n Duties-Contractor']
RnDofEE = workbook['Res n Duties-EmployeeEngineer']
DefCE = workbook['Default of Contractor Employee']
General = workbook['General']
EmployerClaim = workbook['Employer Claim']
ProcOfClaims = workbook['Claim Proced']
DisputeSettle = workbook['Dispute Settlement']

In [None]:
# Header row of the 'Key Dates' sheet: the values of the cells in row 1.
columnname = [header_cell.value for header_cell in KeyDate[1]]
columnname

['Rating',
 'Key Dates',
 'Description',
 'Number of Days from Commencement',
 'Completion Date',
 'Delay Damages for non achievement of key Dates',
 'Remarks']

In [None]:
# Load the category -> clause-names mapping. JSON text is UTF-8 by
# specification, so name the encoding explicitly instead of depending on the
# platform default.
with open('ClauseClass.json', encoding='utf-8') as json_file:
    ClauseClass = json.load(json_file)

In [None]:
# Display the (category, clause names) pairs loaded from ClauseClass.json.
ClauseClass.items()

dict_items([('Key_Dates', ['Agreement Date', 'Effective Date', 'Expiration Date', 'Renewal Term', 'Notice Period To Terminate Renewal']), ('Important_Submittals', ['Document Name']), ('Insurance_Related', ['Insurance']), ('Payment', ['Revenue/Profit Sharing', 'Minimum Commitment']), ('Price_Adjustment', ['Volume Restriction']), ('Variation', ['Ip Ownership Assignment', 'Joint Ip Ownership', 'License Grant', 'Non-Transferable License', 'Source Code Escrow']), ('Taxation', ['Uncapped Liability']), ('Notices', ['No-Disparagement', 'Covenant Not To Sue', 'Termination For Convenience', 'Post-Termination Services']), ('Responsibilities_and_Duties_of_Contractor', ['No-Solicit Of Customers', 'No-Solicit Of Employees']), ('Responsibilities_and_Duties_of_Employer_and_Engineer', ['Warrant Duration']), ('Default_of_Contractor_and_Employers', ['Liquidated Damages']), ("Employer's_Claims", ['Audit Rights']), ('Procedures_for_Claims', ['Dispute Settlement']), ('Dispute_Settlement', ['Most Favored Nat

In [None]:
# m: clause names, in cos_raw insertion order.
m = list(cos_raw.keys())
# n: predicted answer texts, in pred insertion order.
n = list(pred.values())

In [None]:
# Clause name -> extracted answer text, paired purely by position.
# NOTE(review): assumes cos_raw and pred list the clauses in the same order — confirm.
CnD = dict(zip(m, n))

In [None]:
# Clause names belonging to each worksheet category. A name only routes a
# clause to its sheet if it equals the clause key exactly, so the spellings
# below follow the keys of CnD.
# NOTE(review): "Dispute Settlement" is not among the clause keys, so
# Procedures_for_Claims currently matches nothing — confirm the intended clause.
Key_Dates = ["Agreement Date", "Effective Date", "Expiration Date", "Renewal Term" ,"Notice Period To Terminate Renewal"]
Important_Submittals = ["Document Name"]
Insurance_Related = ["Insurance"]
PaymentClause = ["Revenue/Profit Sharing", "Minimum Commitment"]
Price_Adjustment = ["Volume Restriction"]
VariationClause = ["Ip Ownership Assignment","Joint Ip Ownership","License Grant","Non-Transferable License","Source Code Escrow"]
TaxationClause = ["Uncapped Liability"]
# "Non-Disparagement" (not "No-Disparagement") is the actual clause key.
NoticeClause = ["Non-Disparagement","Covenant Not To Sue","Termination For Convenience","Post-Termination Services"]
Responsibilities_and_Duties_of_Contractor = ["No-Solicit Of Customers","No-Solicit Of Employees"]
# "Warranty Duration" (not "Warrant Duration") is the actual clause key.
Responsibilities_and_Duties_of_Employer_and_Engineer = ["Warranty Duration"]
Default_of_Contractor_and_Employers = ["Liquidated Damages"]
Employer_Claims = ["Audit Rights"]
Procedures_for_Claims = ["Dispute Settlement"]
Dispute_Settlement = ["Most Favored Nation","Competitive Restriction Exception","Rofr/Rofo/Rofn"]
GeneralClause = ["Third Party Beneficiary","Governing Law"]

In [None]:
# Display the clause names available for export.
CnD.keys()

dict_keys(['Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date', 'Renewal Term', 'Notice Period To Terminate Renewal', 'Governing Law', 'Most Favored Nation', 'Non-Compete', 'Exclusivity', 'No-Solicit Of Customers', 'Competitive Restriction Exception', 'No-Solicit Of Employees', 'Non-Disparagement', 'Termination For Convenience', 'Rofr/Rofo/Rofn', 'Change Of Control', 'Anti-Assignment', 'Revenue/Profit Sharing', 'Price Restriction', 'Minimum Commitment', 'Volume Restriction', 'Ip Ownership Assignment', 'Joint Ip Ownership', 'License Grant', 'Non-Transferable License', 'Affiliate License-Licensor', 'Affiliate License-Licensee', 'Unlimited Or All-you-can-eat-License', 'Irrevocable Or Perpetual License', 'Source Code Escrow', 'Post-Termination Services', 'Audit Rights', 'Uncapped Liability', 'Cap-on Liability', 'Liquidated Damages', 'Warranty Duration', 'Insurance', 'Covenant Not To Sue', 'Third Party Beneficiary'])

In [None]:
# Position -> clause name, following CnD's key order.
index_dict = dict(enumerate(CnD))

In [None]:
# Append one row per extracted clause to the worksheet of its category.
# Every row starts with the clause's similarity score as a percentage
# (100 * cos_value_dict[clause]); the row builders below only differ in where
# the clause name and the extracted text are placed.
# NOTE: rows are appended to the in-memory workbook, so running this cell twice
# adds every row twice.

def _default_row(rating, name, text):
  """Rating, clause name, extracted text — the layout most sheets use."""
  return [rating, name, text]

def _key_date_row(rating, name, text):
  """Key-dates layout: extracted text in the second cell, clause name in the third."""
  return [rating, text, name]

def _notice_row(rating, name, text):
  """Notices layout: two empty cells between clause name and extracted text."""
  return [rating, name, '', '', text]

def _employer_engineer_row(rating, name, text):
  """Employer/engineer layout: one empty cell between clause name and extracted text."""
  return [rating, name, '', text]

# (clause names, target worksheet, row builder), checked in this order.
SHEET_ROUTES = [
  (Key_Dates, KeyDate, _key_date_row),
  (Important_Submittals, ImpSubmittals, _default_row),
  (Insurance_Related, Insurance, _default_row),
  (PaymentClause, Payment, _default_row),
  (Price_Adjustment, PriceAdj, _default_row),
  (VariationClause, Variation, _default_row),
  (TaxationClause, Taxation, _default_row),
  (NoticeClause, Notices, _notice_row),
  (Responsibilities_and_Duties_of_Contractor, RnDofContractor, _default_row),
  (Responsibilities_and_Duties_of_Employer_and_Engineer, RnDofEE, _employer_engineer_row),
  (Default_of_Contractor_and_Employers, DefCE, _default_row),
  (Employer_Claims, EmployerClaim, _default_row),
  (Procedures_for_Claims, ProcOfClaims, _default_row),
  (Dispute_Settlement, DisputeSettle, _default_row),
  (GeneralClause, General, _default_row),
]

for k, v in CnD.items():
  for clause_names, sheet, make_row in SHEET_ROUTES:
    if k in clause_names:
      sheet.append(make_row(100*cos_value_dict[k], k, v))
      # First matching category wins; clauses in no category are skipped.
      break

In [None]:
# Write the filled-in workbook to 'final.xlsx' in the working directory.
workbook.save('final.xlsx')