# **PAPERPARSER APPLICATION**

Mount Google Drive and set the paperParser folder as the working directory.

In [None]:
# Mount Google Drive and make the paperParser program folder the working directory.
from google.colab import drive
import os

# The Drive root is derived from the absolute mount point instead of
# os.getcwd(): the relative form only worked while the working directory was
# still /content, so re-running this cell after any chdir failed.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)
os.chdir(os.path.join(DRIVE_MOUNT_POINT, 'MyDrive'))

paperParserDir = os.path.join(os.getcwd(), "paperParserProgram")
print('paperParser directory:')
print(paperParserDir)
os.chdir(paperParserDir)

# ***PART 1: PREPARE THE DATA***

In [None]:
# Revision tag of the scraping run to process; the run folder name is
# "scraping_" followed by this tag.
rev = 'rev9'
paperParserVersion = f'scraping_{rev}'

In [None]:
scrapyDir = os.path.join(paperParserDir,paperParserVersion)
print('scrapy directory:')
print(scrapyDir)
os.chdir(scrapyDir)
!ls
SciLitDir = os.path.join(scrapyDir, "SciLit")

In [None]:
# Seed the SciLit folder with a copy of the abstracts folder.
import shutil
os.chdir(scrapyDir)
source_dir = os.path.join(scrapyDir, 'abstracts')
destination_dir = SciLitDir
# shutil.copytree raises FileExistsError when the destination already exists,
# so re-running this cell used to crash. Copy only when SciLit is missing and
# leave an existing folder (and anything already appended to it) untouched.
if os.path.isdir(destination_dir):
    print('SciLit directory already exists, copy skipped:')
    print(destination_dir)
else:
    shutil.copytree(source_dir, destination_dir)

In [None]:
import glob

# Collect every .txt file of the main-text folder. The pattern is relative,
# so it is resolved against the current working directory.
folder_path = 'mainTexts'
txt_files = glob.glob(f"{folder_path}/*.txt")

Grant access to the Hugging Face repository

In [None]:
import os
import google.colab
from google.colab import userdata

# Copy the Colab secret 'HF_TOKEN' into the HUGGINGFACEHUB_API_TOKEN
# environment variable.
os.environ.update({"HUGGINGFACEHUB_API_TOKEN": userdata.get('HF_TOKEN')})

In [None]:
LLMsDir = os.path.join(scrapyDir,'LLMs')
print('LLMs directory:')
print(LLMsDir)
os.chdir(LLMsDir)
!ls

In [None]:
!pip install fpdf==1.7.2
!python printFiles.py
from printFiles import (printPDF, printTXT)
!python getTexts.py
from getTexts import (read_content, get_mod_time, merge_per_folder, test_getTexts, mergeSummaries)

In [None]:
!pip install -qU bitsandbytes triton
import bitsandbytes, triton

In [None]:
# Run loadLLM.py as a standalone script, then import its loadLLM function.
!python loadLLM.py
from loadLLM import loadLLM
# Same pattern for nlp_sciLit.py and its nlp_sciLit function.
!python nlp_sciLit.py
from nlp_sciLit import nlp_sciLit

In [None]:
# Load the model and its tokenizer through the project's loadLLM helper.
SCILIT_MODEL_ID = "Uni-SMART/SciLitLLM1.5-14B"
model, tokenizer = loadLLM(llm=SCILIT_MODEL_ID)

In [None]:
# Summarise every main text with the LLM and append the summary to the
# file of the same name in the SciLit folder.
os.chdir(scrapyDir)  # the entries of txt_files and the 'SciLit' path below are relative paths
import re  # no longer used in this cell; kept in case later code expects `re` in the namespace
for file in txt_files:
  print(file)
  text = read_content(file)
  response = nlp_sciLit(text, model, tokenizer)
  # os.path.basename replaces the hand-written regex that stripped the folder
  # prefix; it also works when a folder name contains non-word characters.
  output_filename = os.path.basename(file)
  SciLit_file = os.path.join('SciLit', output_filename)
  # Append mode: the summary is added after the file's existing content.
  # The with-block closes the file, so no explicit close() is needed.
  with open(SciLit_file, "a") as filehandler:
    filehandler.write("\n\nMain text summary\n\n")
    filehandler.write(response)


Move everything to a new 'RESULTS' folder

In [None]:
os.mkdir(os.path.join(paperParserDir,'RESULTS'))
os.chdir(paperParserDir)
!mv -i "${PWD}/scraping_rev9/SciLit" "${PWD}/RESULTS"

In [None]:
!mv -i "${PWD}/scraping_rev9/abstracts" "${PWD}/RESULTS"
!mv -i "${PWD}/scraping_rev9/articlesSummaries" "${PWD}/RESULTS"
!mv -i "${PWD}/scraping_rev9/mainTexts" "${PWD}/RESULTS"

# ***PART 2: TEXT GENERATION BY ARTIFICIAL INTELLIGENCE ALGORITHMS***

Merge the summaries to create the input for the generative artificial intelligence algorithm.

In [None]:
# Switch to the LLMs folder; the scripts below are addressed by bare file name.
os.chdir(LLMsDir)

# Run each script as a standalone program, then import the function it provides.
!python connectOpenAI.py
from connectOpenAI import connectOpenAI
!python generateFramework.py
from generateFramework import generateFramework

In [None]:
import os
from google.colab import userdata

# Copy the OpenAI credentials from the Colab secrets into environment
# variables of the same name.
for _secret in ("OPENAI_API_KEY", "Organization_ID", "PROJECT_ID"):
    os.environ[_secret] = userdata.get(_secret)

# Build the client from the values that were just stored.
client = connectOpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    organization=os.environ["Organization_ID"],
    project=os.environ["PROJECT_ID"],
)
print("openAI client:", client, sep="\n")

In [None]:
# Path of the RESULTS folder inside the paperParser directory.
resultsDir=os.path.join(paperParserDir,'RESULTS')

In [None]:
# Generate the inputs for the AI application from the SciLit summaries.
suffixes = ("_direct", "_reverse")
AI_inputs = mergeSummaries(
    outputDir=resultsDir,
    inputDir=os.path.join(resultsDir, 'SciLit'),
    suffixes=suffixes,
)
# The result must contain exactly two items: the direct and the reverse input.
AI_input_direct, AI_input_reverse = AI_inputs


**Generation of drafts' frameworks with prompt engineering**

Prompt engineering is the process of structuring or crafting an instruction in order to produce the best possible output from a generative artificial intelligence (AI) model.


Genkina, Dina (March 6, 2024). "AI Prompt Engineering is Dead: Long live AI prompt engineering". IEEE Spectrum. Retrieved January 18, 2025.

In [None]:
# Build one framework per AI input; the output file name passed to
# generateFramework is "framework<suffix>.txt".
name = "framework"
frameworks = [
    generateFramework(
        client=client,
        AI_input=ai_input,
        resultsDir=resultsDir,
        output_filename=f"{name}{suffix}.txt",
    )
    for ai_input, suffix in zip(AI_inputs, suffixes)
]

## ***GENERATE DRAFTS BY REPL AND RAG***

REPL is an acronym for Read, Evaluate, Print, Loop. Developers use the Python REPL to interact with the Python interpreter.

Ellis, Kevin, et al. “Write, Execute, Assess: Program Synthesis with a REPL.” ArXiv.org, 2019, arxiv.org/abs/1906.04604.

RAG (Retrieval-Augmented Generation) is the process of optimizing the output of a large language model, so it references an authoritative knowledge base outside of its training data sources before generating a response.

Lewis, Patrick, et al. “Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks.” ArXiv.org, 12 Apr. 2021, arxiv.org/abs/2005.11401.

Install langchain modules

In [None]:
!pip install langchain_huggingface
!pip install langchain_openai
!pip install langchain_community
!pip install langchain_experimental
!pip install -qU langchain-community faiss-cpu

In [None]:
#os.chdir(LLMsDir)
!python selectTools.py
from selectTools import (selectModel, selectREPLagent)
!python manageParagraphs.py
from manageParagraphs import (identifyParagraphs, retrieveParagraphContent)
!python buildRetriever.py
from buildRetriever import (getRetriever, format_docs, get_text_chunks_from_str)
!python generateExtensions.py
from generateExtensions import (generateAbstract, extendParagraphs, addReferences)
!python elongateFrameworks.py
from elongateFrameworks import elongateFrameworks
!python mergeDrafts.py
from mergeDrafts import (merge2Drafts, mergingOp)

In [None]:
# Install the huggingface_hub and hf-xet packages (no version pinned).
!pip install huggingface_hub
!pip install hf-xet

Extend the paragraphs composing the frameworks

In [None]:
# Extend the frameworks into drafts; the OpenAI credentials are read from the
# environment variables set earlier.
drafts = elongateFrameworks(
    frameworks,
    resultsDir,
    AI_inputs,
    suffixes,
    api_key=os.environ["OPENAI_API_KEY"],
    organization=os.environ["Organization_ID"],
)


Merge two drafts

In [None]:
# Merge the drafts into a single text; the OpenAI credentials are read from
# the environment variables set earlier.
drafts_merged = merge2Drafts(
    drafts,
    resultsDir,
    api_key=os.environ["OPENAI_API_KEY"],
    organization=os.environ["Organization_ID"],
)

In [None]:
# Hand the merged draft to printPDF under the base name "drafts_merged",
# together with the results folder.
name="drafts_merged"
printPDF(name,drafts_merged, resultsDir)

# ***PART 3: HUMANIZE AI-GENERATED TEXT***

In [None]:
# Reload the merged draft from RESULTS/drafts_merged.txt, replacing any
# in-memory value of drafts_merged.
drafts_merged=read_content(os.path.join(paperParserDir,'RESULTS',"drafts_merged.txt"))

In [None]:
# Read the three prompt files stored in the paperParser directory, in the
# order examples -> system content -> user instructions.
examples, system_content, user_instructions = (
    read_content(os.path.join(paperParserDir, prompt_file))
    for prompt_file in ("examples.txt", "system_content.txt", "user_instructions.txt")
)

In [None]:
os.chdir(LLMsDir)
!python humanizeTexts.py
from humanizeTexts import (humanize, humanizeOp, callHumanizingApp)

In [None]:
# email and pwd are passed through to humanize; both are left as None here.
email = pwd = None
humanizedDraft, paragraphsDir = humanize(
    examples,
    drafts_merged,
    system_content,
    user_instructions,
    resultsDir,
    api_key=os.environ["OPENAI_API_KEY"],
    organization=os.environ["Organization_ID"],
    email=email,
    pwd=pwd,
)

In [None]:
# Reload the humanized draft from RESULTS/humanized.txt, replacing any
# in-memory value of humanizedDraft.
humanizedDraft=read_content(os.path.join(paperParserDir,'RESULTS',"humanized.txt"))

In [None]:
# Base name for the PDF: "humanized_antiDetector" only when both email and
# pwd were provided, otherwise "humanized".
name = "humanized_antiDetector" if (email is not None and pwd is not None) else "humanized"
printPDF(name, humanizedDraft, resultsDir)