Install libraries and dependencies

In [1]:
!pip install openai
!pip install tiktoken
!pip install PyPDF2
!pip install wordninja

from google.colab import drive
drive.mount('/content/drive')

import nltk
nltk.download('punkt')
nltk.download('stopwords')

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting wordninja
  Downloading

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Import libraries

In [2]:
import os
import re
import openai
import numpy as np
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize
from openai.embeddings_utils import get_embedding, cosine_similarity, get_embeddings
from nltk.corpus import stopwords
import glob
from timeit import default_timer as timer
import wordninja

Reading PDF files, extracting text, and cleaning

In [3]:
def remove_empty_strings(text, sentences):
  """Drop every entry whose sentence is empty or a single token.

  `text` and `sentences` are parallel lists (same length, same order). An entry
  is removed from BOTH lists when the corresponding item of `sentences`, split
  on single spaces, yields exactly one piece (this covers the empty string).

  Both lists are filtered in place, so callers holding references to them see
  the filtered content.

  Args:
    text: list parallel to `sentences`; filtered by the same positions.
    sentences: list of sentence strings that decides which positions survive.

  Returns:
    A `(sentences, text)` tuple of the two (now filtered) input lists. Note
    that the order is swapped relative to the parameter order.
  """
  # Decide what to keep BEFORE touching the lists: popping while enumerating
  # shifts the remaining items left and makes the loop skip the element that
  # follows every removal.
  keep = [len(sent.split(' ')) != 1 for sent in sentences]

  # Slice assignment keeps the original list objects (in-place semantics).
  text[:] = [item for item, wanted in zip(text, keep) if wanted]
  sentences[:] = [sent for sent, wanted in zip(sentences, keep) if wanted]
  return sentences, text

def clean_text(text):
  """Normalize raw text extracted from a paper before sentence splitting.

  Steps, in order:
    1. Cut everything from the first occurrence of 'References' onward
       (case-sensitive). If the word is absent, the text is kept whole.
    2. Insert a space after a period that is glued between a lowercase and an
       uppercase letter (e.g. 'trust.They' -> 'trust. They').
    3. Lowercase the text.
    4. Apply the regex substitutions in `regex_dict`, in insertion order:
       bracketed citations / dotted numbers / 'et al.' mentions, newlines,
       hyphenated line continuations, runs of spaces, page markers and figure
       references.

  Args:
    text: the raw document text.

  Returns:
    The cleaned, lowercased text.
  """
  # str.find returns -1 when the marker is missing; slicing with that value
  # (text[:-1]) would silently chop the last character, so only cut on a hit.
  references_start = text.find('References')
  if references_start != -1:
    text = text[:references_start]

  text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', text)
  text = text.lower()
  regex_dict = {
    'reference_chapter_authors': (r'\[\d+(?:,\s*\d+)*\]'"|"r'\d+(?:\.\d+)*\.'"|"r'\b\w+\s+et al\.|\b\w+,?etal\.', ""),
    'next_line': ('\n', " "),
    'continued_line': ('- +', ""),
    'multiple_spaces': (' +', ' '),
    'pages': (r'(p|q)\s?\d+', ''),
    'figs': (r"fig\.(\s)?\d?|figs\.(\s)?\d?",'')
  }

  # Order matters: newlines must become spaces before the hyphen-continuation
  # and multiple-space rules run.
  for pattern, replacement in regex_dict.values():
    text = re.sub(pattern, replacement, text)

  return text

def extract_text_from_pdf(pdf_path):
    """Read one PDF and turn it into two parallel lists of sentences.

    The text of every page is concatenated, passed through `clean_text`, split
    into sentences with NLTK's `sent_tokenize`, and each sentence is re-segmented
    into words with `wordninja.split` and re-joined with single spaces.
    A second version of every sentence is built with English stop words removed
    (question words listed in `whquestion` are kept); each kept word is prefixed
    with a space, so these strings start with a leading space.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        Whatever `remove_empty_strings(sentences_to_embed, cleaned)` returns,
        i.e. a tuple `(cleaned sentences, stop-word-stripped sentences)`.
    """
    pdf = PdfReader(pdf_path)
    raw_text = "".join(pdf_page.extract_text() for pdf_page in pdf.pages)

    tokenized = sent_tokenize(clean_text(raw_text))

    # wordninja re-segments each sentence into words; join them back with spaces.
    cleaned = [' '.join(wordninja.split(sentence)) for sentence in tokenized]

    # Question words are deliberately excluded from the stop-word list.
    whquestion = ['what', 'which', 'who', 'whom', 'there', 'when', 'where', 'why', 'how']
    stop_words = [candidate for candidate in stopwords.words('english') if candidate not in whquestion]

    sentences_to_embed = [
        "".join(" " + token for token in sentence.split(' ') if token not in stop_words)
        for sentence in cleaned
    ]

    return remove_empty_strings(sentences_to_embed, cleaned)

def get_files_content(path):
    """Collect the extracted sentences of every '*.pdf' file directly in `path`.

    Each matching file is processed with `extract_text_from_pdf`; the two lists
    it returns are appended, file by file, to two running lists.

    Args:
        path: directory to search (non-recursive) for PDF files.

    Returns:
        A tuple `(all_sentences, all_sent_embeds)`: the concatenation of the
        first and of the second list returned for each file. Both are empty
        when no PDF is found.
    """
    sentence_pool, embed_pool = [], []

    for pdf_file in glob.glob(f"{path}/*.pdf"):
        file_sentences, file_embed_text = extract_text_from_pdf(pdf_file)
        sentence_pool += file_sentences
        embed_pool += file_embed_text

    return sentence_pool, embed_pool

Get embeddings for sentences from OpenAI API

In [4]:
def search_for_query(query, text, sentences, top_k=10, batch_size=500,
                     model="text-embedding-ada-002"):
  """Return the sentences most similar to `query` by embedding cosine similarity.

  `text` holds the strings that are embedded; `sentences` is a parallel list
  holding the strings that are returned. Embeddings are requested from the
  OpenAI API in batches of `batch_size` items.

  Args:
    query: the search query string.
    text: list of strings to embed and rank.
    sentences: list parallel to `text`; `sentences[i]` is returned when
      `text[i]` ranks among the best matches.
    top_k: maximum number of results to return (default 10).
    batch_size: number of strings sent per embedding request (default 500).
    model: embedding model name passed as `engine` to the OpenAI helpers.

  Returns:
    A list of up to `top_k` items of `sentences`, ordered from most to least
    similar. Empty when `text` is empty.
  """
  print("#extracted sentences: ", len(text))

  # Nothing to rank: skip the (billable) query-embedding request entirely.
  if len(text) == 0:
    return []

  query_embedding = get_embedding(query, engine=model)
  text_embeddings = []

  for start in range(0, len(text), batch_size):
    text_embeddings.extend(get_embeddings(text[start:start + batch_size], engine=model))

  similarities = np.array([cosine_similarity(t, query_embedding) for t in text_embeddings])
  # Negate so that argsort's ascending order yields the highest similarity first.
  top_indices = (-similarities).argsort()[:top_k]

  return [sentences[i] for i in top_indices]

Putting it all together

In [5]:
def main():
  """Run the whole pipeline: read the PDFs, embed the sentences, print matches.

  The OpenAI key is taken from the OPENAI_API_KEY environment variable when it
  is already set; otherwise the placeholder string below is used, so replace it
  with a real key (or set the variable beforehand). Prints the time spent
  reading the PDFs, the matching sentences for the hard-coded query, and the
  time spent embedding and ranking.
  """
  # setdefault keeps a key that is already present in the environment instead
  # of overwriting it with the placeholder text.
  os.environ.setdefault('OPENAI_API_KEY', 'insert your openai api key here or in a .env file')
  openai.api_key = os.getenv('OPENAI_API_KEY')

  query = 'What are the effects of fake reviews?'
  start = timer()
  sents, sents_to_embed =  get_files_content('insert drive path of pdf files')
  print("reading time:","{:.2f}".format(timer()-start),"s")
  start = timer()
  results = search_for_query(query, sents_to_embed, sents)
  # Stop the clock before printing so the figure covers embedding/ranking only.
  embedding_seconds = timer()-start

  print("\nresults for query:\n")
  for result in results:
    print("•",result)
  print("\nembedding time:","{:.2f}".format(embedding_seconds),"s")

# Entry point: run the pipeline only when this code executes as the top-level
# module (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

reading time: 38.53 s
#extracted sentences:  2611

results for query:

• in particular fake reviews undermine market efficacy and have a negative effect on social welfare
• to some extent the effects of fake reviews are decided by how a platform operates
• the final effects of fake reviews on the platforms are moderated by consumers ' attitudes toward fake reviews
• effects on various stakeholders fake reviews significantly affect various stakeholders such as consumers merchants and platforms
• on stakeholders fake reviews increase uncertainty and cause consumer distrust and the psychological discomfort whereas fake reviews directly increase purchase intentions increased distrust and psychological discomfort weaken purchase intentions and create negative word of mouth for products extant studies have not researched a consensus on the effects of fake reviews on product sales
• although fake reviews destroy the reputation of platforms the additional profits gained should be higher than t