# Document Analysis

In [None]:
!apt-get install libpoppler-cpp-dev

%pip install transformers
%pip install clean-text
%pip install openai
%pip install pdftotext

In [None]:
%cd /content/drive/MyDrive/Projects/ai-legacy

/content/drive/MyDrive/Projects/ai-legacy


In [None]:
import pandas as pd
import numpy as np
import re
import pdftotext

from transformers import pipeline

In [None]:
def read_pdf(path):
  pdf = pdftotext.PDF(open(path, 'rb'))

  return "".join(pdf)

text = read_pdf('papers/19900019299.pdf')
text

'                                                                                                   N90-28615\n                                 BLADE      TIP     RUBBING       STRESS       PREDICTION*\n               Gerald       A.     Brusher**,          Gary      A.    Davis,       and    Daniel       M.     Shea\n                           Rockwell         International/Rocketdyne                       Division\n                                             Canoga       Park,       California\n                                                           ABSTRACT\n      A   l_near       analysis         was     performed          to    determine         the     dynamic        response        of\na  turbine       blade       to    intermittent            rubbing         against       a   tip     seal.      The     response\nanalysis        consisted            of     a    parametric            study       where       the      rubbing         friction\nforce    was     assumed         t

In [None]:
from cleantext import clean

def clean_text(text):
  return clean(text,
      fix_unicode=True,               # fix various unicode errors
      to_ascii=True,                  # transliterate to closest ASCII representation
      lower=True,                     # lowercase text
      no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
      no_urls=False,                  # replace all URLs with a special token
      no_emails=False,                # replace all email addresses with a special token
      no_phone_numbers=False,         # replace all phone numbers with a special token
      no_numbers=False,               # replace all numbers with a special token
      no_digits=False,                # replace all digits with a special token
      no_currency_symbols=True,      # replace all currency symbols with a special token
      no_punct=True,                 # remove punctuations
      replace_with_punct="",          # instead of removing punctuations you may replace them
      replace_with_url="<URL>",
      replace_with_email="<EMAIL>",
      replace_with_phone_number="<PHONE>",
      replace_with_number="<NUMBER>",
      replace_with_digit="0",
      replace_with_currency_symbol="<CUR>",
      lang="en"                       # set to 'de' for German special handling
  )

text = clean_text(text)
text[:4000]

'n9028615 blade tip rubbing stress prediction gerald a brusher gary a davis and daniel m shea rockwell internationalrocketdyne division canoga park california abstract a lnear analysis was performed to determine the dynamic response of a turbine blade to intermittent rubbing against a tip seal the response analysis consisted of a parametric study where the rubbing friction force was assumed to vary as a half sine wave over a preselected contact arc the length of the contact arc as well as the pump speed was varied to determine the effects of each results show that for a given contact arc there are distinct critical speeds at which the blade response becomes a maximum introduction due to the requirement for high efficiency in modern turbomachinery clearances between turbine blade tips and the turbine housing are de sired to be as small as is practically possible an opposing design requirement is that the clearances be large enough to prevent rubbing of the blade tips on the housing due 

In [None]:
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

def summarize(text):
  tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

  model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

  inputs = tokenizer(text[:4000], return_tensors='pt')
  prediction = model.generate(**inputs)
  prediction = tokenizer.batch_decode(prediction)[0]
  prediction = prediction.replace('<s> ', '')
  prediction = prediction.replace('</s>', '..')

  return prediction.capitalize()

In [None]:
import os
import openai

def summarize(text):
  response = openai.Completion.create(
    engine="text-curie-001",
    prompt=f"Full text: {text[:4000]} Short summary:\n",
    temperature=0.3,
    max_tokens=100
  )

  return response.choices[0].text.replace('\n','')

summary = summarize(text)
summary

'A lnear analysis was performed to determine the dynamic response of a turbine blade to intermittent rubbing against a tip seal. The response analysis consisted of a parametric study where the rubbing friction force was assumed to vary as a half sine wave over a preselected contact arc. The length of the contact arc as well as the pump speed was varied to determine the effects of each. Results show that for a given contact arc there are distinct critical speeds at which the blade response becomes a maximum. This is'

In [None]:
def get_keywords(text):
  response = openai.Completion.create(
    engine="text-davinci-002",
    prompt=f"Text: {text} List of 3 keywords, separated by comma:",
    temperature=0.3,
    max_tokens=50,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )

  output = response.choices[0].text.replace(',','\n')
  output = output.replace(')', '')
  output = re.sub('[0-9.]', '', output)
  output = output.splitlines()
  return [keyword.strip() for keyword in output if (keyword and len(keyword) < 30)]

get_keywords(summary)

['system modeling', 'high speed bearings', 'cryogenics']

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def summarize_one_line(text):
  model = AutoModelForSeq2SeqLM.from_pretrained("snrspeaks/t5-one-line-summary")
  tokenizer = AutoTokenizer.from_pretrained("snrspeaks/t5-one-line-summary")

  input_ids = tokenizer.encode("summarize: " + text[:1000], return_tensors="pt", add_special_tokens=True)
  
  generated_ids = model.generate(input_ids=input_ids,num_beams=5,max_length=50,repetition_penalty=2.5,length_penalty=1,early_stopping=True,num_return_sequences=3)
  preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

  return preds[0]

summarize_one_line(text)

'Some performance characteristics of divergent field bombardment thrusters by kaufman and vahrenkamp lewis research center cleveland ohio'

# Semantic Search

In [None]:
!pip install sentence-transformers

In [None]:
import scipy
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

# Paper Pipeline

In [None]:
import pandas as pd

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,n,title,id,file,document type,authors,date acquired,publication date,subject category,funding number
0,1,Blade Tip Rubbing Stress Prediction,19900019299,19900019299.pdf,Conference Paper,"Brusher, Gerald A. Davis, Gary A. Shea, Daniel M.","September 6, 2013","September 1, 1988",AIRCRAFT PROPULSION AND POWER,CONTRACT_GRANT: NAS8-36361
1,2,Evolution And Use Of Combined Mechanical And T...,19900019302,19900019302.pdf,Conference Paper,"Cody, Joe C. Marty, David E. Moore, James D.","September 6, 2013","September 1, 1988",MECHANICAL ENGINEERING,
2,3,"Adjustable Impedance, Force Feedback And Comma...",19900019691,19900019691.pdf,Conference Paper,"Sheridan, Thomas B. Raju, G. Jagganath. Buzan,...","September 6, 2013","January 31, 1989",MAN/SYSTEM TECHNOLOGY AND LIFE SUPPORT,
3,7,A Hybrid Architecture For The Implementation O...,19900019716,19900019716.pdf,Conference Paper,"Koutsougeras, C. Papachristou, C.","September 6, 2013","January 31, 1989",CYBERNETICS,
4,4,Methods And Strategies Of Object Localization,19900019704,19900019704.pdf,Conference Paper,"Shao, Lejun. Volz, Richard A.","September 6, 2013","January 31, 1989",MAN/SYSTEM TECHNOLOGY AND LIFE SUPPORT,


In [None]:
import json
import requests

papers = []

for index, row in df.iterrows():
  id = row['id']
  file = row['file']
  path = f'papers/{file}'

  pdf = read_pdf(path)
  text = clean_text(pdf)

  summary = summarize(text)
  keywords = get_keywords(summary)

  r = requests.get(f'https://ntrs.nasa.gov/api/citations/{id}')
  paper = r.json()
  paper['summary'] = summary
  paper['keywords'] = keywords

  print(summary)
  print(keywords)
  print('------------------')

  papers.append(paper)

The dynamic response of a turbine blade to intermittent rubbing against a tip seal was investigated using a parametric study. The response analysis consisted of a half-sine wave study where the rubbing friction force was assumed to vary as a half-sine wave over a preselected contact arc. The length of the contact arc and the pump speed were varied to determine the effects of each. Results show that for a given contact arc there are distinct critical speeds at which the blade response becomes a maximum. This
['turbine blade', 'intermittent rubbing', 'parametric study']
------------------
The development of system modeling capabilities for high speed bearings operating in cryogenics supports the overall MSFCbearing and material development program which is designed to formulate and experimentally verify failure mechanisms and life prediction models for high speed bearing shaft systems operating in cryogenics. The modeling effort supports the BSMTProgram and the development and improvemen

In [None]:
summaries = []

for paper in papers:
  summary = paper['summary']
  summaries.append(summary)

embeddings = model.encode(summaries)
embeddings.shape

(50, 768)

In [141]:
for i in range(len(embeddings)):
  distances = []

  for j in range(len(embeddings)):
    if i == j:
      continue

    distance = scipy.spatial.distance.cdist([embeddings[i, :]], [embeddings[j, :]], "cosine")[0]

    distances.append({
        'id': papers[j]['id'],
        'score': float(distance)
    })

  papers[i]['scores'] = distances
  papers[i]['embedding'] = list(embeddings[i, :].astype(float))

In [152]:
data = {}
data['documents'] = papers

with open('data.json', 'w') as f:
  json.dump(data, f)

In [122]:
for i in range(3):
  for j in range(3):
    distance = scipy.spatial.distance.cdist([embeddings[i, :]], [embeddings[j, :]], "cosine")[0]
    
    print(f'{i + 1} {j + 1} - {distance}')

1 1 - [0.]
1 2 - [0.6003964]
1 3 - [0.74647799]
2 1 - [0.6003964]
2 2 - [1.11022302e-16]
2 3 - [0.78185406]
3 1 - [0.74647799]
3 2 - [0.78185406]
3 3 - [0.]


# Find xyz coordinates for every node using autodiff

In [None]:
start = 1
distances = []

for i in range(50):
  for j in range(start, 50):
    distance = scipy.spatial.distance.cdist([embeddings[i, :]], [embeddings[j, :]], "cosine")[0]
    distances.append(float(distance))

    print(f'{i + 1} {j + 1} - {distance}')

  start += 1


In [None]:
import tensorflow as tf

tensors = []

for i in range(50):
  tensors.append(tf.Variable(tf.random.uniform([3],minval=0), dtype=tf.float32))

y = tf.constant(distances)

tensors

In [None]:
learning_rate = 1.0

for epoch in range(500):
  start = 1

  with tf.GradientTape() as tape:
    distances = []

    for i in range(len(tensors)):
      for j in range(start, len(tensors)):
        distance = tf.sqrt(tf.reduce_sum(tf.square(tensors[i] - tensors[j])))
        distances.append(distance)

      start += 1

    loss = tf.reduce_mean(tf.square(distances - y))

    if epoch % 10 == 0:
      print(loss)

  grads = tape.gradient(loss, tensors)

  for i in range(len(grads)):
    tensors[i].assign(tensors[i] - grads[i] * learning_rate)

In [392]:
distances = []
start = 1

for i in range(len(tensors)):
  for j in range(start, len(tensors)):
    distance = tf.sqrt(tf.reduce_sum(tf.square(tensors[i] - tensors[j])))
    distances.append(distance)

  start += 1

In [None]:
for i in range(len(distances)):
  print(distances[i])
  print(y[i])
  print()

In [400]:
for tensor in tensors:
  print(tensor.numpy())

[ 0.12492618 -0.06804582  0.27753368]
[ 0.3583211  -0.08433492  0.6218014 ]
[0.0252158  0.3560477  0.38003358]
[-0.18174738  0.4888676   0.4195969 ]
[-0.03924221  0.95302296  0.69959134]
[0.06036701 1.007531   0.42333362]
[-0.12252923  0.52279544  0.16437641]
[-0.06368641  0.7791059   0.24993457]
[0.2125518 1.0553161 0.7984142]
[-0.10103215  0.7381484   0.57547957]
[0.05990313 1.0872567  0.28969258]
[0.9417694  0.22285506 0.38283026]
[1.1053083 0.5048518 0.2631152]
[1.0631696  0.27855793 0.42589557]
[0.7314057  0.01719058 0.11060737]
[-0.12866162  0.38519153  0.7248101 ]
[0.5548065 1.0224268 0.2884577]
[0.8503242  0.8424246  0.99853265]
[ 0.58372027  0.35632268 -0.01998488]
[0.56519276 1.0783802  0.54172343]
[0.80024713 0.82034105 0.09542564]
[ 0.41463062  0.6879742  -0.04854409]
[0.7122417  0.976044   0.47170034]
[ 0.6923146  0.5903559 -0.0455824]
[ 0.56624967 -0.02770462  0.96538556]
[0.8088235  0.49161625 1.0100676 ]
[0.35371205 0.20468804 0.96426797]
[0.44893855 1.0142657  1.025103