In [1]:
import pandas as pd
from tqdm import tqdm
import psycopg2
import time 

import pandas as pd
import os
import numpy as np

from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the database credentials from a local text file that holds the user
# name and the password separated by whitespace. The `with` block closes the
# file handle as soon as the credentials are read.
with open('/home/fs01/spec1142/Emma/GateKeepers/' + "database.txt", "r") as f:
    user, password = f.read().split()

# Base directory (with trailing slash) that the notebook prefixes to its file names.
main_path = '/home/fs01/spec1142/Emma/GateKeepers/Text_encoding/'


## Create a table to store patent embeddings

In [3]:
## create table
table_name = "encoded_patents_PatentsView"

## define columns of the table
## The table name is interpolated from `table_name` so that the statement that
## drops the table (built from `table_name`) and this CREATE statement always
## refer to the same table.
schema = f"""CREATE TABLE  {table_name} (
   patent_id VARCHAR(15) ,
   encoded_title text,
   encoded_abstract text );"""


def create_table(table_name, schema, drop_if_exists):

    """
    Create a table in the PostgreSQL database, optionally dropping it first.

    Parameters:
    table_name (str): Name of the table. It is only used to build the optional
        DROP statement; the CREATE statement comes entirely from `schema`.
    schema (str): The complete CREATE TABLE statement to execute.
    drop_if_exists (bool): When true, `DROP TABLE IF EXISTS <table_name>` is
        executed before the CREATE statement.

    Returns:
    str: A success message once both statements have been committed.

    Raises:
    Any exception raised by psycopg2 while connecting or executing a statement
    is propagated to the caller.

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - The connection is closed in every case. When a statement fails nothing is
      committed, so a failed CREATE does not leave the table dropped.
    - `table_name` is concatenated into the SQL text, so it must come from
      trusted code, never from user input.
    """

    conn = psycopg2.connect("user=" + user + " password=" + password)
    try:
        cursor = conn.cursor()

        if drop_if_exists:
            cursor.execute("DROP TABLE IF EXISTS " + table_name)

        # Create the table as described by the caller-supplied statement
        cursor.execute(schema)
        conn.commit()
    finally:
        # Always release the connection, even if a statement raised
        conn.close()

    return "Table created successfully........"



## Get patents not encoded yet

In [None]:
## Query the ids of the patents that are not encoded yet, i.e. the ids present
## in the table patents_PatentsView but absent from encoded_patents_PatentsView.

# Select every patent_id for which no row with the same id exists in the
# table of encoded patents.
sql ="""SELECT patent_id 
        FROM   patents_PatentsView AS p 
        WHERE  NOT EXISTS (
           SELECT patent_id
           FROM   encoded_patents_PatentsView
           WHERE  patent_id = p.patent_id
           );"""

# Establish the connection
conn = psycopg2.connect("user=" + user + " password=" + password)

try:
    # Create a cursor object and run the query
    cursor = conn.cursor()
    cursor.execute(sql)
    result = cursor.fetchall()
finally:
    # Close the connection whether or not the query succeeded
    conn.close()

# Each fetched row is a tuple; its first element is the patent id
patent_ids = [row[0] for row in result]

## Encode patent abstracts and titles

In [None]:
import pandas as pd
import os
import numpy as np

from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
import tqdm as notebook_tqdm
import time


def encoding(df):

    """
    Encode the abstracts and titles of one chunk of patents and append the
    result to a TSV file.

    Rows of `df` whose `patent_abstract` is null are dropped. The abstracts and
    the titles of the remaining rows are embedded with the 'all-MiniLM-L6-v2'
    SentenceTransformer model on a pool of 4 CPU worker processes, each vector
    is rounded to 3 decimals, and the columns patent_id, encoded_title and
    encoded_abstract are appended to `main_path + 'encoded_Patents_june23.tsv'`.

    Parameters:
    df (pandas.DataFrame): Chunk to encode. It must contain the columns
        `patent_id`, `patent_title` and `patent_abstract`. It is not modified.

    Returns:
    None

    Note:
    - The function returns immediately, without doing anything, unless
      `__name__` is '__main__'.
    - The function assumes that the `main_path` variable is defined elsewhere in the code.
    - The header row is written only when the output file does not exist yet or
      is empty, so that appending several chunks yields a single header line.
    - The worker pool is stopped in every case, also when the encoding fails.
    - A chunk without any non-null abstract is skipped before the model is loaded.
    - NOTE(review): titles are not checked for null values; this presumably
      relies on every patent with an abstract also having a title - confirm.
    """

    # Important, you need to shield your code with if __name__. Otherwise, CUDA
    # runs into issues when spawning new processes.
    if __name__ != '__main__':
        return

    start = time.time()

    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Keep only the patents that have an abstract. The explicit copy makes df1
    # an independent frame, so adding columns below neither touches the
    # caller's frame nor triggers a SettingWithCopyWarning.
    df1 = df.loc[df["patent_abstract"].notnull(),
                 ["patent_id" , "patent_title" , "patent_abstract"]].copy()

    if df1.empty:
        # Nothing to encode: avoid loading the model and spawning workers
        print("no abstract to encode in this chunk")
        return

    print("start abstract" , time.time() - start)

    # Define the model
    model = SentenceTransformer('all-MiniLM-L6-v2'  )

    # Start the multi-process pool on 4 CPU workers
    pool = model.start_multi_process_pool(target_devices = ['cpu' for i in range(4) ])

    try:
        ## abstract
        sentences = df1["patent_abstract"].tolist()
        emb = model.encode_multi_process(sentences, pool, batch_size=128 , chunk_size = 2500 )
        print("end abstract" , time.time() - start)

        df1["encoded_abstract"] = [ np.around(vector , decimals = 3) for vector in emb ]
        print("Embeddings abstract computed. Shape:", emb.shape)
        print(" ")

        ## title (same model and same pool as for the abstracts)
        print("start title" , time.time() - start)
        sentences = df1["patent_title"].tolist()
        emb = model.encode_multi_process(sentences, pool, batch_size=128 , chunk_size = 2500 )
        print("end title" , time.time() - start)

        df1["encoded_title"] = [ np.around(vector , decimals = 3) for vector in emb ]
        print("Embeddings title computed. Shape:", emb.shape)
        print(" ")
    finally:
        # Terminate the worker processes; without this every call would leave
        # its 4 workers running.
        model.stop_multi_process_pool(pool)

    out_path = main_path + 'encoded_Patents_june23.tsv'

    # The file is opened in append mode: emit the header only for the first
    # chunk, otherwise the column names would be repeated as data lines.
    write_header = (not os.path.exists(out_path)) or os.path.getsize(out_path) == 0

    df1[["patent_id" , "encoded_title" , "encoded_abstract"]].to_csv(out_path , sep='\t' , mode = "a" , index  = False , header = write_header )

    print("file saved")
       
    

In [None]:
## Get the titles and abstracts of the patents not encoded yet.

# Read patent_id as text from the start instead of letting pandas infer a type
# and converting afterwards: the ids are compared with the ids fetched from
# the database, so they must keep exactly the text found in the file.
# NOTE(review): presumably the ids in the database are stored as text too - confirm.
patents_patentsview = pd.read_csv(main_path + "g_patent.tsv" , delimiter = "\t" , dtype = {"patent_id": str} )

# Keeps the guarantee that every value of the column, missing ones included,
# is a Python string.
patents_patentsview["patent_id"] = patents_patentsview["patent_id"].astype("str")

# Restrict the frame to the patents whose id is in `patent_ids`
data = patents_patentsview[patents_patentsview["patent_id"].isin(patent_ids)]


In [None]:
## encode the file by chunks (for memory issues)

size = 25000

# Walk through `data` in consecutive windows of `size` rows. Stepping over the
# row offsets visits exactly the non-empty windows: no empty trailing chunk is
# produced when len(data) is a multiple of `size`, and nothing runs when
# `data` is empty.
for k, first_row in enumerate(range(0, len(data), size)):
    print(k)

    # Positional slice: rows first_row .. first_row + size - 1
    df = data.iloc[first_row:first_row + size]
    encoding(df)

## Load embeddings into the database

In [4]:
## Inputs for loading the data into the table:
## the name of the destination table and the path of the TSV file to load.
table_name = 'encoded_patents_PatentsView'
data_path = main_path + "encoded_Patents_june23.tsv"


def load_data_into_table(table_name, data_path):

    """
    Load a tab-delimited file into a table of the PostgreSQL database.

    The file is streamed to the server with a `COPY <table_name> FROM STDIN`
    statement using the CSV format with a tab delimiter and the HEADER option.

    Parameters:
    table_name (str): The name of the table in the PostgreSQL database to load data into.
    data_path (str): The file path of the TSV file containing the data to be loaded.

    Returns:
    str: A success message once the copied rows have been committed.

    Raises:
    OSError: If `data_path` cannot be opened.
    Any exception raised by psycopg2 while connecting or copying is propagated.

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - The connection is closed in every case; when opening the file or the
      COPY fails, nothing is committed.
    - With the HEADER option only the first line of the file is skipped.
    - `table_name` is concatenated into the SQL text, so it must come from
      trusted code, never from user input.
    """

    conn = psycopg2.connect("user=" + user + " password=" + password)
    try:
        cursor = conn.cursor()

        # Stream the file content to the server through COPY ... FROM STDIN
        with open(data_path) as f:
            cursor.copy_expert("COPY " + table_name + " FROM STDIN WITH DELIMITER E'\t' CSV HEADER", f)

        conn.commit()
    finally:
        # Always release the connection, even if the file is missing or COPY failed
        conn.close()

    return "Data loaded successfully.........."
                

In [5]:
## Choose what to index: the columns that get an index, and the table they belong to.
index_columns = ["patent_id"]
table_name = "encoded_patents_PatentsView"

def index_table(table_name , index_columns):

    """
    Create one index per listed column on a table of the PostgreSQL database.

    For each column a `CREATE INDEX <column>_<table_name> ON <table_name>(<column>);`
    statement is executed; all indexes are committed together at the end.

    Parameters:
    table_name (str): The name of the table to be indexed in the PostgreSQL database.
    index_columns (list): A list of column names to create indexes on.

    Returns:
    str: A success message once all the indexes have been committed.

    Raises:
    Any exception raised by psycopg2 while connecting or executing a statement
    is propagated to the caller.

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - The connection is closed in every case; when one statement fails, none of
      the indexes of this call is committed.
    - Table and column names are concatenated into the SQL text, so they must
      come from trusted code, never from user input.
    """

    # Establish the connection
    conn = psycopg2.connect("user=" + user + " password=" + password)

    try:
        cursor = conn.cursor()

        ## index each column
        for index_column in index_columns:
            # The index is named after the column and the table
            index_name = index_column + '_' + table_name
            cursor.execute('CREATE INDEX ' + index_name + ' ON ' + table_name + '(' + index_column + ');')

        conn.commit()
    finally:
        # Always release the connection, even if a statement raised
        conn.close()

    return "Table indexed successfully........"