This notebook provides the code to count the number of citations received by the PPP paper and its control paper over different periods of time. It generates the file "loose_twins_cites_1patent.tsv" as well as "loose_twins_cites_year_by_year.tsv" and "loose_twins_cites_1patent_byyear.tsv".

## Packages

In [1]:
## load packages 
from pySankey.sankey import sankey
import pandas as pd
import json, requests 
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from tqdm import tqdm
import json, requests 
import time
import unicodedata
from metaphone import doublemetaphone
from fuzzywuzzy import fuzz
from difflib import SequenceMatcher
import re
import plotly.express as px
from math import radians, cos, sin, asin, sqrt
import datetime 
from datetime import date
import psycopg2



In [2]:
## import database username and password
main_path = "/home/fs01/spec1142/Emma/PPPs/"

## database.txt is read as two whitespace-separated tokens: username, then password.
## The context manager closes the file as soon as both have been read.
with open(main_path + "database.txt", "r") as f:
    user , password = f.read().split()

In [3]:
## read the tab-separated PPP table into a DataFrame
PPPs = pd.read_csv(f"{main_path}PPPs_v2.tsv", sep="\t")

In [5]:
PPPs.shape[0]  ## number of rows in the PPP table

548315

## Query citing papers and dates

In [6]:
## query the citing papers of a list of papers (PPPs and twins) and store them

def get_citing_papers(list_papers, workers, i):

    """
    Retrieve, for one worker's share of `list_papers`, the IDs of the works citing each paper.

    The call with offset `i` handles positions i, i + workers, i + 2 * workers, ...
    of `list_papers`, so `workers` calls with i = 0 .. workers - 1 cover the whole list.

    Parameters:
    list_papers (list): A list of paper IDs to retrieve citing papers for. Missing values (NaN / None) are skipped.
    workers (int): The number of worker processes sharing the list (used as the stride).
    i (int): The starting index for selecting papers from the list.

    Returns:
    dict: Maps every non-missing paper ID handled by this call to the list of citing
    work IDs returned by the query (an empty list when the query returns nothing).

    Note:
    - The function assumes that the `user` and `password` variables are defined elsewhere in the code.
    - Each call opens its own PostgreSQL connection through `psycopg2`; the cursor and the
      connection are closed before returning, including when a query raises.
    - The paper ID is passed as a query parameter rather than concatenated into the SQL text.
    """

    query = """ SELECT  array_agg(work_id)
                FROM citations_OpenAlex
                WHERE referenced_work_id = %s;"""

    dic_citations = {}

    #establishing the connection with the database
    conn = psycopg2.connect("user=" + user + " password=" + password)
    try:
        cursor = conn.cursor()
        try:
            for k in range(i, len(list_papers), workers):

                work_id = list_papers[k]

                ## twins without a matched paper have a missing ID: nothing to query
                if pd.isna(work_id):
                    continue

                dic_citations[work_id] = []

                cursor.execute(query, (work_id,))
                res = cursor.fetchall()

                ## array_agg yields a single row whose value is NULL when no work cites the paper
                if len(res) > 0 and res[0][0] is not None:
                    dic_citations[work_id] += res[0][0]
        finally:
            cursor.close()
    finally:
        conn.close()

    return dic_citations

In [7]:
## query citing papers of the PPP papers and twins (similar paper)
import json
from functools import partial
from multiprocessing import Pool

loose_twins = pd.read_csv(main_path + "PPP_analysis/loose_twins.tsv", sep = "\t")
list_papers = list(set(loose_twins['paper_id']))
print(len(list_papers))


## run the function get_citing_papers using 96 worker processes
workers = 96

## the context manager shuts the pool down once map() has collected every result
with Pool(workers) as p:
    func = partial(get_citing_papers, list_papers, workers)
    results = p.map(func, range(workers))


## merge the per-worker dictionaries (update in place instead of rebuilding the dict each time)
dic_citations = {}
for elem in results:
    dic_citations.update(elem)

## save the results as a dictionary.
## json.dump writes straight to the file and leaves the name `json` bound to the module.
with open(main_path + "PPP_analysis/dic_citing_papers_loose_twins.json","w") as f:
    json.dump(dic_citations, f)

535530


In [None]:
## query relevant data (date, institution) of the citing papers. Store the citing paper's date and whether it's a "self-citation" (citation from the same institution). 

run file "PPP_analysis/get_citing_dates.py"


In [40]:
## merge output files
import json

dic_citations = {}

## the partial outputs are numbered 0 .. 95
for k in tqdm(range(96)):

    ## close each partial file as soon as it has been parsed
    with open(main_path + "PPP_analysis/dic_loose_twins_citations_" + str(k) + ".json" ,"r") as f:
        dic = json.load(f)

    ## in-place update: same result as {**dic_citations, **dic} without copying the merged dict every time
    dic_citations.update(dic)


## save merged output files.
## json.dump writes straight to the file and leaves the name `json` bound to the module.
with open(main_path + "PPP_analysis/dic_loose_twins_citations.json","w") as f:
    json.dump(dic_citations, f)

## Count citations 

In [11]:
## load twins (similar papers) and keep only the rows that carry a patent_id

loose_twins = (
    pd.read_csv(f"{main_path}PPP_analysis/loose_twins.tsv", sep="\t")
      .loc[lambda twins: twins['patent_id'].notnull()]
)
loose_twins

Unnamed: 0,paper_id,patent_id,pair_id,twin_score,PPP,PPP_score,paper_date,patent_date,application_id,application_date,published_date
0,W1578801066,US-9786832,0,0.427760,1,1,2015-07-13,2017-10-10,14589182,2015-01-05,2016-07-07
1,W2056326110,US-9786832,0,0.427760,0,1,2015-01-05,2017-10-10,14589182,2015-01-05,2016-07-07
2,W2249867704,US-8778608,1,0.633810,1,2,2006-06-20,2014-07-15,13122553,2009-10-08,2011-10-27
3,W582671889,US-8778608,1,0.633810,0,2,2006-06-20,2014-07-15,13122553,2009-10-08,2011-10-27
4,W2040274814,US-11099263,2,0.427257,1,2,2015-03-17,2021-08-24,16068951,2016-01-11,2019-01-17
...,...,...,...,...,...,...,...,...,...,...,...
1096625,W4242375445,US-4987564,548312,0.199821,0,1,1991-08-01,1991-01-22,7517416,1990-04-26,
1096626,W4245219720,US-4989187,548313,0.199821,1,1,1991-09-01,1991-01-29,7516059,1990-04-26,
1096627,W4242375445,US-4989187,548313,0.199821,0,1,1991-08-01,1991-01-29,7516059,1990-04-26,
1096628,W4245219720,US-5009280,548314,0.199821,1,1,1991-09-01,1991-04-23,7328473,1989-03-24,


In [12]:
## load data on citing papers
import json

## the context manager closes the file once the dictionary has been parsed
with open(main_path + "PPP_analysis/dic_loose_twins_citations.json" ,"r") as f:
    dic_citations = json.load(f)

### Citations - exact date

In [33]:
## count the number of citations before / after patent's application/publication/grant

from datetime import datetime


def get_flat_file_twins(workers,i):

    """
    Build the exact-date citation table for one worker's share of the twin rows.

    The call with offset `i` processes rows i, i + workers, i + 2 * workers, ... of `loose_twins`.
    Each processed row yields one output row holding the identifiers and the raw date strings,
    plus, for each patent event (application, publication, grant), the number of citing papers
    dated strictly before the event (`pre_*`) and on or after it (`post_*`).
    The `*_no_self` columns count only the citations whose `self_citation` flag equals 0.

    Parameters:
    workers (int): The number of worker processes sharing the rows (used as the stride).
    i (int): The starting index for selecting twin papers from the list.

    Returns:
    pandas.DataFrame: One row per processed twin row, indexed 0, 1, 2, ... within this call.

    Note:
    - The function assumes that the `loose_twins` and `dic_citations` variables are defined elsewhere in the code.
    - When `published_date` is missing, the four publication counters are stored as None.
    - A paper that is not a key of `dic_citations` gets zero counts.
    """

    date_format = '%Y-%m-%d'

    ## output order of the count columns
    counter_columns = (
        'pre_app', 'post_app', 'pre_pub', 'post_pub', 'pre_grant', 'post_grant',
        'pre_app_no_self', 'post_app_no_self', 'pre_pub_no_self', 'post_pub_no_self',
        'pre_grant_no_self', 'post_grant_no_self',
    )

    twin_rows = loose_twins[['paper_id','pair_id','patent_id','paper_date','patent_date','application_date','published_date','PPP','PPP_score']].to_numpy()

    table = {}

    for out_row, position in enumerate(range(i, len(twin_rows), workers)):

        (paper_id, pair_id, patent_id, paper_date, grant_date,
         application_date, publication_date, PPP, PPP_score) = twin_rows[position]

        ## identifiers and raw (string) dates come first in the output row
        record = {
            'paper_id': paper_id,
            'pair_id': pair_id,
            'patent_id': patent_id,
            'PPP': PPP,
            'PPP_score': PPP_score,
            'application_date': application_date,
            'publication_date': publication_date,
            'grant_date': grant_date,
            'paper_date': paper_date,
        }
        table[out_row] = record

        ## parse the event dates; the publication date is optional
        grant_day = datetime.strptime(grant_date, date_format)
        application_day = datetime.strptime(application_date, date_format)
        cutoffs = {'app': application_day}
        if not pd.isna(publication_date):
            cutoffs['pub'] = datetime.strptime(publication_date, date_format)
        cutoffs['grant'] = grant_day

        ## one pair of counters (all / non-self) per available event and side
        tallies = {}
        for event in cutoffs:
            for side in ('pre', 'post'):
                tallies[side + '_' + event] = 0
                tallies[side + '_' + event + '_no_self'] = 0

        if paper_id in dic_citations:
            citing = dic_citations[paper_id]
            for citing_paper in citing:
                citing_day = datetime.strptime(citing[citing_paper]['date'], date_format)
                not_self = citing[citing_paper]['self_citation'] == 0

                for event, cutoff in cutoffs.items():
                    ## a citation dated exactly on the event day counts as "post"
                    side = 'pre' if citing_day < cutoff else 'post'
                    tallies[side + '_' + event] += 1
                    if not_self:
                        tallies[side + '_' + event + '_no_self'] += 1

        ## counters of a missing publication date are absent from `tallies` and stored as None
        for name in counter_columns:
            record[name] = tallies.get(name)

    ## return flat file with citation counts
    return pd.DataFrame(table).T
                    
                    
            

In [12]:
## run the code using 24 worker processes

from multiprocessing import Pool
from functools import partial

workers = 24

## the context manager shuts the pool down once map() has collected every result
with Pool(workers) as p:
    func = partial(get_flat_file_twins, workers)
    results = p.map(func, range(workers))

## merge the results of the worker processes
df = pd.concat(results)

In [43]:
## write the full table: a PPP paper appears once per patent it is paired with (continuations)
df.to_csv(f"{main_path}PPP_analysis/loose_twins_cites_multipatents.tsv", sep="\t")

In [14]:
## keep a single pair per PPP paper: the one whose patent has the earliest application date.
## application_date is a 'YYYY-MM-DD' string here, so sorting it as text is chronological.

df1 = (
    df.loc[df['PPP'] == 1]
      .sort_values(['paper_id','application_date'])
      .drop_duplicates('paper_id', keep='first')
)
pairids = df1['pair_id'].tolist()

## keep both members (PPP paper and twin) of every selected pair
df = df.loc[df['pair_id'].isin(pairids)]

In [17]:
## write the reduced table: each PPP paper is kept with a single patent (earliest application date)
df.to_csv(f"{main_path}PPP_analysis/loose_twins_cites_1patent.tsv", sep="\t")

### Citations - yearly dates

In [11]:
## count the number of citations before / after patent's application/publication/grant

from datetime import datetime


def get_flat_file_twins_year(workers,i):

    """
    Build the calendar-year citation table for one worker's share of the twin rows.

    The call with offset `i` processes rows i, i + workers, i + 2 * workers, ... of `loose_twins`.
    Only the year of each date is compared. For each patent event (application, publication,
    grant) a citing paper falls into exactly one bucket: `pre_*` (citing year before the event
    year), `same_*` (same year) or `post_*` (later year).
    The `*_no_self` columns count only the citations whose `self_citation` flag equals 0.

    Parameters:
    workers (int): The number of worker processes sharing the rows (used as the stride).
    i (int): The starting index for selecting twin papers from the list.

    Returns:
    pandas.DataFrame: One row per processed twin row, indexed 0, 1, 2, ... within this call.
    The date columns hold the original date strings, not the years.

    Note:
    - The function assumes that the `loose_twins` and `dic_citations` variables are defined elsewhere in the code.
    - When `published_date` is missing, the six publication counters are stored as None.
    - A paper that is not a key of `dic_citations` gets zero counts.
    """

    def _year_of(day):
        ## year of a 'YYYY-MM-DD' string
        return datetime.strptime(day, '%Y-%m-%d').year

    def _bucket(citing_year, event_year):
        ## position of the citing year relative to the event year
        if citing_year < event_year:
            return 'pre'
        if citing_year > event_year:
            return 'post'
        return 'same'

    ## output order: all-citation counters first, then the non-self ones;
    ## within each group app / pub / grant, each as pre / post / same
    counter_columns = [
        bucket + '_' + event + suffix
        for suffix in ('', '_no_self')
        for event in ('app', 'pub', 'grant')
        for bucket in ('pre', 'post', 'same')
    ]

    twin_rows = loose_twins[['paper_id','pair_id','patent_id','paper_date','patent_date','application_date','published_date','PPP','PPP_score']].to_numpy()

    table = {}

    for out_row, position in enumerate(range(i, len(twin_rows), workers)):

        (paper_id, pair_id, patent_id, paper_date, grant_date,
         application_date, publication_date, PPP, PPP_score) = twin_rows[position]

        ## identifiers and raw (string) dates come first in the output row
        record = {
            'paper_id': paper_id,
            'pair_id': pair_id,
            'patent_id': patent_id,
            'PPP': PPP,
            'PPP_score': PPP_score,
            'application_date': application_date,
            'publication_date': publication_date,
            'grant_date': grant_date,
            'paper_date': paper_date,
        }
        table[out_row] = record

        ## event years; the publication year is optional
        grant_year = _year_of(grant_date)
        application_year = _year_of(application_date)
        event_years = {'app': application_year}
        if not pd.isna(publication_date):
            event_years['pub'] = _year_of(publication_date)
        event_years['grant'] = grant_year

        ## one pair of counters (all / non-self) per available event and bucket
        tallies = {}
        for event in event_years:
            for bucket in ('pre', 'post', 'same'):
                tallies[bucket + '_' + event] = 0
                tallies[bucket + '_' + event + '_no_self'] = 0

        if paper_id in dic_citations:
            citing = dic_citations[paper_id]
            for citing_paper in citing:
                citing_year = _year_of(citing[citing_paper]['date'])
                not_self = citing[citing_paper]['self_citation'] == 0

                for event, event_year in event_years.items():
                    bucket = _bucket(citing_year, event_year)
                    tallies[bucket + '_' + event] += 1
                    if not_self:
                        tallies[bucket + '_' + event + '_no_self'] += 1

        ## counters of a missing publication date are absent from `tallies` and stored as None
        for name in counter_columns:
            record[name] = tallies.get(name)

    ## return flat file with citation counts
    return pd.DataFrame(table).T
                    
                    
            

In [12]:
## run the code using 24 worker processes

from multiprocessing import Pool
from functools import partial

workers = 24

## the context manager shuts the pool down once map() has collected every result
with Pool(workers) as p:
    func = partial(get_flat_file_twins_year, workers)
    results = p.map(func, range(workers))

## merge the results of the worker processes
df = pd.concat(results)

In [14]:
## keep a single pair per PPP paper: the one whose patent has the earliest application date.
## application_date is a 'YYYY-MM-DD' string here, so sorting it as text is chronological.

df1 = (
    df.loc[df['PPP'] == 1]
      .sort_values(['paper_id','application_date'])
      .drop_duplicates('paper_id', keep='first')
)
pairids = df1['pair_id'].tolist()

## keep both members (PPP paper and twin) of every selected pair
df = df.loc[df['pair_id'].isin(pairids)]

In [17]:
## write the reduced table: each PPP paper is kept with a single patent (earliest application date)
df.to_csv(f"{main_path}PPP_analysis/loose_twins_cites_1patent_byyear.tsv", sep="\t")

### Count citation by year - Murray Stern replication 

In [42]:
## rows without a paper date
loose_twins.loc[loose_twins['paper_date'].isnull()]

Unnamed: 0,paper_id,patent_id,pair_id,twin_score,PPP,PPP_score,paper_date,patent_date,application_id,application_date,published_date
149,W2520170853,US-11148449,74,0.441811,0,2,,2021-10-19,15580986,2016-06-10,2018-06-28
663,W3099038600,US-8759810,331,0.432271,0,1,,2014-06-24,13497683,2010-09-24,2012-11-01
759,W3101666883,US-10468740,379,0.464631,0,2,,2019-11-05,15553012,2016-02-26,2018-03-08
829,W2018279977,US-6919286,414,0.361772,0,1,,2005-07-19,10308221,2002-11-26,2003-07-10
831,W2018279977,US-6884742,415,0.361772,0,2,,2005-04-26,10306678,2002-11-26,2003-08-14
...,...,...,...,...,...,...,...,...,...,...,...
1096491,,US-9926507,548245,-1.000000,0,4,,2018-03-27,14123580,2012-05-16,2014-07-24
1096493,,US-9580665,548246,-1.000000,0,4,,2017-02-28,14123602,2012-05-16,2014-07-31
1096535,,US-4671491,548267,-1.000000,0,3,,1987-06-09,6743934,1985-06-12,
1096537,,US-4734968,548268,-1.000000,0,3,,1988-04-05,6933190,1986-11-21,


In [44]:
## count the number of citations before / after patent's application/publication/grant

from datetime import datetime


def get_flat_file_twins(workers,i):

    """
    Build the year-by-year citation panel for one worker's share of the twin rows.

    Rows of `loose_twins` without a `paper_date` are dropped first. The call with offset `i`
    then processes rows i, i + workers, i + 2 * workers, ... of what remains. Every processed
    row yields ten output rows, one per year offset 0..9 counted from the paper's publication year.

    Parameters:
    - workers (int): The number of worker processes sharing the rows (used as the stride).
    - i (int): The index to start processing from.

    Returns:
    - pd.DataFrame: One row per (twin row, year offset), indexed 0, 1, 2, ... within this call, with columns
      paper_id, pair_id, patent_id, PPP, PPP_score, paperpubyear, grant_year, pub_year, year,
      citations, non_self_citations, year_of_patent_grant, postgrant, year_of_patent_pub, postpub.

    Notes:
    - The function assumes that the `loose_twins` and `dic_citations` variables are defined elsewhere in the code.
    - `citations` counts the citing papers whose year equals paperpubyear + year;
      `non_self_citations` counts those among them whose `self_citation` flag equals 0.
    - `year_of_patent_grant` / `year_of_patent_pub` are 1 when paperpubyear + year equals the grant /
      publication year; `postgrant` / `postpub` are 1 when it is strictly later. The two publication
      flags and `pub_year` are None when `published_date` is missing.
    - A paper that is not a key of `dic_citations` gets ten rows with zero citations
      (this case used to raise a KeyError because the per-year data was never initialised).
    - This definition reuses the name of the exact-date function defined earlier in the notebook
      and replaces it once this cell has been run.
    """

    n_years = 10   ## length of the citation window, in years from the paper's publication year

    dic_number_of_cites = {}
    k = 0

    ## load twins
    data = loose_twins[loose_twins['paper_date'].notnull()][['paper_id','pair_id','patent_id','paper_date','patent_date','application_date','published_date','PPP','PPP_score']].to_numpy()

    for j in range(i, len(data), workers):

        paper_id, pair_id, patent_id, paper_date,grant_date,application_date,publication_date, PPP , PPP_score = data[j]

        ## transform string into years
        paper_year = datetime.strptime(paper_date, '%Y-%m-%d').year
        grant_year = datetime.strptime(grant_date, '%Y-%m-%d').year

        if not pd.isna(publication_date):
            pub_year = datetime.strptime(publication_date, '%Y-%m-%d').year
        else:
            pub_year = None

        ## per-year counters, initialised for every paper so that papers without
        ## any entry in dic_citations still produce their ten rows
        citations = [0] * n_years
        non_self_citations = [0] * n_years

        ## for each paper, count number of citations
        if paper_id in dic_citations:
            for citing_paper in dic_citations[paper_id]:
                citing_year = datetime.strptime(dic_citations[paper_id][citing_paper]['date'], '%Y-%m-%d').year
                self_cite = dic_citations[paper_id][citing_paper]['self_citation']
                year = citing_year - paper_year
                ## citations outside the window (before the paper's year or 10+ years later) are ignored
                if 0 <= year < n_years:
                    citations[year] += 1
                    if self_cite == 0:
                        non_self_citations[year] += 1

        for year in range(n_years):

            calendar_year = paper_year + year

            if pub_year is not None:
                year_of_patent_pub = int(calendar_year == pub_year)
                postpub = int(calendar_year > pub_year)
            else:
                year_of_patent_pub = None
                postpub = None

            ## store relevant data
            dic_number_of_cites[k] = {
                'paper_id': paper_id,
                'pair_id': pair_id,
                'patent_id': patent_id,
                'PPP': PPP,
                'PPP_score': PPP_score,
                'paperpubyear': paper_year,
                'grant_year': grant_year,
                'pub_year': pub_year,
                'year': year,
                'citations': citations[year],
                'non_self_citations': non_self_citations[year],
                'year_of_patent_grant': int(calendar_year == grant_year),
                'postgrant': int(calendar_year > grant_year),
                'year_of_patent_pub': year_of_patent_pub,
                'postpub': postpub,
            }
            k += 1

    ## return flat file with citation counts
    return pd.DataFrame(dic_number_of_cites).T
                    
                    
            

In [45]:
## run the code using 24 worker processes

from multiprocessing import Pool
from functools import partial

workers = 24

## the context manager shuts the pool down once map() has collected every result
with Pool(workers) as p:
    func = partial(get_flat_file_twins, workers)
    results = p.map(func, range(workers))



In [88]:
## stack the per-worker tables into a single DataFrame
df = pd.concat(results)

In [89]:
## preview the merged year-by-year citation table
df

Unnamed: 0,paper_id,pair_id,patent_id,PPP,PPP_score,paperpubyear,grant_year,pub_year,year,citations,non_self_citations,year_of_patent_grant,postgrant,year_of_patent_pub,postpub
0,W1578801066,0,US-9786832,1,1,2015,2017,2016,0,0,0,0,0,0,0
1,W1578801066,0,US-9786832,1,1,2015,2017,2016,1,0,0,0,0,1,0
2,W1578801066,0,US-9786832,1,1,2015,2017,2016,2,2,2,1,0,0,1
3,W1578801066,0,US-9786832,1,1,2015,2017,2016,3,4,2,0,1,0,1
4,W1578801066,0,US-9786832,1,1,2015,2017,2016,4,2,2,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455115,W871036239,548305,US-10218639,0,2,2014,2019,2015,5,0,0,1,0,0,1
455116,W871036239,548305,US-10218639,0,2,2014,2019,2015,6,0,0,0,1,0,1
455117,W871036239,548305,US-10218639,0,2,2014,2019,2015,7,0,0,0,1,0,1
455118,W871036239,548305,US-10218639,0,2,2014,2019,2015,8,0,0,0,1,0,1


In [91]:
## rename columns and match Murray Stern setup.

## the flags computed so far (for PPP and control papers alike) move under a "_sameforboth" suffix
df = df.rename(columns = lambda col: col + "_sameforboth" if col in ("postgrant", "year_of_patent_grant", "postpub", "year_of_patent_pub") else col)

## re-create the plain flag columns as copies of the suffixed ones
for flag in ['postpub', 'postgrant', 'year_of_patent_pub', 'year_of_patent_grant']:
    df[flag] = df[flag + "_sameforboth"]

## in the plain columns, rows with PPP == 0 get 0 for all four flags
for flag in ['postpub', 'postgrant', 'year_of_patent_grant', 'year_of_patent_pub']:
    df.loc[df['PPP'] == 0, flag] = 0

In [103]:
## write the year-by-year panel to disk (tab-separated)

df.to_csv(f"{main_path}PPP_analysis/loose_twins_cites_year_by_year.tsv", sep="\t")