## 专利引文替换

In [2]:
import pandas as pd

### 读取PNA数据

In [3]:
%%time
PNA = pd.read_excel('./dataset/PNA.xlsx')
PNA.head(5)

CPU times: user 661 ms, sys: 65.2 ms, total: 726 ms
Wall time: 835 ms


Unnamed: 0,PN
0,US2011063416-A1; EP2299491-A2; FR2950196-A1;...
1,US2010117799-A1
2,JP2011002997-A
3,JP2010246671-A
4,WO2010082246-A1; JP2010165127-A; US201022655...


### 抽取PNA字典

In [4]:
# @desc 将数据映射为字典O(1)
# @desc 定义映射引擎: 
# @notice key-value 别名：真名
# @return 返回两个字典：real真名字典，alias小名字典
def mapping_engine(df):
    alias_dict = {}
    real_dict = {}
    # 遍历PNA
    for i in range(len(df)):
        # 取出当前行第一个是real，后面是alias
        row = df.iloc[i, 0]
        info = str(row).split(';')
        real = info[0].split('-')[0]
        alias = info[1:]
        for j in range(len(alias)):
            alias_dict[alias[j].strip(' ').split('-')[0]] = real
            
        real_dict[real] = real
        
    return real_dict, alias_dict

In [5]:
%%time
real_dict, alias_dict = mapping_engine(PNA) # 调用生成两个字典

CPU times: user 715 ms, sys: 22.5 ms, total: 738 ms
Wall time: 790 ms


In [6]:
real_dict

{'US2011063416': 'US2011063416',
 'US2010117799': 'US2010117799',
 'JP2011002997': 'JP2011002997',
 'JP2010246671': 'JP2010246671',
 'WO2010082246': 'WO2010082246',
 'JP2011110063': 'JP2011110063',
 'JP2011000185': 'JP2011000185',
 'US2009318779': 'US2009318779',
 'WO2009137682': 'WO2009137682',
 'US2010268543': 'US2010268543',
 'US2010185711': 'US2010185711',
 'US2010070249': 'US2010070249',
 'WO2010024907': 'WO2010024907',
 'WO2009099707': 'WO2009099707',
 'US2011067092': 'US2011067092',
 'JP2010176371': 'JP2010176371',
 'JP2010240097': 'JP2010240097',
 'WO2009108802': 'WO2009108802',
 'WO2011051103': 'WO2011051103',
 'US2010324935': 'US2010324935',
 'HU200900572': 'HU200900572',
 'US2010094655': 'US2010094655',
 'TW388699': 'TW388699',
 'WO2009156934': 'WO2009156934',
 'US2009198513': 'US2009198513',
 'US2010063382': 'US2010063382',
 'WO2009133197': 'WO2009133197',
 'WO2009132058': 'WO2009132058',
 'WO2010045463': 'WO2010045463',
 'WO2009120877': 'WO2009120877',
 'WO2009140757': 'WO

In [7]:
alias_dict

{'EP2299491': 'US2011063416',
 'FR2950196': 'US2011063416',
 'JP2011097575': 'US2011063416',
 'CN102024833': 'US2011063416',
 'US8416283': 'US2011063416',
 'JP2010165127': 'WO2010082246',
 'US2010226550': 'WO2010082246',
 'US8160333': 'WO2010082246',
 'US2012250959': 'WO2010082246',
 'JP2013198817': 'WO2010082246',
 'JP5317716': 'WO2010082246',
 'JP5677521': 'WO2010082246',
 'US8958612': 'WO2010082246',
 'JP5527802': 'JP2011110063',
 'JP5538756': 'JP2011000185',
 'US8108036': 'US2009318779',
 'US2009281838': 'WO2009137682',
 'US2009281839': 'WO2009137682',
 'AU2009244200': 'WO2009137682',
 'EP2283443': 'WO2009137682',
 'CA2722773': 'WO2009137682',
 'JP2011524037': 'WO2009137682',
 'US2013209068': 'WO2009137682',
 'US2013218600': 'WO2009137682',
 'US2013268291': 'WO2009137682',
 'US2013290011': 'WO2009137682',
 'JP5474937': 'WO2009137682',
 'JP2014075154': 'WO2009137682',
 'US2015066533': 'WO2009137682',
 'US9053222': 'WO2009137682',
 'US2015227713': 'WO2009137682',
 'US2016302704': 'WO

### 读取PNB数据

In [8]:
%%time
PNB = pd.read_csv('./dataset/PNB.csv')
PNB.head(5)

CPU times: user 1.17 s, sys: 185 ms, total: 1.35 s
Wall time: 1.58 s


Unnamed: 0.1,Unnamed: 0,ID,From,To
0,1,1,FR2301548,EP426191
1,2,1,JP01272886,EP426191
2,3,1,JP58040306,EP426191
3,4,1,US3869429,EP426191
4,5,1,US3475771,EP426191


### 替换操作开始

In [44]:
# @desc 比较并替换数据PNB数据
# @desc 置换引擎：
# @notice 根据上送的字典做替换操作
# @return 返回生成的dataframe
def CAX_engine(df, real_dict, alias_dict):
    # 遍历csv中每条数据
    df_res = pd.DataFrame()
    res = []
    to = []
    for i in range(len(df)):
        pre = df.iloc[i, 2] # from
        # 查字典
        tmp = []
        if alias_dict.__contains__(pre):
            tmp.append(alias_dict.get(pre))
        elif real_dict.__contains__(pre):
            tmp.append(real_dict.get(pre))
        if len(tmp) > 0: # 匹配上了就存下new和to
            res.append(tmp)
            to.append(df.iloc[i, 3])

    df_res['From'] = pd.Series(res)
    df_res['To'] = pd.Series(to)
    return df_res

In [45]:
%%time
df_res = CAX_engine(PNB, real_dict, alias_dict)

CPU times: user 47.9 s, sys: 666 ms, total: 48.5 s
Wall time: 49.4 s


In [47]:
df_res

Unnamed: 0,From,To
0,[JP6213941],JP10288633
1,[CN102509010],CN106971084
2,[US2011087438],CN203208031
3,[US2010063382],CN203288575
4,[CN203802580],CN203555838
...,...,...
32527,[CN106073062],CN110916716
32528,[CN205992918],CN111076762
32529,[CN108538342],CN110855408
32530,[CN107578815],CN111899863


### 生成结果文件

In [49]:
# 传入df和文件名生成csv文件
def save_file(df, file_name):
    df.to_csv('./result/' + file_name + 'result.csv', index=False)

In [52]:
%%time
save_file(df_res, 'df_')

CPU times: user 96.5 ms, sys: 14.2 ms, total: 111 ms
Wall time: 128 ms
