# patent data에서 필요한 firm-lv 변수 뽑기

### 1. patent에 gvkey, country, year 붙이기
- gvkey, country: assignee
- year: patent_class

### 2. 변수 제작
- fwd, bwd cit (sum, avg)
- self citation intensity = self / 전체 (fwd, bwd)
- size
- country: firm fixed effect 하면 사라질건데... 일단 구해놓자 중복 많네 귀찮게... 나중에 수작업으로 선별해야
- class로 구한 (normalized) herfindahl index

### 1. patent info - gvkey, country, year
- year 붙이기 (patent_class, patent_info join)  
__=> patent_id, year, info__  
- patent_id -> assignee -> gvkey (patent_assignee join, assignee_gvkey join)   
__=> gvkey, year, info, assignee__
- gvkey, assignee만 분리 후, assignee -> raw_assignee -> country (assignee_raw join, assignee join)
__=> gvkey, country__

In [1]:
import pandas as pd
import numpy as np
import csv
import os
import math

# Work from the folder holding the exported table CSVs; every relative
# read_csv / to_csv path in the cells below resolves against it.
os.chdir('E:/apps/db_table')

__1. patent_id, year, info__

In [2]:
# Keep only (patent_id, year) from each field's class table.
bio = pd.read_csv('patent_class_BIO.csv')[['patent_id', 'year']]
ee = pd.read_csv('patent_class_EE&IT.csv')[['patent_id', 'year']]

# Patent-level info, joined on patent_id.
df = pd.read_csv('patent_info.csv')

# Left joins: every row of the class tables is kept, matched or not.
bio = pd.merge(bio, df, how='left', on='patent_id')
ee = pd.merge(ee, df, how='left', on='patent_id')

(len(bio), len(ee))

(139224, 850774)

In [3]:
# Column names available after the info join.
bio.columns.tolist()

['patent_id',
 'year',
 'num_assignees',
 'num_claims',
 'num_foreign_citations',
 'num_other_citations',
 'num_self_citations',
 'num_times_cited',
 'num_us_citations',
 'num_self_future_citations']

__2. patent_id -> gvkey__

In [4]:
# patent_id -> assignee. A left join can add rows when a patent_id appears
# more than once in patent_assignee.csv, hence the row count is displayed again.
df = pd.read_csv('patent_assignee.csv')

bio = pd.merge(bio, df, how='left', on='patent_id')
ee = pd.merge(ee, df, how='left', on='patent_id')

(len(bio), len(ee))

(144911, 871338)

In [5]:
# assignee -> gvkey, using a separate lookup table for each field.
df = pd.read_csv('assignee_gvkey_BIO.csv')
df1 = pd.read_csv('assignee_gvkey_EE&IT.csv')

bio = pd.merge(bio, df, how='left', on='assignee')
ee = pd.merge(ee, df1, how='left', on='assignee')

(len(bio), len(ee))

(144974, 871446)

In [6]:
# Drop patents which don't have a gvkey for their assignee.
# NOTE(review): how='any' removes a row when ANY column is missing, not only
# gvkey -- confirm that is intended.
bio = bio.dropna(how='any')
ee = ee.dropna(how='any')

(len(bio), len(ee))

(54112, 554481)

In [7]:
# Checkpoint the joined tables so the later cells can restart from disk.
for _frame, _path in ((bio, 'bio_temp.csv'), (ee, 'ee_temp.csv')):
    _frame.to_csv(_path, index=False)

__3. gvkey -> country__

In [2]:
# Reload the checkpointed patent tables.
bio, ee = (pd.read_csv(_path) for _path in ('bio_temp.csv', 'ee_temp.csv'))

In [8]:
# Distinct (gvkey, assignee) pairs for each field.
biocon = bio[['gvkey', 'assignee']].drop_duplicates()
eecon = ee[['gvkey', 'assignee']].drop_duplicates()

(len(biocon), len(eecon))

(681, 621)

In [9]:
# Distinct, fully populated (country_code, pdpass) pairs from the assignee table.
df = pd.read_csv('assignee.csv')
df = df[['country_code', 'pdpass']].dropna(how='any').drop_duplicates()

len(df)

120186

In [10]:
# pdpass.csv has to provide the `assignee` column used as the join key below:
# before this merge df holds only country_code and pdpass.
pdp = pd.read_csv('pdpass.csv')
df = pd.merge(df, pdp, how='left', on='pdpass')

# Look up the country of every (gvkey, assignee) pair, then keep gvkey/country only.
biocon = pd.merge(biocon, df, how='left', on='assignee')[['gvkey', 'country_code']]
eecon = pd.merge(eecon, df, how='left', on='assignee')[['gvkey', 'country_code']]

In [11]:
# Collapse to distinct (gvkey, country_code) pairs; a gvkey can still appear
# more than once when it maps to several country codes.
biocon = biocon.drop_duplicates()
eecon = eecon.drop_duplicates()

(len(biocon), len(eecon))

(568, 978)

In [12]:
# Persist the gvkey -> country tables.
for _frame, _path in ((biocon, 'country_bio.csv'), (eecon, 'country_EE&IT.csv')):
    _frame.to_csv(_path, index=False)

### 2. 변수 제작
- groupby gvkey, year
- size (count)
- claim, fwd, bwd, self, self_fwd cit (avg) => 그 후 self citation intensity = self / 전체 (fwd, bwd)
- Herf index

In [3]:
# Column names of the reloaded patent table.
bio.columns.tolist()

['patent_id',
 'year',
 'num_assignees',
 'num_claims',
 'num_foreign_citations',
 'num_other_citations',
 'num_self_citations',
 'num_times_cited',
 'num_us_citations',
 'num_self_future_citations',
 'assignee',
 'gvkey']

In [4]:
# The assignee column is not used by the firm-year aggregations below.
bio.drop('assignee', axis=1, inplace=True)
ee.drop('assignee', axis=1, inplace=True)

__1. size__

In [12]:
# Patent count per (gvkey, year).
# groupby(...).size() counts the rows of each group directly, so there is no
# need to write a dummy '' column into a slice of bio/ee (which triggers
# pandas' SettingWithCopyWarning) just to have something to count.
# The result has the same columns as before: gvkey, year, pat_size.
biosize = bio.groupby(['gvkey', 'year']).size().reset_index(name='pat_size')
eesize = ee.groupby(['gvkey', 'year']).size().reset_index(name='pat_size')

(len(biosize), len(eesize))

(2203, 3174)

In [14]:
# First ten firm-year rows.
biosize.head(10)

Unnamed: 0,gvkey,year,pat_size
0,1078.0,2005,168
1,1078.0,2006,216
2,1078.0,2007,216
3,1078.0,2008,186
4,1078.0,2009,188
5,1078.0,2010,364
6,1078.0,2011,435
7,1078.0,2012,524
8,1602.0,2005,46
9,1602.0,2006,67


__2. citation__

In [5]:
# Short names for the citation counts; the same mapping applies to both fields.
_short_names = {
    'num_us_citations': 'bwd',
    'num_times_cited': 'fwd',
    'num_self_citations': 'self',
    'num_self_future_citations': 'self_fwd',
}
bio = bio.rename(columns=_short_names)
ee = ee.rename(columns=_short_names)

# Columns that go into the firm-year averages.
_cit_cols = ['gvkey', 'year', 'num_claims', 'bwd', 'fwd', 'self', 'self_fwd']
biocit = bio[_cit_cols]
eecit = ee[_cit_cols]

In [6]:
# Average every citation/claim column within each (gvkey, year).
biocit = biocit.groupby(['gvkey', 'year'], as_index=False).agg('mean')
eecit = eecit.groupby(['gvkey', 'year'], as_index=False).agg('mean')

In [24]:
# Self-citation intensity, computed as one column of biocit/eecit divided by
# another. A 0/0 division yields NaN.
for _cit in (biocit, eecit):
    _cit['self_ratio'] = _cit['self'].div(_cit['bwd'])
    _cit['self_ratio_fwd'] = _cit['self_fwd'].div(_cit['fwd'])

(len(biocit), len(eecit))

(2203, 3174)

In [8]:
# First ten firm-year rows of the citation averages.
biocit.head(10)

Unnamed: 0,gvkey,year,num_claims,bwd,fwd,self,self_fwd,self_ratio,self_ratio_fwd
0,1078.0,2005,22.922619,31.702381,44.60119,3.630952,4.035714,0.114532,0.090484
1,1078.0,2006,20.972222,36.62963,37.861111,5.125,6.689815,0.139914,0.176694
2,1078.0,2007,18.611111,53.328704,33.375,6.032407,4.916667,0.113117,0.147316
3,1078.0,2008,16.419355,59.709677,20.553763,6.043011,2.532258,0.101207,0.123202
4,1078.0,2009,16.462766,89.202128,17.255319,7.494681,3.292553,0.084019,0.190814
5,1078.0,2010,17.230769,116.706044,12.027473,5.763736,2.211538,0.049387,0.183874
6,1078.0,2011,18.671264,139.457471,6.963218,4.749425,1.098851,0.034056,0.157808
7,1078.0,2012,18.034351,164.366412,4.740458,3.931298,0.687023,0.023918,0.144928
8,1602.0,2005,24.978261,8.391304,9.630435,0.782609,0.826087,0.093264,0.085779
9,1602.0,2006,21.701493,12.0,7.447761,0.686567,0.910448,0.057214,0.122244


In [25]:
# One row per firm-year: patent count plus the citation averages and ratios.
bio = pd.merge(biosize, biocit, how='left', on=['gvkey', 'year'])
ee = pd.merge(eesize, eecit, how='left', on=['gvkey', 'year'])

(len(bio), len(ee))

(2203, 3174)

In [29]:
# Feature files without the two ratio columns.
bio1 = bio.drop(labels=['self_ratio', 'self_ratio_fwd'], axis='columns')
ee1 = ee.drop(labels=['self_ratio', 'self_ratio_fwd'], axis='columns')

for _frame, _path in ((bio1, 'E:/apps/feature/patent_bio.csv'),
                      (ee1, 'E:/apps/feature/patent_EE&IT.csv')):
    _frame.to_csv(_path, index=False)

In [30]:
# Which columns contain at least one missing value?
bio.isnull().any(axis=0)

gvkey             False
year              False
pat_size          False
num_claims        False
bwd               False
fwd               False
self              False
self_fwd          False
self_ratio         True
self_ratio_fwd     True
dtype: bool

In [31]:
# Remove every firm-year row that contains a missing value.
bio = bio.dropna(how='any')
ee = ee.dropna(how='any')

(len(bio), len(ee))

(2109, 3169)

In [33]:
# Feature files that include the self-citation ratio columns.
for _frame, _path in ((bio, 'E:/apps/feature/patent_self_bio.csv'),
                      (ee, 'E:/apps/feature/patent_self_EE&IT.csv')):
    _frame.to_csv(_path, index=False)

__3. herfindahl index__

In [1]:
import pandas as pd
import numpy as np
import csv
import os
import math

# get_herf() below reads its class_vector_*.csv inputs by relative path, so
# switch to the folder that contains them first.
os.chdir('E:/apps/appcit measure')

In [2]:
header = ['gvkey', 'year', 'herf_cls']


def _open_csv_for_write(path):
    """Open *path* for ``csv.writer`` on whichever Python is running.

    Python 3's csv writer emits ``str`` and needs a text-mode file opened
    with ``newline=''``; Python 2's emits bytes and needs binary mode.
    """
    import sys

    if sys.version_info[0] >= 3:
        return open(path, 'w', newline='')
    return open(path, 'wb')


def get_herf(ind, y_start, y_end, out_dir='E:/apps/feature'):
    """Write a per-firm, per-year Herfindahl index to ``cls_herf_<ind>.csv``.

    For each year in ``[y_start, y_end]`` (both inclusive) the file
    ``class_vector_<ind>_<year>.csv`` is read from the current working
    directory. For every row, the first column is written out as ``gvkey``
    and the remaining columns are squared, summed and rounded to three
    decimals (presumably they are class shares -- verify against whatever
    produces the class_vector files).

    Args:
        ind: Field tag used in the input and output file names.
        y_start: First year to process.
        y_end: Last year to process (inclusive).
        out_dir: Directory that receives the output file; defaults to the
            location the notebook has always used.
    """
    fname = out_dir + '/cls_herf_' + ind + '.csv'
    with _open_csv_for_write(fname) as f:
        writer = csv.writer(f)
        writer.writerow(header)

        for y in range(y_start, 1 + y_end):
            df = pd.read_csv('class_vector_' + ind + '_' + str(y) + '.csv')

            # Pull both parts out once instead of calling df.iloc per row.
            firm_ids = df.iloc[:, 0].tolist()
            vectors = df.iloc[:, 1:].values.tolist()

            writer.writerows(
                [firm, y, round(sum(x ** 2 for x in vec), 3)]
                for firm, vec in zip(firm_ids, vectors)
            )

In [3]:
# Build the Herfindahl files for both fields over 2005-2012.
for _field in ('bio', 'EE&IT'):
    get_herf(_field, 2005, 2012)