# USPTO의 XML data 처리

1. USPTO는 XML 형태, 매주 목요일 기준으로 application 데이터 공개
2. App NO, DN, Assignee, Class, Filed date, Country 정보 추출 가능
3. 2002 ~ 2004, 2005 ~ 2013 application 정보를 추출

### 1. xml 파일 처리

- USPTO가 제공하는 xml은 multiroot 
- 각 app이 하나의 xml, 그걸 그냥 모아놓은 거
- __따라서 app 1개씩 잘라야 xml parsing이 가능__
- https://github.com/funginstitute/patentprocessor/blob/master/parse.py 참고

In [1]:
#-*- coding: utf-8 -*-
import os
import re
import csv
import sys
import xml.etree.ElementTree as et
import datetime as dt

# NOTE(review): the builtin `reload` and `sys.setdefaultencoding` are Python 2
# features; this cell is not expected to run under Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')  # to process name of assignee in unicode 
# Relative paths used later ('xmls/...', 'app_profile.csv', 'app_class.csv')
# resolve against this working directory.
os.chdir('E:/apps')

In [2]:
def extract_xml_strings(filename):
    """Yield each XML document contained in a concatenated (multi-root) file.

    The file is read line by line. The root element name of the current
    document is taken from its ``<!DOCTYPE name SYSTEM`` line; the document
    ends at the first following line that starts with ``</name>``.

    Args:
        filename: path of the file holding the concatenated XML documents.

    Yields:
        One string per document, containing every line read since the
        previous document ended (so any XML declaration line is included).
        Trailing lines that are never closed by a matching end tag are
        not yielded.
    """
    doctype_regex = re.compile('^<!DOCTYPE (.*) SYSTEM')
    closing_tag = None   # literal '</root>' of the current document, once known
    lines = []           # lines of the current document, joined on yield
    with open(filename, 'r') as f:
        for line in f:
            lines.append(line)
            if closing_tag is None:
                found = doctype_regex.findall(line)
                if not found:
                    continue      # DOCTYPE not seen yet, keep accumulating
                # Build the closing tag once per document instead of compiling
                # a regex per line; a literal comparison also avoids treating
                # characters such as '.' in the root name as regex syntax.
                closing_tag = '</{0}>'.format(found[0])
            if line.startswith(closing_tag):
                yield ''.join(lines)
                closing_tag = None
                lines = []

### 2. csv 파일 만들기
- csv 2개 만들 거
- dn, app no, assignee, country, date 담은 csv  
- dn, app no, class 담은 csv
- 각 정보 list에 담고 그걸 line으로 해서 csv 쓰자
- 문제는 02 ~ 04와 05 ~ 13 xml 구조가 다름  
    => 02 ~ 04 csv 만든 후, 그 밑에 05 ~ 13 붙이자

__code 구조__
- filename 제작
- xml 불러와서 쪼개기
- 쪼갠 xml parsing해서 원하는 정보 뽑기  
    dn, appno, date : app 당 1개 뿐 -> 찍어서 뽑기   
    assignee, country, class : 여러 개 가능 -> iter 사용
- list에 담고 writing csv
- date +1주

__xml 뽑을 element (02 ~ 04)__
- DN : ~ / doc-number 첫 번째
- APP No : ~ / doc-number 두 번째
- Filed date (02 ~ 04) : subdoc-bibliographic-information / domestic-filing-data / filing-date    
- Class (02 ~ 04) : ~ / uspc / class, subclass  
    **issue** : 3자리로 맞추기 위해 채운 0이 그대로 남아 있음  __ex) 312/29.1 = 312029001 => classmaker fn 사용__    
- Assignee : assignee / organization-name   
- Country : assignee / country / country-code

__xml 뽑을 element (05 ~ 13)__
- DN : ~ / doc-number 첫 번째
- APP No : ~ / doc-number 두 번째
- Filed date (05 ~ 13) : us-bibliographic-data-application / application-reference / document-id / date
- Class : us-bibliographic-data-application / classification-national / main-classification  
    **issue** : 3자리로 맞추기 위해 채운 0 또는 공백이 그대로 남아 있음  __ex) 312/29.1 = 312029001 or 312 291  => classmaker fn 사용__    
- Assignee : us-bibliographic-data-application / assignees / addressbook / orgname   
- Country : us-bibliographic-data-application / assignees / addressbook / address / country

In [3]:
def digitize(cls):
    """Strip zero padding from one fixed-width segment of a class code.

    Args:
        cls: a string segment such as '029', '000', 'D02' or '01A'.

    Returns:
        For an all-numeric segment, its integer value as a string
        ('029' -> '29', '000' -> '0'). For a segment that is not a valid
        integer, the segment with only its leading zeros removed
        ('01A' -> '1A', 'D02' -> 'D02'); zeros elsewhere are kept and an
        empty segment is returned unchanged.
    """
    try:
        return str(int(cls))
    except ValueError:
        # lstrip removes just the leading padding; replacing every '0'
        # would also drop zeros that are part of the code itself.
        return cls.lstrip('0')

In [10]:
def classmaker(cls,line):
    """Append the main class and subclass decoded from a raw class code to ``line``.

    Spaces in ``cls`` are replaced by '0' and the code is right-padded with
    '0' to at least nine characters. Characters 0-2 give the main class,
    3-5 the subclass and 6-8 the part after the decimal point; each part
    goes through ``digitize``. Two items are appended to ``line``: the main
    class, then either the subclass alone (when the last part digitizes to
    '0') or 'subclass.decimal'.

    If an AttributeError is raised (e.g. ``cls`` is None because the value
    is missing in the XML), it is swallowed.
    """
    try:
        padded = cls.replace(' ', '0').ljust(9, '0')
        main_part = padded[0:3]
        sub_part = padded[3:6]
        decimal_part = padded[6:9]

        line.append(digitize(main_part))
        decimal = digitize(decimal_part)
        if decimal == '0':
            line.append(digitize(sub_part))
        else:
            line.append('.'.join([digitize(sub_part), decimal]))
    except AttributeError:      # missing value: cls has no .replace
        return

2002 ~ 2004 데이터에는 HTML에서 온 것으로 보이는 정의되지 않은 entity가 섞여 있음 -> entity 에러를 피하기 위해 parser에 별도 처리 필요

In [5]:
class AllEntities:
    """Mapping-like stand-in whose lookup succeeds for every key.

    Subscripting an instance returns the key itself, so any entity name
    looked up through it resolves to that same name.
    """

    def __getitem__(self, entity):
        # Echo the requested name back as its own replacement value.
        resolved = entity
        return resolved

In [6]:
# Create both CSVs and fill them from the weekly files dated 20020103 up to,
# but not including, 20050106 (the older XML layout).
date = '20020103'  # date stamp (YYYYMMDD) of the first weekly file to process

# 'wb' starts both output files from empty; this cell also writes the header rows.
with open('app_profile.csv', 'wb') as prf, open('app_class.csv', 'wb') as cls:
    prf_writer = csv.writer(prf,quoting=csv.QUOTE_ALL)
    cls_writer = csv.writer(cls,quoting=csv.QUOTE_ALL)
    
    # NOTE: data rows can be longer than these headers, because one
    # name/country or class/subclass pair is appended per match found below.
    prf_writer.writerow(['number','id','date','assignee','country'])
    cls_writer.writerow(['number','id','mainclass','subclass','further_m','further_s'])
    
    while not date=='20050106':
        filename = 'xmls/ipab'+ date + '.xml'   # weekly file name built from the date stamp
        xml = extract_xml_strings(filename)   # generator yielding one application XML string at a time
        for doc in xml:
            # A fresh parser per document whose entity table echoes any
            # entity name back, so unknown entities do not abort parsing.
            parser = et.XMLParser()
            # NOTE(review): presumably required for expat to consult
            # parser.entity for undefined entities - confirm.
            parser.parser.UseForeignDTD(True)
            parser.entity = AllEntities()
            root = et.fromstring(doc, parser=parser)
            # The first 'doc-number' is used as the document number and the
            # second as the application number (formatted as 'NN/NNNNNN').
            n = root.iter('doc-number')
            dn = n.next().text
            appno = (lambda x: x[0:2] + '/' + x[2:8])(n.next().text)
            
            # First 'filing-date' element, reformatted from YYYYMMDD to YYYY-MM-DD.
            n = root.iter('filing-date')
            fdate = (lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8])(n.next().text)
            
            aline = [dn,appno,fdate]   # row for app_profile.csv
            cline = [dn,appno]         # row for app_class.csv
            
            for ass in root.iter('assignee'):   # iter because an application can have several assignees
                for n in ass.iter("organization-name"):    # inner loop simply adds nothing when the name is missing
                    name = n.text
                    aline.append(name)
                
                for n in ass.iter("country-code"):    # inner loop simply adds nothing when the country is missing
                    country = n.text
                    aline.append(country)    
                
            # Nothing was appended above (row still has 3 items): mark the row 'private'.
            aline.append('private') if len(aline)==3 else aline
            
            # NOTE: this loop rebinds the name `cls`, which until here referred
            # to the class CSV file object; cls_writer was created from it earlier.
            for cls in root.iter('uspc'):  # iter because an application can have several classes
                # Text of the first two children concatenated
                # (presumably class and subclass - confirm against the DTD).
                rawclass = cls[0].text + cls[1].text
                classmaker(rawclass,cline)           
                          
            prf_writer.writerow(aline)
            cls_writer.writerow(cline)
        
        date = dt.datetime.strptime(date,'%Y%m%d')   # advance the date stamp by one week for the next file
        date += dt.timedelta(weeks=1)
        date = date.strftime('%Y%m%d')            

**issue: country code에 이상한 값이 많음 (기업 이름이 들어가 있거나 공백이 포함됨 - 'US' vs ' US')**

2005 ~ 2013

In [7]:
# Append rows for the weekly files dated 20050106 up to, but not including,
# 20140102 (the newer XML layout) to the CSVs created earlier.
date = '20050106'  # date stamp (YYYYMMDD) of the first weekly file to process

# 'ab' appends to the existing CSVs; no header row is written here.
with open('app_profile.csv', 'ab') as prf, open('app_class.csv', 'ab') as cls:
    prf_writer = csv.writer(prf,quoting=csv.QUOTE_ALL)
    cls_writer = csv.writer(cls,quoting=csv.QUOTE_ALL)
       
    while not date=='20140102':
        filename = 'xmls/ipab'+ date + '.xml'   # weekly file name built from the date stamp
        xml = extract_xml_strings(filename)   # generator yielding one application XML string at a time
        for doc in xml:
            root = et.fromstring(doc)
            # The first 'doc-number' is used as the document number and the
            # second as the application number (formatted as 'NN/NNNNNN').
            n = root.iter('doc-number')
            dn = n.next().text
            appno = (lambda x: x[0:2] + '/' + x[2:8])(n.next().text)
            # Positional lookup of the filing date, reformatted to YYYY-MM-DD
            # (presumably application-reference / document-id / date - confirm).
            fdate = (lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8])(root[0][1][0][2].text)
                
            aline = [dn,appno,fdate]   # row for app_profile.csv
            cline = [dn,appno]         # row for app_class.csv
            
            for ass in root.iter('assignee'):   # iter because an application can have several assignees
                for n in ass.iter("orgname"):    # inner loop simply adds nothing when the name is missing
                    name = n.text
                    aline.append(name)
                
                for n in ass.iter("country"):    # inner loop simply adds nothing when the country is missing
                    country = n.text
                    aline.append(country) 
                
            # Nothing was appended above (row still has 3 items): mark the row 'private'.
            aline.append('private') if len(aline)==3 else aline
            
            # NOTE: the loops below rebind the name `cls`, which until here
            # referred to the class CSV file object.
            for cls in root.iter('main-classification'):  # iter because an application can have several classes
                rawclass = cls.text   # raw class code; passed as None when the element has no text
                classmaker(rawclass,cline)
            
            for cls in root.iter('further-classification'):  # additional classes go into the same row
                rawclass = cls.text
                classmaker(rawclass,cline)
                          
            prf_writer.writerow(aline)
            cls_writer.writerow(cline)
        
        date = dt.datetime.strptime(date,'%Y%m%d')   # advance the date stamp by one week for the next file
        date += dt.timedelta(weeks=1)
        date = date.strftime('%Y%m%d')            

AttributeError: 'NoneType' object has no attribute 'replace'

class에도 missing 값이 있음 -> classmaker 함수를 수정한 후 중단된 날짜부터 다시 시작

In [8]:
date

'20080221'

In [9]:
dn

'20080044287'

In [None]:
# Resume run: the same processing as the 2005-2013 loop, restarted from the
# weekly file at which the earlier run stopped with the AttributeError.
# NOTE(review): rows appended from the 20080221 file before that failure are
# presumably still in the CSVs, so this restart may duplicate them - verify.
date = '20080221'  # date stamp (YYYYMMDD) of the weekly file to resume from

# 'ab' appends to the existing CSVs; no header row is written here.
with open('app_profile.csv', 'ab') as prf, open('app_class.csv', 'ab') as cls:
    prf_writer = csv.writer(prf,quoting=csv.QUOTE_ALL)
    cls_writer = csv.writer(cls,quoting=csv.QUOTE_ALL)
       
    while not date=='20140102':
        filename = 'xmls/ipab'+ date + '.xml'   # weekly file name built from the date stamp
        xml = extract_xml_strings(filename)   # generator yielding one application XML string at a time
        for doc in xml:
            root = et.fromstring(doc)
            # The first 'doc-number' is used as the document number and the
            # second as the application number (formatted as 'NN/NNNNNN').
            n = root.iter('doc-number')
            dn = n.next().text
            appno = (lambda x: x[0:2] + '/' + x[2:8])(n.next().text)
            # Positional lookup of the filing date, reformatted to YYYY-MM-DD
            # (presumably application-reference / document-id / date - confirm).
            fdate = (lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8])(root[0][1][0][2].text)
                
            aline = [dn,appno,fdate]   # row for app_profile.csv
            cline = [dn,appno]         # row for app_class.csv
            
            for ass in root.iter('assignee'):   # iter because an application can have several assignees
                for n in ass.iter("orgname"):    # inner loop simply adds nothing when the name is missing
                    name = n.text
                    aline.append(name)
                
                for n in ass.iter("country"):    # inner loop simply adds nothing when the country is missing
                    country = n.text
                    aline.append(country) 
                
            # Nothing was appended above (row still has 3 items): mark the row 'private'.
            aline.append('private') if len(aline)==3 else aline
            
            # NOTE: the loops below rebind the name `cls`, which until here
            # referred to the class CSV file object.
            for cls in root.iter('main-classification'):  # iter because an application can have several classes
                rawclass = cls.text   # raw class code; passed as None when the element has no text
                classmaker(rawclass,cline)
            
            for cls in root.iter('further-classification'):  # additional classes go into the same row
                rawclass = cls.text
                classmaker(rawclass,cline)
                          
            prf_writer.writerow(aline)
            cls_writer.writerow(cline)
        
        date = dt.datetime.strptime(date,'%Y%m%d')   # advance the date stamp by one week for the next file
        date += dt.timedelta(weeks=1)
        date = date.strftime('%Y%m%d')            