In [1]:
import xml.etree.ElementTree as ET 
import pandas as pd
import codecs
import os
import zipfile
import math
import numpy as np
import glob
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from joblib import Parallel, delayed



In [2]:
#find all files of the specific type in the folder
def filebrowser(ext=""):
    """Return the names in the current working directory that end with ``ext``.

    Parameters
    ----------
    ext : str, optional
        Suffix to match, e.g. ``'zip'`` or ``'.zip'``. It is appended to a
        ``*`` wildcard and handed to ``glob.glob``, so the default ``""``
        matches every name the ``*`` pattern matches.

    Returns
    -------
    list of str
        Matching names in sorted order. ``glob.glob`` itself gives no ordering
        guarantee, so sorting keeps the result reproducible across runs and
        file systems.
    """
    return sorted(glob.glob(f"*{ext}"))

In [3]:
# Make the 2012 input folder the working directory; the relative names used in
# the cells that follow (glob pattern, zip archive, CSV output) resolve against it.
# NOTE(review): absolute, machine-specific Windows path — must be edited before
# this notebook can run on another machine.
os.chdir("C:\\Users\\fjying\\Desktop\\RA_Columbia\\patentparsing\\" + "input_2012")

In [4]:
#07-09
# NOTE(review): the "07-09" label above is not explained in this cell — confirm
# its meaning or remove it.
# List every name in the current working directory that ends with "zip".
data_list = filebrowser('zip')
# Bare expression so the notebook displays the list as the cell output.
data_list

['ad20121231-01.zip',
 'ad20121231-02.zip',
 'ad20121231-03.zip',
 'ad20121231-04.zip',
 'ad20121231-05.zip',
 'ad20121231-06.zip',
 'ad20121231-07.zip',
 'ad20121231-08.zip',
 'ad20121231-09.zip',
 'ad20121231-10.zip',
 'ad20121231-11.zip',
 'ad20121231-12.zip']

In [5]:
# Parse the first XML member of the archive and collect its assignment records.
# The archive and the member stream are both opened in ``with`` blocks so their
# handles are released as soon as the tree has been read into memory.
# Previously a handle was opened for every member and none was ever closed;
# that list of open handles (``xmls``) is intentionally no longer created.
with zipfile.ZipFile('ad20121231-01.zip', 'r') as zf:
    xml_names = zf.namelist()
    # Only the first member is parsed, so only that one is opened.
    with zf.open(xml_names[0]) as xml_file:
        tree = ET.parse(xml_file)
root = tree.getroot()
# One element per patent-assignment record found under the root.
pas = root.findall('patent-assignments/patent-assignment')

In [6]:
def _find_text(element, path):
    """Return the text of the first sub-element matching ``path``, or None.

    None is returned both when nothing matches ``path`` and when the matching
    sub-element carries no text, so callers never dereference a missing node.
    """
    node = element.find(path)
    return None if node is None else node.text


def parse_xml(pa):
    """Flatten one ``patent-assignment`` element into a DataFrame.

    Parameters
    ----------
    pa : xml.etree.ElementTree.Element
        A single ``patent-assignment`` element.

    Returns
    -------
    pandas.DataFrame or None
        None when the record is skipped: its conveyance text is missing or is
        not the plain "assignment of assignors interest" text, or it lists no
        7-character document number. Otherwise one row per kept document
        number, with these columns in order: ``patent_ids``,
        ``last_update_date``, ``reel-no``, ``frame-no``, ``assignor``, one
        ``<tag>_<i>`` column per child element of the i-th assignee, and
        ``total_number_assignees``. Record-level values are repeated on every
        row; a field that is absent from the XML is stored as None instead of
        raising.
    """
    # no meaningful patent transfer: jump
    # (a missing conveyance-text element is treated the same way)
    conveyance = _find_text(pa, 'assignment-record/conveyance-text')
    if conveyance != 'ASSIGNMENT OF ASSIGNORS INTEREST (SEE DOCUMENT FOR DETAILS).':
        return None

    patent_ids = []
    for document_id in pa.findall('patent-properties/patent-property/document-id'):
        doc_number = _find_text(document_id, 'doc-number')
        # Keep only 7-character document numbers; entries without a
        # doc-number are skipped rather than crashing on len(None).
        if doc_number is not None and len(doc_number) == 7:
            patent_ids.append(doc_number)
    #no patent id: jump
    if not patent_ids:
        return None

    # The list-valued column fixes the number of rows; every scalar below is
    # broadcast to all rows when the frame is built.
    record = {
        'patent_ids': patent_ids,
        'last_update_date': _find_text(pa, 'assignment-record/last-update-date/date'),
        'reel-no': _find_text(pa, 'assignment-record/reel-no'),
        'frame-no': _find_text(pa, 'assignment-record/frame-no'),
        # Only the first assignor's name is recorded.
        # No address for assignor in the record
        'assignor': _find_text(pa, 'patent-assignors/patent-assignor/name'),
    }

    # Every child element of each assignee becomes its own column, suffixed
    # with the assignee's position so several assignees do not collide.
    assignee_count = 0
    for assignee in pa.findall('patent-assignees/patent-assignee'):
        for field in assignee:
            record[field.tag + '_' + str(assignee_count)] = field.text
        assignee_count += 1
    record['total_number_assignees'] = assignee_count
    return pd.DataFrame(record)

In [7]:
# Apply parse_xml to every patent-assignment element using joblib worker threads.
# n_jobs=-1 asks joblib to use all available CPUs (the log below reports 4 workers).
# results receives one entry per element: a DataFrame, or None for a skipped record.
with Parallel(n_jobs=-1, verbose=1, prefer="threads") as parallel:
    results = parallel(delayed(parse_xml)(pa) for pa in pas)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 11242 tasks  

In [8]:
# parse_xml yields None for skipped records. pd.concat drops None entries but
# raises ValueError when nothing is left to concatenate, so filter first and
# fall back to an empty frame.
# ignore_index=True gives the combined table one continuous 0..n-1 index
# instead of repeating each per-record index.
frames = [frame for frame in results if frame is not None]
total = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [9]:
# Write the combined table to a CSV file in the working directory;
# index=False leaves the row index out of the file.
total.to_csv("ad20121231-01.csv", index = False)