# Combining patent data with KCMO Business Licenses

In [210]:
## import Python packages ##
import time # to convert time as needed and report how long some functions take

# interacting with websites and web-APIs
import requests # easy way to interact with web sites and services
import json # read/write JavaScript Object Notation (JSON)
from bs4 import BeautifulSoup

# data manipulation
import pandas as pd # easy data manipulation
from pandas import DataFrame
# import geopandas as gpd # geographic data manipulation
# from geopandas.tools import sjoin, overlay # spatial join and overlay functions
# from shapely.geometry import Point, LineString # to create lines from a list of points

# visualization
import matplotlib as mplib
import matplotlib.pyplot as plt # visualization package

import math
import os
import numpy as np 

# so images get plotted in the notebook
%matplotlib inline

import time

## Merge patent data and KCMO business licenses on Business Name

In [87]:
# Read the two raw CSV inputs, in this order: the USPTO patent table into
# `patent_data`, then the KCMO business-licence table into `kcmo`.
patent_data, kcmo = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/USPTO/patent_data.csv',
        '../../data/KCMO/BusinessLicense2013_2018NYU_01222018.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


___Drop duplicate data for KCMO Business Licenses___

In [88]:
# Keep the first occurrence of every fully identical licence row, then
# summarise all columns (numeric and non-numeric) of what is left.
kcmo = kcmo.drop_duplicates(subset=None, keep='first')
kcmo.describe(include='all')

Unnamed: 0,fstrBusinessActivity,fstrAddress,fstrlegalName,fstrDBAName,fdtmfilingPeriod
count,99345,99345,99345,33719,99345
unique,626,30654,33593,10351,6
top,Commercial and Institutional Building Construc...,2405 GRAND BLVD STE 1020 KANSAS CITY MO 64108-...,REDBOX AUTOMATED RETAIL LLC,CASH AMERICA PAWN OF KANSAS CITY,12/31/17
freq,19273,139,313,70,23873


___ Drop patents dated before 2003; keep only patents whose ID starts with a digit (utility patents?)___

In [89]:
# Keep rows whose `date` parses to the year 2003 or later; values that cannot
# be parsed become NaT and are dropped by the comparison.
patent_data = patent_data[
    pd.to_datetime(patent_data['date'], errors='coerce').dt.year.ge(2003)
]
# Keep rows whose patent_id begins with a numeric character.
patent_data = patent_data[
    patent_data['patent_id'].str.slice(stop=1).str.isnumeric()
]
patent_data.describe(include='all')

Unnamed: 0,id,patent_id,type,number,country,date,assignee_id,name_first,name_last,organization
count,2839964,2839964,2839964,2839964,2839964,2839964,2606010,19122,19122,2563263
unique,2735542,2740804,12,2735564,1,5251,168511,8679,8672,153413
top,2014/14553554,9657092,12,14114995,US,2013-03-15,29a03fd21a4c9b1420a55ecba2105eae,Michael,Lee,International Business Machines Corporation
freq,15,15,657111,15,2839964,8449,67147,148,261,67147


In [90]:
# Flag rows whose patent_id begins with a numeric character.
patent_data['flag'] = patent_data['patent_id'].str.slice(stop=1).str.isnumeric()

___ RegEx the business names for matching ___

In [91]:
# Normalise the organisation names of both tables with one shared set of rules
# so that they can be joined on exact string equality.

# Ordered (pattern, replacement) pairs. Every pattern is a regular expression
# and the rules are applied top to bottom, so the order matters (for example
# 'U S A' has to be collapsed before 'U S').
NAME_CLEANING_RULES = [
    (r'\.', ''),                       # remove .
    (r',', ''),                        # remove ,
    (r"'", ' '),                       # replace ' by a space
    (r'-', ' '),                       # replace - by a space
    (r'\s?&\s?', ' & '),               # surround & with spaces
    # NOTE(review): the next two rules are plain substring matches, so they
    # also rewrite longer words that merely contain the text -- confirm that
    # this is intended.
    (r'COMPANY', 'CO'),                # replace COMPANY by CO
    (r'NATIONAL ASSOCIATION', 'NA'),   # replace NATIONAL ASSOCIATION by NA
    (r'\bN\sA\b', 'NA'),               # replace N A by NA
    (r'\bU\sS\sA\b', 'USA'),           # replace U S A by USA
    (r'\bU\sS\b', 'US'),               # replace U S by US
    (r'^THE\b', ''),                   # drop a leading THE
    (r'\bTHE$', ''),                   # drop a trailing THE
    (r'[^A-Z\s0-9\&]', ''),            # drop every remaining special character
    # A more radical alternative that was considered is to keep letters only,
    # i.e. (r'[^A-Z]', '').
]


def clean_business_name(names):
    """Return an upper-cased, normalised copy of a Series of business names.

    Parameters
    ----------
    names : pandas.Series
        Raw names. Missing values stay missing.

    Returns
    -------
    pandas.Series
        The names after upper-casing, applying NAME_CLEANING_RULES in order
        and stripping leading/trailing whitespace.
    """
    cleaned = names.str.upper()
    for pattern, replacement in NAME_CLEANING_RULES:
        # regex=True is passed explicitly: pandas 2.0 changed the default of
        # Series.str.replace to regex=False, which would turn every rule
        # above into a literal match. (The keyword needs pandas >= 0.23.)
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned.str.strip()


patent_data['organization_clean'] = clean_business_name(patent_data['organization'])
kcmo['business_name_clean'] = clean_business_name(kcmo['fstrlegalName'])

In [92]:
#string = "THE"
#kcmo[kcmo['Business Name'].str.contains(string)]
#patent_data[patent_data['organization'].str.contains(string)]

___ Merge datasets ___

In [93]:
# Left join: every licence row is kept, and the patent columns are filled in
# wherever the cleaned licence name equals a cleaned patent organisation name.
kcmo_patents = kcmo.merge(
    patent_data,
    how='left',
    left_on='business_name_clean',
    right_on='organization_clean',
)

In [94]:
# Summarise every column of the merged frame. The join keys are not unique on
# either side, so compare the row count here with the licence table's count
# to see how far the left join multiplied rows.
kcmo_patents.describe(include = 'all')

Unnamed: 0,fstrBusinessActivity,fstrAddress,fstrlegalName,fstrDBAName,fdtmfilingPeriod,business_name_clean,id,patent_id,type,number,country,date,assignee_id,name_first,name_last,organization,flag,organization_clean
count,702398,702398,702398,77678,702398,702398,605519,605519,605519,605519,605519,605519,605519,0.0,0.0,605519,605519,605519
unique,626,30654,33593,10351,6,33589,92803,92901,7,92803,1,3924,409,0.0,0.0,409,1,402
top,Consumer Electronics Repair and Maintenance,1 MICROSOFT WAY REDMOND WA 98052-8300,MICROSOFT CORPORATION,SPRINT PCS,12/31/16,MICROSOFT CORPORATION,2014/14310674,7787987,11,10824781,US,2006-04-21,237c2b0099548ddbfa5a37f07e0687ab,,,Microsoft Corporation,True,MICROSOFT CORPORATION
freq,130162,100395,100395,7185,151310,100395,313,313,170623,313,605519,1807,100395,,,100395,605519,100395


## API for Patent Descriptions

___ Generate list of relevant patents___

In [247]:
# Distinct, non-missing patent ids that matched a licence, in order of first
# appearance; the count is printed as a quick sanity check.
patents = kcmo_patents['patent_id'].dropna().drop_duplicates().tolist()
print(len(patents))

In [248]:
# patents = patents[:500]

___ Query PatentsView API (one patent at a time)___

In [249]:
# df = pd.DataFrame("", index=np.arange(0), columns=['patent_abstract', 'patent_number', 'patent_title'])

# start_time = time.time()
# for patent in patents:
#     base_url = 'http://www.patentsview.org/api/patents/query?'
#     query = 'q={"patent_number":"'+ patent + '"}'
#     fields='&f=["patent_number", "patent_title", "patent_abstract"]'
#     sort= '' #&s=
#     options= '&o={"page":1,"per_page": 500}'
#     url = '{}{}{}{}{}'.format(base_url, query, fields, sort, options)
    
#     response = requests.get(url)
#     response = json.loads(response.text)
#     temp = DataFrame.from_dict(response['patents'])
#     df = pd.concat([df, temp])

# df = df.reset_index(drop = True)
# print("--- %s seconds ---" % (time.time() - start_time))

In [250]:
# df

___ Query PatentsView API (200 patents at a time)___

In [251]:
# Download number, title and abstract for every entry of `patents` from the
# PatentsView query API, BATCH_SIZE patents per request, and collect the rows
# in `df`.
# NOTE(review): confirm that this endpoint is still being served before
# relying on a re-run.
BATCH_SIZE = 200   # patent numbers sent per request
PER_PAGE = 500     # page size requested; keep it >= BATCH_SIZE so that a
                   # single page can hold every hit of a batch
base_url = 'http://www.patentsview.org/api/patents/query'
fields = ["patent_number", "patent_title", "patent_abstract"]

records = []
start_time = time.time()
# Stepping through the start offsets never produces an empty batch, including
# when len(patents) is 0 or an exact multiple of BATCH_SIZE.
for batch_start in range(0, len(patents), BATCH_SIZE):
    sub_patents = patents[batch_start:batch_start + BATCH_SIZE]
    # Build the JSON arguments with json.dumps and let requests URL-encode
    # them instead of concatenating the query string by hand.
    params = {
        'q': json.dumps({'_or': [{'patent_number': number} for number in sub_patents]}),
        'f': json.dumps(fields),
        'o': json.dumps({'page': 1, 'per_page': PER_PAGE}),
    }
    reply = requests.get(base_url, params=params, timeout=60)
    reply.raise_for_status()  # stop on an HTTP error rather than parsing an error page
    # `response` is left holding the decoded payload of the last batch because
    # a later cell reads it.
    response = reply.json()
    # Guards against a missing or null 'patents' entry in the payload.
    records.extend(response.get('patents') or [])

# Build the frame once instead of concatenating inside the loop.
df = pd.DataFrame(records, columns=['patent_abstract', 'patent_number', 'patent_title'])
print("--- %s seconds ---" % (time.time() - start_time))

--- 281.40145778656006 seconds ---


In [252]:
df

Unnamed: 0,patent_abstract,patent_number,patent_title
0,An abrading composition and a process for abra...,6663467,Process and composition for abrading pre-finis...
1,A fan deck display having a case including a c...,6665965,Ergonomic color sample fan deck
2,A leak detection system for the detection and ...,6772598,Refrigerant leak detection system
3,A system and method for method for measuring t...,6826956,Differential pressure level control
4,The present invention provides a method of ide...,6861229,Method of identifying a gene product
5,A tool extension assembly including an extensi...,6874201,Tool extension assembly with quick release loc...
6,An aerosol archival product that includes an a...,6890455,Archival spray composition
7,"A method of finishing a wood product, such as ...",6892105,Method of producing a wood finish at a locatio...
8,A square paint container having recesses forme...,6896156,Plastic paint container having a cube-shaped body
9,The inventive method of producing a eukaryotic...,6908762,Method of preparing a eukaryotic viral vector


#############################
#############################
#############################

In [256]:
# Remove anything that looks like a markup tag ('<' ... '>') from a single
# patent abstract and display the result.
# NOTE(review): this reads `response`, i.e. whatever payload the most recently
# executed request cell left in the kernel, and the hard-coded row label 596.
# Neither is defined by this cell -- confirm which payload was intended and
# that it contains that many rows before re-running.
import re
re.sub('<[^>]*>', '', DataFrame.from_dict(response['patents'])['patent_abstract'][596])

'"A system method that adapts wireless link parameters for a wireless communication link. A measure is determined of errors signal quality occurring in communication over a wireless link. In a case that the measure of errors signal quality corresponds to more errors than a first predetermined threshold, communication changes from a first set of wireless link parameters to a second set of wireless link parameters. The second set of wireless link parameters corresponds to higher error tolerance than the first set of wireless link parameters. In a case that the measure of errors signal quality corresponds to fewer errors than a second predetermined threshold, communication changes from the first set of wireless link parameters to a third set of wireless link parameters. The third set of wireless link parameters corresponds to lower error tolerance than the first set of wireless link parameters. Preferably, the measure of errors signal quality is determined by monitoring a number of NACK m