# Combining patent data with KCMO Business Licenses

In [2]:
## import Python packages ##
import time # to convert time as needed and report how long some functions take

# interacting with websites and web-APIs
import requests # easy way to interact with web sites and services
import json # read/write JavaScript Object Notation (JSON)
from bs4 import BeautifulSoup

# data manipulation
import pandas as pd # easy data manipulation
from pandas import DataFrame
# import geopandas as gpd # geographic data manipulation
# from geopandas.tools import sjoin, overlay # spatial join and overlay functions
# from shapely.geometry import Point, LineString # to create lines from a list of points

# visualization
import matplotlib as mplib
import matplotlib.pyplot as plt # visualization package

import math
import os

# so images get plotted in the notebook
%matplotlib inline

## Merge patent data and KCMO business licenses on Business Name

In [6]:
patent_data = pd.read_csv('../../data/USPTO/patent_data.csv')
kcmo = pd.read_csv('../../data/KCMO/BusinessLicense2013_2018NYU_01222018.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
kcmo = kcmo.drop_duplicates()
kcmo.describe(include = 'all')

Unnamed: 0,fstrBusinessActivity,fstrAddress,fstrlegalName,fstrDBAName,fdtmfilingPeriod
count,99345,99345,99345,33719,99345
unique,626,30654,33593,10351,6
top,Commercial and Institutional Building Construc...,2405 GRAND BLVD STE 1020 KANSAS CITY MO 64108-...,REDBOX AUTOMATED RETAIL LLC,CASH AMERICA PAWN OF KANSAS CITY,12/31/17
freq,19273,139,313,70,23873


In [8]:
patent_data.describe(include = 'all')

Unnamed: 0,id,patent_id,type,number,country,date,assignee_id,name_first,name_last,organization
count,6599785,6599785,6599785,6599785,6599785,6599785,5574600,62127,62128,5466526
unique,6204979,6422962,28,6204890,1,16720,374807,19936,22516,332498
top,8,4185912,9,8,US,1995-06-07,29a03fd21a4c9b1420a55ecba2105eae,Michael,Lee,International Business Machines Corporation
freq,131072,35,733624,151684,6599785,10537,110452,433,559,110452


In [10]:
patent_data['organization_clean'] = patent_data['organization'].str.upper()
kcmo['business_name_clean'] = kcmo['fstrlegalName'].str.upper()

# remove .
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace('\.', '')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace('\.', '')


# remove .
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace('\,', '')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace('\,', '')


# replace ' by \s
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace('\'', ' ')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace('\'', ' ')

# replace - by \s
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace('-', ' ')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace('-', ' ')

# Replace & by \s&\s
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'\s?&\s?', ' & ')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'\s?&\s?', ' & ')

# replace Company by CO
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace('COMPANY', 'CO')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace('COMPANY', 'CO')

# replace National Association by NA
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace('NATIONAL ASSOCIATION', 'NA')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace('NATIONAL ASSOCIATION', 'NA')

# replace N A by NA
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'\bN\sA\b', 'NA')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'\bN\sA\b', 'NA')

# replace U S A by USA
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'\bU\sS\sA\b', 'USA')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'\bU\sS\sA\b', 'USA')

# replace U S by US
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'\bU\sS\b', 'US')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'\bU\sS\b', 'US')


# When THE is last word, move to front
# TEMP: Remove all THE
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'^THE\b', '')
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'\bTHE$', '')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'^THE\b', '')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'\bTHE$', '')

#More radical: remove all special characters:
patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'[^A-Z\s0-9\&]', '')
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'[^A-Z\s0-9\&]', '')

#Most radical: remove all non-alpha characters
#patent_data['organization_clean'] = patent_data['organization_clean'].str.replace(r'[^A-Z]', '')
#kcmo['business_name_clean'] = kcmo['business_name_clean'].str.replace(r'[^A-Z]', '')

# Clean
patent_data['organization_clean'] = patent_data['organization_clean'].str.strip()
kcmo['business_name_clean'] = kcmo['business_name_clean'].str.strip()

In [11]:
#string = "THE"
#kcmo[kcmo['Business Name'].str.contains(string)]
#patent_data[patent_data['organization'].str.contains(string)]

In [12]:
kcmo_patents = pd.merge(kcmo, patent_data, how = 'left'
                          , left_on = 'business_name_clean'
                          , right_on = 'organization_clean')

In [13]:
kcmo_patents.describe(include = 'all')

Unnamed: 0,fstrBusinessActivity,fstrAddress,fstrlegalName,fstrDBAName,fdtmfilingPeriod,business_name_clean,id,patent_id,type,number,country,date,assignee_id,name_first,name_last,organization,organization_clean
count,1345435,1345435,1345435,123422,1345435,1345435,1249119,1249119,1249119,1249119,1249119,1249119,1249119,0.0,0.0,1249119,1249119
unique,626,30654,33593,10351,6,33589,197726,202195,18,197722,1,11572,525,0.0,0.0,525,514
top,Consumer Electronics Repair and Maintenance,PO BOX 60320 FORT MYERS FL 33906-6320,GENERAL ELECTRIC COMPANY,NCR,12/31/17,GENERAL ELECTRIC CO,8,8417380,11,8,US,2013-03-15,e9392087a873d8607cede39e8fbb26e6,,,GENERAL ELECTRIC COMPANY,GENERAL ELECTRIC CO
freq,315747,207717,206925,18440,287829,206925,11414,313,170753,15468,1249119,2024,206925,,,206925,206925


## API for Patent Descriptions

In [355]:
parameters = '{"_and":[{"_gte": {"patent_date": "2014-1-1"}},{"_lt": {"patent_date": "2014-2-25"}}]} '
fields='["patent_number", "patent_date", "patent_abstract", "patent_title"]'
sort= '[{"patent_date": "asc"}]'
options= '{"page":4,"per_page": 10000}'

In [356]:
base_url = 'http://www.patentsview.org/api/patents/'
url = '{}query?q={}&f={}&s={}&o={}'.format(base_url, parameters, fields, sort, options)

In [357]:
response = requests.get(url)
response = json.loads(response.text)

In [362]:
response['total_patent_count']

37853

In [348]:
DataFrame.from_dict(response['patents'])

Unnamed: 0,patent_abstract,patent_date,patent_number,patent_title
0,A method of determining a color transformation...,2014-02-11,8649581,Colour management for biological samples
1,"According to one embodiment, a plaque region e...",2014-02-11,8649582,Plaque region extracting method and apparatus ...
2,A pupil detection device and a pupil detection...,2014-02-11,8649583,Pupil detection device and pupil detection method
3,A medical image processing apparatus includes ...,2014-02-11,8649584,Medical image processing apparatus and medical...
4,A method and system for retrospective image co...,2014-02-11,8649585,Method and system for retrospective image comb...
5,A color flow gain adjustment method and device...,2014-02-11,8649586,Color flow gain adjustment method and device
6,Several related inventions for estimating scat...,2014-02-11,8649587,"Methods, systems, and computer-program product..."
7,A method for canceling the impact of the physi...,2014-02-11,8649588,Method for canceling impact of physical proper...
8,Embodiments herein provide methods for noninva...,2014-02-11,8649589,Noninvasive assessment of keratinocytes
9,A paper sheet management apparatus includes: a...,2014-02-11,8649590,"Paper sheet management apparatus, paper sheet ..."


In [256]:
import re
re.sub('<[^>]*>', '', DataFrame.from_dict(response['patents'])['patent_abstract'][596])

'"A system method that adapts wireless link parameters for a wireless communication link. A measure is determined of errors signal quality occurring in communication over a wireless link. In a case that the measure of errors signal quality corresponds to more errors than a first predetermined threshold, communication changes from a first set of wireless link parameters to a second set of wireless link parameters. The second set of wireless link parameters corresponds to higher error tolerance than the first set of wireless link parameters. In a case that the measure of errors signal quality corresponds to fewer errors than a second predetermined threshold, communication changes from the first set of wireless link parameters to a third set of wireless link parameters. The third set of wireless link parameters corresponds to lower error tolerance than the first set of wireless link parameters. Preferably, the measure of errors signal quality is determined by monitoring a number of NACK m