# Hands-on 3

**Execute the cell below. By running this cell, a dataset will be loaded from `patents_small.csv` file. In this notebook, you are asked to analyze this data in several ways. There are three numpy arrays in this dataset:**
- `patent_number`: a unique identifier for each patetnt
- `patent features`: a vector of 16 features describing several properties of each patent
- `category`: the category to which a patent belongs 

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('patents.csv')
df.head()
patent_number = df['publication_number'].to_numpy()
patent_features = df['patent_embedding'].to_numpy()
temp = []
for i in range(patent_features.size):
    s = str(patent_features[i])
    s1 = s.replace(r'\n', '')
    temp.append(
        np.array(s.split()[1:-1], dtype='float')[:16]
    )

patent_features = np.stack(temp)
patent_category = df['category']

In [19]:
df.head()

Unnamed: 0,publication_number,title,cpc_code,patent_embedding,category
0,US-2019250858-A1,memory controller and operating method thereof,G06F3/061,[ 0.00135472 0.01564001 -0.04858465 0.039866...,1
1,US-1000462-A,corn planter,A01C9/00,[-4.44490612e-02 2.48770583e-02 -5.62837869e-...,6
2,KR-200146416-Y1,antitheft vehicle security system,B60R25/209,[-2.53110677e-02 -2.04547048e-02 8.63679312e-...,0
3,KR-0160422-B1,a door opening and shutting apparatus and meth...,D06F37/42,[ 1.21761542e-02 1.97522007e-02 -6.62921891e-...,1
4,US-952306-A,spray burner,B05B1/3033,[-0.00214472 0.01606156 -0.09518531 0.060160...,0


In [30]:
patent_number
patent_features
patent_category

array(['US-2019250858-A1', 'US-1000462-A', 'KR-200146416-Y1', ...,
       'CA-2952951-A1', 'CH-608317-A', 'CN-100513251-C'], dtype=object)

<hr />

#### 1- Which patent has the highest norm? (Eucledian distance from origin)


In [2]:
df['norms'] = np.linalg.norm(patent_features, axis=1)
df[df['norms'] == df['norms'].max()]

Unnamed: 0,publication_number,title,cpc_code,patent_embedding,category,norms
10839,CH-527846-A,penicillanylaldehydes,C07D499/00,[-1.58957148e-03 3.83572765e-02 -1.47625625e-...,5,0.353081


#### 2- Find the two patents that are the farthest from eachother.

In [4]:
from scipy.spatial.distance import cdist

dist = cdist(patent_features, patent_features)

dist.max()

0.5612983586484407

In [5]:

np.where(dist==0.5612983586484407)

np.unravel_index(dist.argmax(), dist.shape)


(1661, 9236)

#### 3- Write a function that, given a patent number, finds its nearest neighbour.


In [8]:
find_min_dist = cdist(patent_features, patent_features)
find_min_dist[find_min_dist == 0] = np.Infinity

find_min_dist

array([[       inf, 0.245627  , 0.26009143, ..., 0.19460144, 0.20385974,
        0.21932892],
       [0.245627  ,        inf, 0.22291979, ..., 0.19293961, 0.15229353,
        0.19206826],
       [0.26009143, 0.22291979,        inf, ..., 0.14702509, 0.15017255,
        0.13727286],
       ...,
       [0.19460144, 0.19293961, 0.14702509, ...,        inf, 0.09981435,
        0.09973816],
       [0.20385974, 0.15229353, 0.15017255, ..., 0.09981435,        inf,
        0.09281058],
       [0.21932892, 0.19206826, 0.13727286, ..., 0.09973816, 0.09281058,
               inf]])

In [39]:
from scipy.spatial.distance import cdist


def find_nearest_neighbour(pat_number):
    idx_pat_number = np.where(df['publication_number'] == pat_number)[0][0]

    with_argsort = np.argsort(find_min_dist[idx_pat_number])[0]
    # with_argmin = np.unravel_index(find_min_dist[idx_pat_number].argmin(), find_min_dist.shape)[1]

    return with_argsort


nearest_neighbour = find_nearest_neighbour("KR-200146416-Y1")

nearest_neighbour

8329

In [None]:
dists = cdist(patent_features, patent_features)

(array([10839]),)


#### 4- For each patent category, find the cluster center. This quantity is computed by taking average of all patents associated with each cluster.

In [29]:
from collections import defaultdict


cat = np.unique(patent_category)

cluster_centers = defaultdict(list)

print(cluster_centers)

for k in cat:
    points = patent_features[patent_category==k]

    cluster_centers[k] = np.mean(points, axis=0)


cluster_centers


defaultdict(<class 'list'>, {})


defaultdict(list,
            {0: array([ 0.01086092, -0.02427292,  0.06917166, -0.04593048, -0.02812299,
                    -0.0124727 , -0.04987288,  0.00655626,  0.0098301 , -0.01550384,
                     0.00122531,  0.00426678,  0.00017979,  0.02210309, -0.02753392,
                    -0.00829946]),
             1: array([ 0.01021772,  0.0140427 , -0.03571764,  0.05286253, -0.04302765,
                    -0.00263517,  0.02233755, -0.04675915,  0.01272022,  0.03165236,
                     0.01146286, -0.00024609,  0.01377522,  0.00555212,  0.02024696,
                    -0.04467966]),
             2: array([ 0.01844678,  0.00991557, -0.05545595,  0.02615103, -0.07078419,
                    -0.0115121 ,  0.04539117, -0.05906673, -0.02173693,  0.00203886,
                     0.00052992,  0.02329754, -0.03247548,  0.03103352,  0.0140693 ,
                    -0.06104154]),
             3: array([ 0.01717531,  0.01595333, -0.03129371,  0.05920419, -0.05942006,
               

5- How many patents have a nearest neighbour that is in the same category?

In [42]:
nearest_neighbor_category = []
for patent in patent_number:
    nearest_patent = find_nearest_neighbour(patent)
    nearest_category = patent_category[nearest_patent]
    nearest_neighbor_category.append(nearest_category)

nearest_neighbor_category = np.array(nearest_neighbor_category)
result = np.sum(patent_category == nearest_neighbor_category)
print(result)

12927

#### 6- What is the average and std of distances between every pair of patents?


In [44]:
dists = cdist(patent_features, patent_features)
mean_pair_distance = np.mean(dists)
std_pair_distance = np.std(dists)

print("Mean: ", mean_pair_distance)
print("STD: ", std_pair_distance)


Mean:  0.1774779588870755
STD:  0.06172153433074445


#### 7- What is the average and std of distances between every pair of patents within a category? Using these calculated quantities, which cluster do you think is more condensed? Which one is more scattered?

In [45]:
distance_means, distance_stds = [], []
for i in patent_category.unique():
    cat_patents = patent_features[patent_category==i]
    within_pair_distance = cdist(cat_patents, cat_patents)
    distance_means.append(
        np.mean(within_pair_distance)
    )
    distance_stds.append(
        np.std(within_pair_distance)
    )

print(distance_means)
print(distance_stds)
print('the most scattered cluster: ', np.argmax(distance_means))
print('the most condensed cluster: ', np.argmin(distance_means))

[0.10235593475072612, 0.1040055310233456, 0.13098792252469746, 0.14095499322691948, 0.13610883703695129, 0.13874011203724912, 0.13052366960262599, 0.13459851471369638]
[0.030560683465341895, 0.03180215536017749, 0.03973015614729102, 0.04403433221841342, 0.04026897604426641, 0.04483107158434705, 0.03853325352892595, 0.04241648037596702]
the most scattered cluster:  3
the most condensed cluster:  0
