<a href="https://colab.research.google.com/github/Ananya-AJ/CMPE255-Data-Mining/blob/main/ANN/Random_nmslib_treesANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import pickle

## Packages solving ANN

In [None]:
import annoy
import nmslib

In [None]:
# Load the tumor dataset from a CSV file in the current working directory.
df_raw = pd.read_csv('./tumor.csv')

In [None]:
# Features are every column between the first and the last one.
feature_columns = df_raw.columns[1:-1].tolist()
x_variables = df_raw[feature_columns]

# Standardise the feature matrix with scikit-learn's StandardScaler.
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x_variables)

# 1-based running id for every row of the table.
data_id = np.arange(1, len(df_raw) + 1)

# The last column holds the label; display its distinct values.
y_labels = df_raw.iloc[:, -1]
y_labels.unique()

In [None]:
# Bundle the scaled vectors, the row ids and the labels together.
data = dict(vectors=scaled_x, id=data_id, cancer_label=y_labels)

# Persist the bundle to disk as a pickle file.
with open('./cancer.pkl', 'wb') as out_file:
    pickle.dump(data, out_file)

In [None]:
# Peek at the first ten entries of each field in the bundle.
for caption, field in (('Vectors: ', 'vectors'), ('Ids: ', 'id'), ('Labels: ', 'cancer_label')):
    print(caption, data[field][:10])

Using ANNOY for a tree-based approach to ANN

In [None]:
class AnnoyIndex():
    """Small wrapper that builds an Annoy forest over a set of vectors.

    Parameters
    ----------
    vectors : array-like, 2-D
        One row per item. Converted to a float32 NumPy array.
    labels : sequence
        ``labels[i]`` is returned by ``query`` for the item stored at row ``i``.
    metric : str
        Metric name handed to ``annoy.AnnoyIndex``.
    n_trees : int
        Number of trees passed to Annoy's ``build``.
    seed : int or None
        When given, passed to Annoy's ``set_seed`` before items are added so
        that repeated builds produce the same forest. ``None`` (the default)
        leaves Annoy's own seeding untouched.
    """

    def __init__(self, vectors, labels, metric='angular', n_trees=5, seed=None):
        # Accept plain nested lists as well as ndarrays.
        vectors = np.asarray(vectors)
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels
        self.metric = metric # angular, euclidean, manhattan, hamming, dot
        self.number_of_trees = n_trees
        self.seed = seed
        # Populated by build(); None means "not built yet".
        self.index = None

    def build(self):
        """Create the Annoy index, add every stored vector and build the trees."""
        forest = annoy.AnnoyIndex(self.dimension, metric=self.metric)
        if self.seed is not None:
            forest.set_seed(self.seed)
        for i, vec in enumerate(self.vectors):
            forest.add_item(i, vec.tolist())
        forest.build(self.number_of_trees)
        # Only publish the index once it is fully built.
        self.index = forest

    def query(self, vector, k=10):
        """Return the labels of the ``k`` approximate nearest neighbours of ``vector``.

        Raises
        ------
        RuntimeError
            If ``build`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("AnnoyIndex.build() must be called before query()")
        indices = self.index.get_nns_by_vector(np.asarray(vector).tolist(), k)
        return [self.labels[i] for i in indices]

In [None]:
# Build the Annoy forest over the scaled vectors using the wrapper's default
# 'angular' metric; the row ids act as the labels returned by queries.
index = AnnoyIndex(vectors=data['vectors'], labels=data['id'])
index.build()

In [None]:
# Use the case stored at position 100 as the query. It is part of the index
# itself, so it is expected to show up among its own neighbours.
cancer_vector, cancer_id = data['vectors'][100], data['id'][100]
similar_cancer_cases = index.query(cancer_vector)
print(f"The most similar cases to {cancer_id} are:\n {similar_cancer_cases}")
# Ids are 1-based row numbers, so id - 1 is the row position of the label;
# .iloc makes that positional lookup explicit.
similar_labels = [data['cancer_label'].iloc[case_id - 1] for case_id in similar_cancer_cases]
print(f"The cancer labels of the similar ids are:\n {similar_labels}")

Using NMSLib for the HNSW approach to ANN

In [None]:
class NMSLIBIndex():
    """Small wrapper that builds an NMSLIB HNSW index over a set of vectors.

    Parameters
    ----------
    vectors : array-like, 2-D
        One row per item. Converted to a float32 NumPy array.
    labels : sequence
        ``labels[i]`` is returned by ``query`` for the item stored at row ``i``.
    """

    def __init__(self, vectors, labels):
        # Accept plain nested lists as well as ndarrays.
        vectors = np.asarray(vectors)
        self.dimension = vectors.shape[1]
        # Misspelled attribute name kept as an alias for backward compatibility.
        self.dimention = self.dimension
        self.vectors = vectors.astype('float32')
        self.labels = labels
        # Populated by build(); None means "not built yet".
        self.index = None

    def build(self):
        """Create the HNSW index (cosine-similarity space) and add all vectors."""
        hnsw = nmslib.init(method='hnsw', space='cosinesimil')
        hnsw.addDataPointBatch(self.vectors)
        hnsw.createIndex({'post': 2})
        # Only publish the index once it is fully built.
        self.index = hnsw

    def query(self, vector, k=10):
        """Return the labels of the ``k`` approximate nearest neighbours of ``vector``.

        Raises
        ------
        RuntimeError
            If ``build`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("NMSLIBIndex.build() must be called before query()")
        # The first element of the knnQuery result holds the neighbour positions.
        result = self.index.knnQuery(vector, k=k)
        return [self.labels[i] for i in result[0]]

In [None]:
# Build the HNSW index over the same scaled vectors; the row ids act as the
# labels returned by queries. Note that this rebinds the name `index`.
index = NMSLIBIndex(vectors=data['vectors'], labels=data['id'])
index.build()

In [None]:
# Use the case stored at position 100 as the query. It is part of the index
# itself, so it is expected to show up among its own neighbours.
cancer_vector, cancer_id = data['vectors'][100], data['id'][100]
similar_cancer_cases = index.query(cancer_vector)
print(f"The most similar cases to {cancer_id} are:\n {similar_cancer_cases}")
# Ids are 1-based row numbers, so id - 1 is the row position of the label;
# .iloc makes that positional lookup explicit.
similar_labels = [data['cancer_label'].iloc[case_id - 1] for case_id in similar_cancer_cases]
print(f"The cancer labels of the similar ids are:\n {similar_labels}")

## Random Projections from scratch

In [None]:
# Load the laptop price dataset from a CSV file in the current working directory.
df_laptop = pd.read_csv('./laptop_price.csv')

In [None]:
# Display the (rows, columns) dimensions of the laptop table.
df_laptop.shape

(1303, 13)

In [None]:
# Preview the first rows of the laptop table.
df_laptop.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


Encode and make data vectors

In [None]:
# Strip the 'kg' unit so the weight can be parsed as a number.
weight_text = df_laptop['Weight'].str.replace('kg', '')
df_laptop['Weight_num'] = pd.to_numeric(weight_text, downcast='float')

# Text columns are turned into integer codes, one column at a time.
label_columns = [
    'Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu',
    'Ram', 'Memory', 'Gpu', 'OpSys',
]
# Columns that are already numeric and are used as they are.
numeric_columns = ['Inches', 'Price_euros', 'Weight_num']

encoder = LabelEncoder()
df_new = df_laptop[label_columns].apply(encoder.fit_transform)

# Encoded columns first, numeric columns after.
df_encoded = pd.concat([df_new, df_laptop[numeric_columns]], axis=1)

# Standardise every column of the combined table.
scaler = StandardScaler()
laptop_vectors = scaler.fit_transform(df_encoded)

Take out some datapoints for testing

In [None]:
# Rows [test_start, test_stop) are held out for testing; everything else is
# used to build the index.
test_start, test_stop = 100, 110

test_vectors = laptop_vectors[test_start:test_stop]
train_vectors = np.concatenate((laptop_vectors[:test_start], laptop_vectors[test_stop:]))

# Slice the ids positionally with the SAME half-open bounds as the vectors.
# Label-based `.loc[100:110]` includes both end points, which produced one id
# too many and shifted every training label after position 100 by one row.
laptop_ids = df_laptop['laptop_ID'].values
test_laptops = laptop_ids[test_start:test_stop]
train_laptops = np.concatenate((laptop_ids[:test_start], laptop_ids[test_stop:]))

# Every vector must have exactly one id.
assert len(test_vectors) == len(test_laptops)
assert len(train_vectors) == len(train_laptops)

print(test_vectors[:2])
print(len(train_vectors))

[[ 0.13647704 -1.67154058  0.33540639 -0.15530253  0.794896    0.8651681
  -0.04314153 -2.20599126  0.184661    0.40877166 -0.57916614 -0.19352076]
 [ 0.13647704 -1.62498073  0.33540639 -0.15530253 -2.18356387 -0.27583801
   1.20604542 -1.9746693   0.184661    0.40877166 -1.10869014  0.09209928]]
1293


Functions for making normal planes, building index buckets and testing

In [None]:
def make_normal_planes(nbits, d, seed=None):
    """Draw ``nbits`` random plane normals of dimension ``d``.

    Parameters
    ----------
    nbits : int
        Number of planes (one hash bit per plane).
    d : int
        Dimension of each normal vector.
    seed : int or None
        When given, the planes are drawn from a dedicated generator seeded
        with this value, so the same seed always yields the same planes.
        With ``None`` (the default) NumPy's global random state is used,
        exactly as before.

    Returns
    -------
    numpy.ndarray of shape (nbits, d) with entries in [-0.5, 0.5).
    """
    if seed is None:
        # Keep using the global state so callers that rely on np.random.seed
        # (or on unseeded behaviour) are unaffected.
        uniform = np.random.rand(nbits, d)
    else:
        uniform = np.random.default_rng(seed).random((nbits, d))

    # Shift the [0, 1) samples so they are centred on zero.
    return uniform - 0.5

In [None]:
def make_buckets(normal_planes, input_vectors, input_labels):
    """Group labels by the bit pattern of their vector against the planes.

    For each vector, the dot product with every plane normal is taken; a
    strictly positive product gives bit 1, anything else gives bit 0. The
    bits are joined into a string that serves as the bucket key.

    Returns a dict mapping each bit string to the list of labels (taken from
    ``input_labels`` at the vector's position) that hashed to it.
    """
    buckets = {}
    transposed_planes = normal_planes.T

    for position, vector in enumerate(input_vectors):
        side_bits = (np.dot(vector, transposed_planes) > 0).astype(int)
        bucket_key = ''.join(str(bit) for bit in side_bits)
        buckets.setdefault(bucket_key, []).append(input_labels[position])

    return buckets

In [None]:
def find_closest_laptops(normal_planes, buckets, test_vector):
    """Find the bucket(s) whose key is nearest to the hash of ``test_vector``.

    The test vector is hashed against the planes in the same way as the
    indexed vectors (bit 1 where the dot product is strictly positive). Every
    bucket key is compared with that hash bit by bit, and the buckets with the
    fewest differing bits are returned.

    Returns a tuple ``(closest_laptops, closest_distance)`` where
    ``closest_laptops`` is a list holding one list of labels per nearest
    bucket and ``closest_distance`` is the number of differing bits. With no
    buckets at all the result is ``([], 0)``.
    """
    query_bits = (np.dot(test_vector, normal_planes.T) > 0).astype(int)

    # Number of differing bits -> label lists of the buckets at that distance.
    by_distance = {}
    for key, members in buckets.items():
        key_bits = np.array([int(ch) for ch in key])
        differing = np.count_nonzero(query_bits != key_bits)
        by_distance.setdefault(differing, []).append(list(members))

    if not by_distance:
        return [], 0

    nearest = min(by_distance)
    return by_distance[nearest], nearest

Make index buckets and test with test data points

In [None]:
# Seed NumPy's global random state so the random planes, and therefore the
# buckets, are the same on every run of the notebook.
np.random.seed(42)

# One hash bit per plane. The plane dimension is taken from the data instead
# of being hard-coded, so it always matches the vectors.
normal_planes = make_normal_planes(nbits=8, d=train_vectors.shape[1])
laptop_vector_buckets = make_buckets(normal_planes=normal_planes, input_vectors=train_vectors, input_labels=train_laptops)
len(laptop_vector_buckets)

154

In [None]:
# Position (within the held-out set) of the laptop used as the query.
test_index = 3

# Hash the held-out vector and fetch the bucket(s) nearest to it.
closest_laptops, closest_distance = find_closest_laptops(
    normal_planes=normal_planes,
    buckets=laptop_vector_buckets,
    test_vector=test_vectors[test_index],
)

query_laptop_id = test_laptops[test_index]
print(f'The closest laptops to {query_laptop_id} are: \n {closest_laptops}')
print(closest_distance)

The closest laptops to 106 are: 
 [[119, 197, 221, 254, 308, 320, 387, 431, 450, 464, 522, 598, 677, 744, 781, 798, 913, 935, 946, 1013, 1071, 1170, 1179, 1202, 1221]]
0
