In [990]:
import pandas as pd
import numpy as np
from sklearn.cluster import HDBSCAN
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cdist

## Importazione dati di lavoro
Questa sezione andrà riscritta quando faremo i collegamenti client-server e da/verso il database

### Importiamo i dati
Qui sarà la parte dove importiamo i dati dal database

In [None]:
df1 = pd.read_csv("dataset.csv")
df = df1.drop(columns=["age_flag", "age_radius", "interest_flag", "gender",
                       "distance_km", "gender_text", "gender_partner_text",
                       "gender_partner", "gender_partner_him", "gender_partner_her", "gender_partner_them"])
df.describe()

Unnamed: 0,ID,sports,tvsports,exercise,dining,art,hiking,gaming,clubbing,reading,...,ambition_important,attractive,sincere,intelligence,funny,ambition,age,age_o,longitude,latitude
count,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,...,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0,8637.0
mean,4319.0,6.19347,4.786268,5.7146,6.449346,6.173093,5.309367,3.735904,6.034619,6.336923,...,1.960287,6.063332,7.199606,6.700127,6.395276,6.475975,26.357068,26.375709,25.515056,68.535928
std,2493.431471,1.95182,2.119886,1.824372,1.305078,1.544696,1.970879,1.54763,1.932471,1.477309,...,0.484997,1.288077,1.001844,1.56662,1.085747,1.535223,3.552439,3.537037,422.011686,1052.191116
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,18.0,18.0,6.908039,36.503938
25%,2160.0,5.0,3.0,5.0,6.0,6.0,4.0,2.0,5.0,6.0,...,2.0,5.0,7.0,6.0,6.0,6.0,24.0,24.0,10.229211,41.047017
50%,4319.0,7.0,4.0,6.0,7.0,6.0,6.0,3.0,6.0,7.0,...,2.0,6.0,7.0,7.0,6.0,7.0,26.0,26.0,11.716048,44.124458
75%,6478.0,8.0,7.0,7.0,7.0,7.0,7.0,5.0,8.0,7.0,...,2.0,7.0,8.0,8.0,7.0,8.0,28.0,28.0,13.974181,45.33739
max,8637.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,5.0,10.0,10.0,10.0,10.0,10.0,55.0,55.0,16897.0,45592.0


### Creazione del punto di ricerca
Per il momento prendiamo un dato a caso dal dataset come nostro candidato

In [992]:
search = df.sample()
print(search)

        ID  sports  tvsports  exercise  dining  art  hiking  gaming  clubbing  \
4033  4034     8.0       8.0       6.0     6.0  6.0     4.0     4.0       5.0   

      reading  ...  ambition_important  attractive  sincere  intelligence  \
4033      5.0  ...                 2.0         4.0      6.0           5.0   

      funny  ambition  age  age_o  longitude   latitude  
4033    5.0       6.0   22     27  11.967891  45.915649  

[1 rows x 45 columns]


Costruiamo un punto ad-hoc e salviamo i pesi

In [993]:
search["age"] = search["age_o"]
search["sports"] = search["sports_partner"]
search["tvsports"] = search["tvsports_partner"]
search["exercise"] = search["exercise_partner"]
search["dining"] = search["dining_partner"]
search["hiking"] = search["hiking_partner"]
search["gaming"] = search["gaming_partner"]
search["clubbing"] = search["clubbing_partner"]
search["reading"] = search["reading_partner"]
search["tv"] = search["tv_partner"]
search["theater"] = search["theater_partner"]
search["movies"] = search["movies_partner"]
search["music"] = search["music_partner"]
search["shopping"] = search["shopping_partner"]
search["yoga"] = search["yoga_partner"]
weights = search[['attractive_important', 'sincere_important',
                  'intelligence_important', 'funny_important',
                  'ambition_important']].to_numpy(dtype=np.uint8)[0]

In [994]:
search = search.drop(columns=(["age_o"] + list(df.filter(regex='.*partner$')) + list(df.filter(regex='.*important$'))))
print(search)
print(search.columns)

        ID  sports  tvsports  exercise  dining  art  hiking  gaming  clubbing  \
4033  4034     3.0       6.0       2.0     6.0  6.0     2.0     4.0       4.0   

      reading  ...  shopping  yoga  attractive  sincere  intelligence  funny  \
4033      2.0  ...       8.0   7.0         4.0      6.0           5.0    5.0   

      ambition  age  longitude   latitude  
4033       6.0   27  11.967891  45.915649  

[1 rows x 24 columns]
Index(['ID', 'sports', 'tvsports', 'exercise', 'dining', 'art', 'hiking',
       'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'music',
       'shopping', 'yoga', 'attractive', 'sincere', 'intelligence', 'funny',
       'ambition', 'age', 'longitude', 'latitude'],
      dtype='object')


In [995]:
search.iloc[:, 16:21] = 0
print(search)
print(weights)

        ID  sports  tvsports  exercise  dining  art  hiking  gaming  clubbing  \
4033  4034     3.0       6.0       2.0     6.0  6.0     2.0     4.0       4.0   

      reading  ...  shopping  yoga  attractive  sincere  intelligence  funny  \
4033      2.0  ...       8.0   7.0           0        0             0      0   

      ambition  age  longitude   latitude  
4033         0   27  11.967891  45.915649  

[1 rows x 24 columns]
[2 2 3 3 2]


  search.iloc[:, 16:21] = 0


Prepariamo il dataset alla ricerca.

Salviamo gli ID utenti per poter recuperare tutte le loro informazioni alla fine
Prepariamo il dataset alla ricerca. 

In [996]:
df = df[df["ID"] != search.iloc[0, 0]]
IDs = df["ID"].to_list()
IDs.append(search.iloc[0,0])
IDs = np.array(IDs)
df = df.drop(columns=(["age_o", "ID"] + list(df.filter(regex='.*partner$')) + list(df.filter(regex='.*important$'))))
print(df.columns)
search = search.drop(columns=["ID"])
print(search.columns)

Index(['sports', 'tvsports', 'exercise', 'dining', 'art', 'hiking', 'gaming',
       'clubbing', 'reading', 'tv', 'theater', 'movies', 'music', 'shopping',
       'yoga', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition',
       'age', 'longitude', 'latitude'],
      dtype='object')
Index(['sports', 'tvsports', 'exercise', 'dining', 'art', 'hiking', 'gaming',
       'clubbing', 'reading', 'tv', 'theater', 'movies', 'music', 'shopping',
       'yoga', 'attractive', 'sincere', 'intelligence', 'funny', 'ambition',
       'age', 'longitude', 'latitude'],
      dtype='object')


Convertiamo tutto in numpy array per un'elaborazione numerica più facile, il punto di ricerca sarà l'ultimo punto

In [997]:
search = search.to_numpy()[0]
X = df.to_numpy()
X = np.vstack((X,search))
print("X:",X, X.shape)

X: [[ 2.          7.          5.         ... 33.          9.96261679
  45.70627505]
 [ 6.          3.          6.         ... 28.          7.738712
  43.744764  ]
 [ 4.          3.          2.         ... 27.         18.061335
  40.609047  ]
 ...
 [ 3.          4.          8.         ... 33.          9.992847
  44.197201  ]
 [ 9.          7.          7.         ... 28.          7.45576
  45.101177  ]
 [ 3.          6.          2.         ... 27.         11.967891
  45.915649  ]] (8637, 23)


## Clustering

In [998]:
clustering = HDBSCAN(min_cluster_size=6, n_jobs=-1, store_centers="centroid", allow_single_cluster=True)
clustering.fit(X[:, np.r_[0:15, 20:23]])

# ri-assegnamo i punti rumorosi
centroids = clustering.centroids_
print("Medoids shape:", centroids.shape)
noisy = X[clustering.labels_ == -1]
noisy = noisy[:,np.r_[0:15, 20:23]]
print("Noisy points shape:",noisy.shape)
distmat = cdist(noisy, centroids)
idxs = np.argmin(distmat, axis=1)
clustering.labels_[clustering.labels_ == -1] = idxs

# filtriamo
print("X before:",X.shape)
X = X[clustering.labels_ == clustering.labels_[-1]]
IDs = IDs[clustering.labels_ == clustering.labels_[-1]]
print("X after:",X.shape)
print(X)

Medoids shape: (544, 18)
Noisy points shape: (1473, 18)
X before: (8637, 23)
X after: (10, 23)
[[ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        2.        4.        3.        2.        3.       30.
  11.287139 45.670246]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        2.        4.        3.        2.        3.       30.
  12.485743 45.848915]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        2.        4.        3.        2.        3.       30.
   9.284279 38.939535]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        2.        4.        3.        2.        3.       30.
  11.223067 46.484075]
 [ 4.        4.      

## Deformazione delle variabili
Facciamo valere le preferenze

In [999]:
exps = [(1/5), (1/1.3), 1, 2, 5]
for cnt, i in enumerate(range(15,20)):
    X[:-1,i] = np.int64(np.float_power((11-X[:-1,i]), exps[weights[cnt]-1]))
print(X)

[[ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        5.        4.        8.        9.        4.       30.
  11.287139 45.670246]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        5.        4.        8.        9.        4.       30.
  12.485743 45.848915]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        5.        4.        8.        9.        4.       30.
   9.284279 38.939535]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.        7.        6.
   6.        5.        4.        8.        9.        4.       30.
  11.223067 46.484075]
 [ 4.        4.        2.        4.        4.        3.        4.
   4.        5.        7.        5.        6.     

## Nearest Neighbor
È arrivato il momento tanto atteso di trovare i punti più simili a quello di ricerca

In [1001]:
nn = NearestNeighbors(n_jobs=-1)
nn.fit(X)

dsts, idxs = nn.kneighbors(X[-1].reshape(1,-1), return_distance=True, n_neighbors=6)
print("Indici:",idxs)
print("Distanze:",dsts)
print(X[idxs])

Indici: [[9 1 0 5 3 4]]
Distanze: [[ 0.         16.40952846 16.41717533 16.42093536 16.42796003 16.43273064]]
[[[ 3.        6.        2.        6.        6.        2.        4.
    4.        2.        4.        3.        2.        8.        8.
    7.        0.        0.        0.        0.        0.       27.
   11.967891 45.915649]
  [ 4.        4.        2.        4.        4.        3.        4.
    4.        5.        7.        5.        6.        7.        6.
    6.        5.        4.        8.        9.        4.       30.
   12.485743 45.848915]
  [ 4.        4.        2.        4.        4.        3.        4.
    4.        5.        7.        5.        6.        7.        6.
    6.        5.        4.        8.        9.        4.       30.
   11.287139 45.670246]
  [ 4.        4.        2.        4.        4.        3.        4.
    4.        5.        7.        5.        6.        7.        6.
    6.        5.        4.        8.        9.        4.       30.
   12.601764 4

## Risultato

In [1002]:
with pd.option_context('display.max_columns', None):
    print("ID punto di ricerca:",IDs[-1])
    print("ID dei punti simili:",IDs[idxs[0, 1:]])
    print("Punto di ricerca:",df1[df1["ID"] == IDs[-1]])
    for i in range(1, idxs.shape[1]):
        print(f"{i}° punto più simile:",df1[df1["ID"] == IDs[idxs[0,i]]])

ID punto di ricerca: 4034
ID dei punti simili: [2795 1352 5110 4450 4793]
Punto di ricerca:         ID  sports  tvsports  exercise  dining  art  hiking  gaming  clubbing  \
4033  4034     8.0       8.0       6.0     6.0  6.0     4.0     4.0       5.0   

      reading   tv  theater  movies  music  shopping  yoga  sports_partner  \
4033      5.0  6.0      4.0     7.0    8.0       3.0   2.0             3.0   

      tvsports_partner  exercise_partner  dining_partner  art_partner  \
4033               6.0               2.0             6.0          8.0   

      hiking_partner  gaming_partner  clubbing_partner  reading_partner  \
4033             2.0             4.0               4.0              2.0   

      tv_partner  theater_partner  movies_partner  music_partner  \
4033         4.0              3.0             2.0            8.0   

      shopping_partner  yoga_partner  attractive_important  sincere_important  \
4033               8.0           7.0                   2.0              