In [None]:
import pandas as pd
import numpy as np
import requests
import concurrent.futures
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from PIL import Image
import io
import hdbscan
from datasketch import MinHash, MinHashLSH
from sklearn.decomposition import PCA
# Use ResNet50 as the image feature extractor.
# weights='imagenet' requests the ImageNet-pretrained weights; include_top=False
# leaves out the final classification layers and pooling='avg' applies global
# average pooling, so predict() yields one flat feature vector per image.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')


def download_logo(url):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: address of the image to fetch.

    Returns:
        A ``PIL.Image.Image`` in RGB mode, or ``None`` when the request fails,
        the server does not answer with HTTP 200, or the payload cannot be
        decoded as an image.
    """
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return None
        # Decode from the fully downloaded body: response.raw would hand PIL
        # the undecoded (possibly gzip/deflate-compressed) byte stream.
        return Image.open(io.BytesIO(response.content)).convert('RGB')
    except (requests.RequestException, OSError):
        # OSError covers PIL.UnidentifiedImageError and truncated/corrupt
        # files; one bad logo must not abort the whole parallel download.
        return None

# Download the logos in parallel on 10 threads to speed things up.
def download_logos_parallel(domains):
    """Fetch the Clearbit logo of every domain concurrently.

    Args:
        domains: iterable of domain names. It is materialised once, so
            one-shot iterators/generators are accepted as well as lists.

    Returns:
        A list of ``(domain, image)`` tuples, in input order, containing only
        the domains for which ``download_logo`` returned an image.
    """
    domains = list(domains)
    urls = [f"https://logo.clearbit.com/{d}" for d in domains]
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in the order of `urls`, so they can be
        # zipped back onto `domains` below.
        images = list(executor.map(download_logo, urls))

    # Keep only the domains whose logo was actually downloaded.
    return [(d, img) for d, img in zip(domains, images) if img is not None]

# Turn the images into a single array that the ResNet50 model can process.
def extract_features_batch(images):
    """Run every image through the module-level ``model`` in one predict call.

    Each image is resized to 224x224, converted to an array, given a leading
    batch axis and passed through ``preprocess_input``; the per-image arrays
    are then joined along that axis and handed to ``model.predict``.
    """
    def _prepare(img):
        # One image -> preprocessed array with a leading batch axis of size 1.
        pixels = image.img_to_array(img.resize((224, 224)))
        return preprocess_input(pixels[np.newaxis, ...])

    batch = np.concatenate([_prepare(img) for img in images], axis=0)
    return model.predict(batch)

# Build a MinHash signature for a feature vector.
def compute_minhash(feature_vector, num_perm=128, max_features=300):
    """Return a ``MinHash`` built from the leading entries of ``feature_vector``.

    Args:
        feature_vector: sliceable sequence of values; each value is turned into
            its string form and fed to the sketch as one element.
        num_perm: number of permutations passed to ``MinHash``.
        max_features: how many leading values to hash (previously hard-coded
            to 300); ``None`` hashes the whole vector.

    Returns:
        The populated ``MinHash`` object.
    """
    minhash = MinHash(num_perm=num_perm)
    # NOTE(review): the hashed elements are the text of the raw values, so two
    # vectors only share an element when a value matches exactly -- confirm
    # this is the intended notion of similarity.
    for val in feature_vector[:max_features]:
        minhash.update(str(val).encode('utf8'))
    return minhash

# Read the first 1000 domains (for faster testing) into a list.
df = pd.read_parquet("logos.snappy.parquet").head(1000)
domains = df['domain'].tolist()

# Download the logos.
valid_data = download_logos_parallel(domains)

if not valid_data:
    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # and, inside a Jupyter kernel, shuts the kernel down. SystemExit stops
    # execution here and reports the message.
    raise SystemExit("Eroare: Nu s-au descărcat logo-uri valide!")

# Split the (domain, image) pairs into two parallel tuples.
valid_domains, images = zip(*valid_data)

# Extract the image features.
features_list = extract_features_batch(images)

# Use LSH to look up groups of similar signatures quickly.
# num_perm must match the value used when the MinHash signatures are built.
lsh = MinHashLSH(threshold=0.5, num_perm=128)
hashes = {str(i): compute_minhash(features_list[i]) for i in range(len(features_list))}
for key, signature in hashes.items():
    lsh.insert(key, signature)

# Group domains by the (sorted) set of LSH neighbours returned for their signature.
# NOTE(review): `clusters` is not read again in the code visible here -- confirm
# whether it is still needed.
clusters = {}
for i in range(len(features_list)):
    similar_logos = lsh.query(hashes[str(i)])
    cluster_id = tuple(sorted(similar_logos))
    # Append in place; `clusters.get(...) + [...]` copied the list on every insert.
    clusters.setdefault(cluster_id, []).append(valid_domains[i])

# Cluster the feature vectors with HDBSCAN.
# The matrix is built straight from `features_list` (no round-trip through the
# string keys of `hashes`), so row i still corresponds to valid_domains[i].
cluster_vectors = np.array(features_list)
hdbscan_cluster = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=2)
labels = hdbscan_cluster.fit_predict(cluster_vectors)

# Collect the groups and print them.
final_clusters = {}
for idx, label in enumerate(labels):
    # A label of -1 means noise; those domains are gathered under 'Noise'.
    group_key = label if label != -1 else 'Noise'
    final_clusters.setdefault(group_key, []).append(valid_domains[idx])

# The loop variable is `members`, not `domains`, so the module-level `domains`
# list is not overwritten when this code is re-run in the notebook.
for cluster_id, members in final_clusters.items():
    print(f"Group {cluster_id}: {members}")


Număr de logo-uri descărcate: 841
Domenii fără logo: 159
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 5s/step




Group Noise: ['stanbicbank.co.zw', 'astrazeneca.ua', 'autosecuritas-ct-seysses.fr', 'ovb.ro', 'toyota-buchreiter-eisenstadt.at', 'ebay.cn', 'ccusa.co.nz', 'tupperware.at', 'ymcasteuben.org', 'engie.co.uk', 'lidl.com.cy', 'freseniusmedicalcare.ca', 'avis.cr', 'cafelasmargaritas.es', 'julis-sh.de', 'oil-testing.de', 'menschenfuermenschen.at', 'europa-union-sachsen-anhalt.de', 'pirtek.be', 'unicharm.com.br', 'rsmpoland.pl', 'esseskincare.sg', 'trinkgutlippstadt.de', 'avoncameroun.com', 'mateco.nl', 'orient-food.sk', 'noiseassessments.org', 'williamblairfunds.com', 'mazdaofamarillo.com', 'kapanfastigheter.se', 'rheine.schule', 'linexps.com', 'kalyanmachines.com', 'worldvision.ca', 'avia.ch', 'ibc-solar.pl', 'linexofschererville.com', 'webster.nl', 'compo.de', 'pkf-hk.com', 'besins-healthcare.se', 'renaultmerida.com.mx', 'tbwa.co.za', 'peugeot.sk', 'siebtechnik-tema.cn', 'mazdabali.co.id', 'rsmthailand.com', 'strabag-pfs.com', 'meggle-bakery.com', 'nutriciababy.be', 'pcllawyersfrankston.com

In [None]:
!pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.5
