In [1]:
!git clone https://github.com/DeepikaMobileDeveloper/women-in-ai-hackathon.git

fatal: destination path 'women-in-ai-hackathon' already exists and is not an empty directory.


In [2]:
import gzip
import shutil
from pathlib import Path

data_dir = Path('women-in-ai-hackathon/data')
for gz_file in data_dir.glob('*.gz'):
   csv_file = gz_file.with_suffix('.csv')
   with gzip.open(gz_file, 'rb') as f_in:
       with open(csv_file, 'wb') as f_out:
           shutil.copyfileobj(f_in, f_out)
   print(f'Unzipped: {gz_file.name}')

Unzipped: inventory_sets.csv.gz
Unzipped: inventories.csv.gz
Unzipped: themes.csv.gz
Unzipped: sets.csv.gz
Unzipped: minifigs.csv.gz
Unzipped: part_categories.csv.gz
Unzipped: parts.csv.gz
Unzipped: colors.csv.gz
Unzipped: inventory_parts.csv.gz
Unzipped: inventory_minifigs.csv.gz
Unzipped: part_relationships.csv.gz
Unzipped: elements.csv.gz


In [3]:
!pip install pandas numpy pathlib sentence-transformers pymilvus pillow transformers torch gradio



In [4]:
# Step 1: Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType
import logging
import torch
from PIL import Image
from transformers import AutoProcessor, CLIPModel, AutoModelForCausalLM, AutoTokenizer
import gradio as gr

logging.basicConfig(level=logging.INFO)

In [5]:
# Cell 0: Drop and rebuild
# Add this method to LegoVectorDB
def _init_collection(self):
    # Drop existing collection if it exists
    if Collection.loaded():
        Collection.drop_collection("lego_sets")

    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="set_num", dtype=DataType.VARCHAR, max_length=20),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
        FieldSchema(name="parts_description", dtype=DataType.VARCHAR, max_length=2000)
    ]
    schema = CollectionSchema(fields=fields, description="LEGO sets")
    collection = Collection(name="lego_sets", schema=schema)
    collection.create_index(
    field_name="embedding",
    index_params={"index_type": "IVF_FLAT", "metric_type": "COSINE", "params": {"nlist": 128}}
)
    return collection

In [6]:
# Cell 2: Data Access
class LegoDatabase:
    def __init__(self, data_dir='women-in-ai-hackathon/data'):
        self.data_path = Path(data_dir)
        self.dfs = self._load_data()

    def _load_data(self):
      return {
          'parts': pd.read_csv(self.data_path / 'parts.csv.csv'),
          'sets': pd.read_csv(self.data_path / 'sets.csv.csv'),
          'inventory_parts': pd.read_csv(self.data_path / 'inventory_parts.csv.csv'),
          'inventories': pd.read_csv(self.data_path / 'inventories.csv.csv'),
          'colors': pd.read_csv(self.data_path / 'colors.csv.csv'),
          'themes': pd.read_csv(self.data_path / 'themes.csv.csv')
      }

    def get_set_parts(self, set_num):
      # Get set info
      set_info = self.dfs['sets'][self.dfs['sets']['set_num'] == set_num].iloc[0]

      # Get inventories for the set
      inventories = self.dfs['inventories'][self.dfs['inventories']['set_num'] == set_num]['id']

      # Get parts for those inventories
      parts = self.dfs['inventory_parts'][self.dfs['inventory_parts']['inventory_id'].isin(inventories)]

      # Basic merge with parts table
      parts = parts.merge(self.dfs['parts'][['part_num', 'name']], on='part_num')

      print("Parts columns after merge:", parts.columns.tolist())  # Temporary debug line

      return parts, set_info



In [7]:
# Cell 3: VectorDB
class LegoVectorDB:
    def __init__(self, uri, token):
        connections.connect(uri=uri, token=token)
        self.collection = self._init_collection()

    def _init_collection(self):
        from pymilvus import utility

        # Delete existing collection if it exists
        if utility.has_collection("lego_sets"):
            utility.drop_collection("lego_sets")

        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="set_num", dtype=DataType.VARCHAR, max_length=20),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),  # Match model output dimension
            FieldSchema(name="parts_description", dtype=DataType.VARCHAR, max_length=2000)
        ]
        schema = CollectionSchema(fields=fields, description="LEGO sets")
        collection = Collection(name="lego_sets", schema=schema)
        collection.create_index(
            field_name="embedding",
            index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
        )
        return collection

In [9]:
# Cell 4: RAG
class LegoRAG:
    def __init__(self, uri, token):
        self.db = LegoDatabase()
        self.vector_db = LegoVectorDB(uri=uri, token=token)
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.id_counter = 0

    def encode_set(self, parts_df):
      parts_text = " ".join([f"{row['quantity']} x {row['name']}"
                          for _, row in parts_df.iterrows()])
      embedding = self.encoder.encode(parts_text)
      return embedding, parts_text

    def index_set(self, set_num):
      parts, set_info = self.db.get_set_parts(set_num)
      if len(parts) > 0:
          embedding, description = self.encode_set(parts)
          self.vector_db.collection.insert([
              [self.id_counter],
              [set_num],
              [embedding.tolist()],
              [description]
          ])
          self.id_counter += 1
          return True
      return False

    def find_similar(self, set_num, top_k=5):
      parts, _ = self.db.get_set_parts(set_num)
      query_embedding, _ = self.encode_set(parts)
      return self.vector_db.collection.search(
          data=[query_embedding.tolist()],
          anns_field="embedding",
          param={"metric_type": "COSINE", "params": {"nprobe": 10}},
          limit=top_k,
          output_fields=["set_num", "parts_description"])

In [13]:
# Step 5: Creating Interface
def create_interface(rag):
    def process_query(set_num):
        try:
            parts, set_info = rag.db.get_set_parts(set_num)
            if parts.empty:
                return "Error: Set number not found"

            theme = rag.db.dfs['themes'][rag.db.dfs['themes']['id'] == set_info['theme_id']].iloc[0]['name']
            output = [f"Query Set: {set_num}\nTheme: {theme}\nYear: {set_info['year']}\nPieces: {set_info['num_parts']}\n\nSimilar Sets:\n"]

            results = rag.find_similar(set_num)
            for hit in results[0]:
                similar_set_num = hit.entity.get('set_num')
                similar_set = rag.db.dfs['sets'][rag.db.dfs['sets']['set_num'] == similar_set_num].iloc[0]
                similar_theme = rag.db.dfs['themes'][rag.db.dfs['themes']['id'] == similar_set['theme_id']].iloc[0]['name']

                output.append(f"Set: {similar_set_num}\n"
                            f"Theme: {similar_theme}\n"
                            f"Year: {similar_set['year']}\n"
                            f"Similarity: {hit.score:.2f}\n"
                            f"Parts: {hit.entity.get('parts_description')}\n")
            return "\n".join(output)
        except Exception as e:
            return f"Error: {str(e)}"

    interface = gr.Interface(
        fn=process_query,
        inputs=gr.Textbox(label="Enter LEGO Set Number"),
        outputs=gr.Textbox(label="Similar Sets"),
        title="Brickspiration"
    )
    return interface

In [14]:
# Cell 6: Playground
if __name__ == "__main__":
    rag = LegoRAG(
        uri="",
        token=""
    )

    # Index sample sets
    sample_sets = rag.db.dfs['sets']['set_num'].head(10).tolist()
    for set_num in sample_sets:
        if rag.index_set(set_num):
            print(f"Indexed set {set_num}")

    # Launch interface
    interface = create_interface(rag)
    interface.launch(share=True)

Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Indexed set 0003977811-1
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Indexed set 001-1
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Indexed set 0012-1
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Indexed set 0013-1
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Indexed set 0014-1
Parts columns after merge: ['inventory_id', 'part_num', 'color_id', 'quantity', 'is_spare', 'img_url', 'name']
Indexed set 

In [12]:
# Cell 7: Alternate Idea
import torch
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

class LegoSemanticSearch:
    def __init__(self, data_dir='women-in-ai-hackathon/data',
                 uri="",
                 token=""):
        # Load data
        self.data_path = Path(data_dir)
        self.dfs = self._load_data()

        # Initialize embedding model
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')

        # Connect to Milvus vector database
        connections.connect(uri=uri, token=token)
        self.collection = self._init_collection()

        # ID counter for vector database
        self.id_counter = 0

    def _load_data(self):
        files = ['parts', 'sets', 'inventory_parts', 'inventories', 'colors', 'themes']
        return {f: pd.read_csv(self.data_path / f'{f}.csv.csv') for f in files}

    def _init_collection(self):
        from pymilvus import utility

        if utility.has_collection("lego_semantic_sets"):
            utility.drop_collection("lego_semantic_sets")

        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="set_num", dtype=DataType.VARCHAR, max_length=20),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
            FieldSchema(name="set_description", dtype=DataType.VARCHAR, max_length=2000)
        ]
        schema = CollectionSchema(fields=fields, description="LEGO sets semantic search")
        collection = Collection(name="lego_semantic_sets", schema=schema)
        collection.create_index(
            field_name="embedding",
            index_params={
                "index_type": "IVF_FLAT",
                "metric_type": "COSINE",
                "params": {"nlist": 128}
            }
        )
        collection.load()
        return collection

    def create_set_description(self, set_row):
        """
        Create a descriptive text for a LEGO set
        """
        theme = self.dfs['themes'][self.dfs['themes']['id'] == set_row['theme_id']].iloc[0]['name']
        return f"{set_row['name']} - {theme} set from {set_row['year']} (Pieces: {set_row['num_parts']})"

    def index_sets(self, limit=None):
        """
        Index LEGO sets into vector database
        """
        sets_df = self.dfs['sets']
        if limit:
            sets_df = sets_df.head(limit)

        for _, set_row in sets_df.iterrows():
            description = self.create_set_description(set_row)
            embedding = self.encoder.encode(description)

            self.collection.insert([
                [self.id_counter],
                [set_row['set_num']],
                [embedding.tolist()],
                [description]
            ])
            self.id_counter += 1

        self.collection.flush()

    def semantic_search(self, query, top_k=5):
        """
        Perform semantic search on LEGO sets
        """
        # Encode query
        query_embedding = self.encoder.encode(query)

        # Search in vector database
        results = self.collection.search(
            data=[query_embedding.tolist()],
            anns_field="embedding",
            param={"metric_type": "COSINE", "params": {"nprobe": 10}},
            limit=top_k,
            output_fields=["set_num", "set_description"]
        )

        # Process and return results
        return [
            {
                'set_num': hit.entity.get('set_num'),
                'description': hit.entity.get('set_description'),
                'similarity': hit.score
            }
            for hit in results[0]
        ]

# Example usage
def main():
    # Initialize Lego Semantic Search
    lego_search = LegoSemanticSearch()

    # Index sets
    lego_search.index_sets(limit=100)

    # Perform semantic searches
    queries = [
        "Star Wars spaceship",
        "Castle medieval",
        "Technic vehicle",
        "Space exploration"
    ]

    for query in queries:
        print(f"\nSearch Query: {query}")
        results = lego_search.semantic_search(query)

        for result in results:
            print(f"Set: {result['set_num']} | Description: {result['description']} | Similarity: {result['similarity']:.4f}")

if __name__ == "__main__":
    main()


Search Query: Star Wars spaceship
Set: 0241357594-1 | Description: Star Wars: Build Your Own Adventure: Galactic Missions - Activity Books with LEGO Parts set from 2019 (Pieces: 70) | Similarity: 0.4674
Set: 0878119001641-1 | Description: Star Wars Battle Bridge storage case - Storage set from 2012 (Pieces: 0) | Similarity: 0.4055
Set: 1000368666-1 | Description: Batman the Videogame - PSP - Video Games and Accessories set from 2008 (Pieces: 0) | Similarity: 0.2974
Set: 1000430096-8 | Description: Marvel Super Heroes - PS4 - Video Games and Accessories set from 2013 (Pieces: 0) | Similarity: 0.2855
Set: 0015-1 | Description: Space Mini-Figures - Supplemental set from 1979 (Pieces: 18) | Similarity: 0.2855

Search Query: Castle medieval
Set: 0016-1 | Description: Castle Mini Figures - Classic Castle set from 1979 (Pieces: 15) | Similarity: 0.4971
Set: 0011-3 | Description: Castle 2 for 1 Bonus Offer - Lion Knights set from 1987 (Pieces: 0) | Similarity: 0.4387
Set: 10000-1 | Descriptio