In [None]:

from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from technologies import technologies\

all_docs = []
for category in ["frontend", "backend", "tools"]:
    for tech in technologies[category]:
        all_docs.append(
            Document(
                page_content=tech,
                metadata={"category": category}
            )
        )
print(all_docs)


[Document(metadata={'category': 'frontend'}, page_content='React'), Document(metadata={'category': 'frontend'}, page_content='Angular'), Document(metadata={'category': 'frontend'}, page_content='Vue.js'), Document(metadata={'category': 'frontend'}, page_content='Svelte'), Document(metadata={'category': 'frontend'}, page_content='Preact'), Document(metadata={'category': 'frontend'}, page_content='Ember.js'), Document(metadata={'category': 'frontend'}, page_content='Backbone.js'), Document(metadata={'category': 'frontend'}, page_content='jQuery'), Document(metadata={'category': 'frontend'}, page_content='SolidJS'), Document(metadata={'category': 'frontend'}, page_content='Qwik'), Document(metadata={'category': 'frontend'}, page_content='Alpine.js'), Document(metadata={'category': 'frontend'}, page_content='Bootstrap'), Document(metadata={'category': 'frontend'}, page_content='Tailwind CSS'), Document(metadata={'category': 'frontend'}, page_content='Materialize'), Document(metadata={'cate

In [2]:

vectorstore = Chroma.from_documents(
    all_docs,
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    persist_directory="tech_dump_vector_store"
)
print("Single Chroma collection created with", len(all_docs), "technologies across frontend, backend, and tools.")

  from .autonotebook import tqdm as notebook_tqdm


Single Chroma collection created with 181 technologies across frontend, backend, and tools.


In [4]:
from langchain_chroma import Chroma
vector_presist=Chroma(
    persist_directory="tech_dump_vector_store",
    embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:

res=vector_presist.similarity_search_with_score(
    query="API",
    k=5
)
print(res)


[(Document(id='92ec566c-7fd2-479c-821c-4c0ba8629a11', metadata={'category': 'backend'}, page_content='RESTful APIs'), 0.6424062252044678), (Document(id='085e91ab-9234-47b3-9242-f6d58f7ec188', metadata={'category': 'tools'}, page_content='Swagger / OpenAPI'), 1.039792776107788), (Document(id='38e913f0-0f10-40f1-8e71-2e560c90f977', metadata={'category': 'tools'}, page_content='Datadog'), 1.2182546854019165), (Document(id='04be0715-f16b-40ad-8a77-7892b750d49d', metadata={'category': 'backend'}, page_content='OAuth 2.0'), 1.2219806909561157), (Document(id='e9a0620f-7d66-470c-80ab-6c429ba73cf7', metadata={'category': 'backend'}, page_content='FastAPI'), 1.275020718574524)]


# Project-Skill Matching Strategy

## Approach:
1. **Technology Reference Store**: Store all available technologies (completed above)
2. **Project Tech Stacks**: Store each project with its specific tech requirements
3. **Skill Matching**: Match person's skills against project requirements using semantic similarity

## Two Types of Matching:
- **Direct Match**: Person has exact technology (React ‚Üí React)
- **Semantic Match**: Person has related skills (Vue.js ‚Üí React, both frontend frameworks)

In [None]:
# Let's create project tech stack storage
from langchain.schema import Document

# Example projects with their tech stacks
example_projects = [
    {
        "name": "E-commerce Platform",
        "description": "A modern e-commerce platform with real-time features",
        "tech_stack": ["React", "Node.js", "MongoDB", "Express.js", "Redux", "Docker", "AWS"],
        "difficulty": "intermediate",
        "type": "web_application"
    },
    {
        "name": "Machine Learning Pipeline",
        "description": "Data processing and ML model deployment system",
        "tech_stack": ["Python", "FastAPI", "PostgreSQL", "Docker", "Kubernetes", "TensorFlow", "Redis"],
        "difficulty": "advanced",
        "type": "ml_project"
    },
    {
        "name": "Mobile Chat App",
        "description": "Cross-platform mobile messaging application",
        "tech_stack": ["React Native", "Node.js", "Socket.io", "MongoDB", "Firebase", "JWT"],
        "difficulty": "intermediate",
        "type": "mobile_app"
    },
    {
        "name": "DevOps Automation Tool",
        "description": "Infrastructure automation and monitoring dashboard",
        "tech_stack": ["Go", "Docker", "Kubernetes", "Prometheus", "Grafana", "Terraform", "PostgreSQL"],
        "difficulty": "advanced",
        "type": "devops_tool"
    }
]

# Create documents for project tech stacks
project_docs = []
for project in example_projects:
    # Create a document with all technologies as content for semantic search
    tech_content = " ".join(project["tech_stack"])
    project_docs.append(
        Document(
            page_content=f"{project['name']}: {project['description']} Technologies: {tech_content}",
            metadata={
                "project_name": project["name"],
                "tech_stack": project["tech_stack"],
                "difficulty": project["difficulty"],
                "type": project["type"],
                "tech_count": len(project["tech_stack"])
            }
        )
    )

print("Project documents created:")
for doc in project_docs:
    print(f"- {doc.metadata['project_name']}: {len(doc.metadata['tech_stack'])} technologies")

In [None]:
# Create separate project vector store
project_vectorstore = Chroma.from_documents(
    project_docs,
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    persist_directory="project_vector_store"
)
print(f"Project vector store created with {len(project_docs)} projects")

In [None]:
# Skill Matching Function
def match_person_to_projects(person_skills, top_k=3):
    """
    Match a person's skills to the most suitable projects
    
    Args:
        person_skills (list): List of person's technical skills
        top_k (int): Number of top projects to return
    
    Returns:
        list: Matched projects with scores and analysis
    """
    # Create a query string from person's skills
    skills_query = " ".join(person_skills)
    
    # Search for matching projects
    results = project_vectorstore.similarity_search_with_score(
        query=skills_query,
        k=top_k
    )
    
    matched_projects = []
    for doc, score in results:
        project_tech_stack = doc.metadata['tech_stack']
        
        # Calculate direct matches
        direct_matches = set(person_skills) & set(project_tech_stack)
        missing_skills = set(project_tech_stack) - set(person_skills)
        
        match_info = {
            "project_name": doc.metadata['project_name'],
            "similarity_score": round(1 - score, 3),  # Convert distance to similarity
            "direct_matches": list(direct_matches),
            "missing_skills": list(missing_skills),
            "match_percentage": round(len(direct_matches) / len(project_tech_stack) * 100, 1),
            "difficulty": doc.metadata['difficulty'],
            "project_type": doc.metadata['type'],
            "total_tech_count": doc.metadata['tech_count']
        }
        matched_projects.append(match_info)
    
    return matched_projects

# Example: Person's skills
person_skills_example = ["Python", "React", "Docker", "PostgreSQL", "FastAPI"]

print("Person's Skills:", person_skills_example)
print("\n" + "="*50)
print("MATCHING PROJECTS:")
print("="*50)

matches = match_person_to_projects(person_skills_example)
for i, match in enumerate(matches, 1):
    print(f"\n{i}. {match['project_name']}")
    print(f"   Similarity Score: {match['similarity_score']}")
    print(f"   Direct Matches ({match['match_percentage']}%): {match['direct_matches']}")
    print(f"   Missing Skills: {match['missing_skills']}")
    print(f"   Difficulty: {match['difficulty']}")
    print(f"   Type: {match['project_type']}")

In [None]:
# Advanced Skill Matching with Semantic Expansion
def find_similar_technologies(skill, top_k=3):
    """Find semantically similar technologies for a given skill"""
    results = vector_presist.similarity_search_with_score(
        query=skill,
        k=top_k
    )
    return [(doc.page_content, round(1-score, 3)) for doc, score in results]

def enhanced_skill_matching(person_skills, expand_skills=True):
    """
    Enhanced matching that includes semantic similarity
    """
    print("=== ENHANCED SKILL MATCHING ===")
    print(f"Original Skills: {person_skills}")
    
    if expand_skills:
        print("\nSemantic Skill Expansion:")
        expanded_skills = set(person_skills)
        
        for skill in person_skills:
            similar_techs = find_similar_technologies(skill, top_k=2)
            print(f"  {skill} ‚Üí Similar: {[tech for tech, score in similar_techs if tech != skill]}")
            # Add similar technologies with high similarity (> 0.7)
            for tech, score in similar_techs:
                if score > 0.7 and tech not in person_skills:
                    expanded_skills.add(tech)
        
        expanded_skills = list(expanded_skills)
        print(f"\nExpanded Skills: {expanded_skills}")
    else:
        expanded_skills = person_skills
    
    # Match with expanded skills
    matches = match_person_to_projects(expanded_skills)
    
    print(f"\n{'='*60}")
    print("PROJECT RECOMMENDATIONS:")
    print('='*60)
    
    for i, match in enumerate(matches, 1):
        print(f"\nüéØ {i}. {match['project_name']}")
        print(f"   üìä Match: {match['match_percentage']}% | Similarity: {match['similarity_score']}")
        print(f"   ‚úÖ You Know: {match['direct_matches']}")
        print(f"   üìö Learn: {match['missing_skills'][:3]}{'...' if len(match['missing_skills']) > 3 else ''}")
        print(f"   üéöÔ∏è  Difficulty: {match['difficulty']} | Type: {match['project_type']}")
    
    return matches

# Test with different skill sets
print("EXAMPLE 1: Full-Stack Developer")
fullstack_skills = ["React", "Node.js", "MongoDB", "Express.js"]
enhanced_skill_matching(fullstack_skills)

In [None]:
print("\n" + "="*60)
print("EXAMPLE 2: DevOps Engineer")
devops_skills = ["Docker", "Kubernetes", "Python", "Terraform"]
enhanced_skill_matching(devops_skills)

print("\n" + "="*60)
print("EXAMPLE 3: Frontend Specialist")
frontend_skills = ["Vue.js", "TypeScript", "Sass"]
enhanced_skill_matching(frontend_skills)

## Summary: Vector Database Strategy for Project-Skill Matching

### ‚úÖ What We've Built:

1. **Technology Reference Store** (`tech_dump_vector_store/`)
   - Contains all available technologies from your `technologies.py`
   - Used for semantic similarity searches
   - Helps find related technologies (Vue.js ‚Üî React)

2. **Project Vector Store** (`project_vector_store/`)
   - Stores each project with its tech stack requirements
   - Includes project metadata (difficulty, type, etc.)
   - Enables project discovery based on skills

3. **Smart Matching System**
   - **Direct Matching**: Exact skill matches
   - **Semantic Matching**: Related technology matching
   - **Skill Expansion**: Finds similar technologies you might also know
   - **Gap Analysis**: Shows what skills you need to learn


### üéØ How to Use This for Your OpenSource Hunt:

1. **For Project Maintainers**: Store their project's tech stack in the project vector store
2. **For Contributors**: Input their skills and get matched projects
3. **For Skill Development**: See what technologies to learn for desired projects

### üöÄ Next Steps:
- Run the cells above to see the matching in action
- Integrate this with your GitHub repository analysis
- Add more projects from real GitHub repositories
- Create a web interface for easy skill-to-project matching