In [1]:
# @title mounting and os.chdir
# This is an example of a hidden code cell.
from google.colab import drive
import os
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Make the project folder the working directory: later cells use relative
# paths such as 'requirements.txt', 'Datasets/...' and 'graph.html'.
os.chdir('/content/drive/MyDrive/ASHBoost_NumHackhackathon')

Mounted at /content/drive


# **🗃️ Installing libraries**

In [None]:
'''
Insall the required libraries from requirements.txt
'''
!pip install -r requirements.txt

# **📚 Importing & loading Data**

In [3]:
import pandas as pd
import IPython
import os
from pyvis.network import Network
import networkx as nx
from sentence_transformers import SentenceTransformer
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity
import random
from ipywidgets import widgets
from IPython.display import display

# Load the two input tables. The paths are relative, so they are resolved
# against the current working directory.
# Later cells read 'Knowledge Summary' and 'Portfolio or GitHub' from
# Contributors, and 'Project Title', 'Summary' and
# 'Contributor Roles Required' from project.
Contributors = pd.read_csv('Datasets/Contributors_with_Summary.csv') # DataFrame with one row per contributor
project = pd.read_csv('Datasets/AI_Projects_Populated_with_Summary.csv') # DataFrame with one row per project

  from tqdm.autonotebook import tqdm, trange


# **⚙️ Data Processing**

##  **Skill Matching Logic**


In [6]:
# Import required libraries
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import ast
import random

# Load the SentenceTransformer model once, at notebook level.
# get_similarities() below reads it through the global name `model`.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to calculate similarities between project requirements and contributor knowledge
def get_similarities(data_, project_):
    """Score every contributor against one project and each of its required roles.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Contributor table. Must contain the columns 'Knowledge Summary' and
        'Portfolio or GitHub'. The frame passed in is not modified.
    project_ : pandas.Series or mapping
        One project. Must provide 'Project Title', 'Summary' and
        'Contributor Roles Required'; the latter is a string holding a Python
        list literal of role names (it is parsed with ast.literal_eval).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data_`` with these columns added:

        * 'simlar' - cosine similarity between the project summary and each
          contributor's knowledge summary (only when the project lists at
          least one role). The column name is kept as-is for compatibility.
        * '<role>_avg' - mean of the role-specific similarity and 'simlar',
          one column per required role.
        * 'project_title', 'project_skills_required', 'project_summary' -
          project metadata repeated on every row.
        * 'name' - last path segment of 'Portfolio or GitHub'.

    Notes
    -----
    Relies on the notebook-level ``model`` (SentenceTransformer) and on
    ``cosine_similarity`` being in scope.
    """
    roles = ast.literal_eval(project_['Contributor Roles Required'])

    # Work on a copy so the caller's DataFrame never gains extra columns.
    result = data_.copy()

    if roles:
        # The contributor summaries are the same for every role, so they are
        # encoded exactly once instead of once per role.
        resume_embeddings = model.encode(list(data_['Knowledge Summary'].values))

        def _similarity_to(text):
            """Return the 1-D cosine similarities of `text` to every contributor summary."""
            text_embedding = model.encode([text])[0]
            return cosine_similarity([text_embedding], resume_embeddings)[0]

        general_similarity = _similarity_to(project_['Summary'])
        result['simlar'] = general_similarity

        # dict.fromkeys removes repeated role names while keeping their order.
        # The averaged score is written directly, so no intermediate per-role
        # column is created that could overwrite an existing column.
        for role in dict.fromkeys(roles):
            result[role + '_avg'] = (_similarity_to(role) + general_similarity) / 2

    # Project metadata: the same scalar value is broadcast to every row.
    result['project_title'] = project_['Project Title']
    result['project_skills_required'] = project_['Contributor Roles Required']
    result['project_summary'] = project_['Summary']

    # Contributor name = last path segment of the portfolio URL. Trailing
    # slashes are stripped first so 'https://host/user/' yields 'user', not ''.
    result['name'] = (
        result['Portfolio or GitHub'].str.rstrip('/').str.split('/').str[-1]
    )

    return result

# Function to process all projects and combine the results into one DataFrame
def allCombined():
    """Run get_similarities for every project and stack the results.

    Reads the notebook-level ``project`` and ``Contributors`` DataFrames.

    Returns
    -------
    pandas.DataFrame
        The per-project frames returned by get_similarities, concatenated
        with a fresh 0..n-1 index. An empty DataFrame when ``project`` has
        no rows.
    """
    # Collect the per-project frames first and concatenate once: calling
    # pd.concat inside the loop re-copies all accumulated rows on every pass.
    per_project_frames = [
        get_similarities(Contributors.copy(), row)
        for _, row in project.iterrows()
    ]

    # pd.concat raises on an empty list, so keep the original empty result.
    if not per_project_frames:
        return pd.DataFrame()

    return pd.concat(per_project_frames, ignore_index=True)

# Function to assign random colors to a list of skills
def assign_random_colors(skills, seed=None):
    """Give every skill its own color name, chosen in random order.

    Parameters
    ----------
    skills : sized iterable
        The skills to color.
    seed : optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        same skills always get the same colors. When omitted (the default),
        the global ``random`` module state is used, as before.

    Returns
    -------
    dict
        Maps each skill to a color name; no color is used twice.

    Raises
    ------
    ValueError
        If there are more skills than available colors (26).
    """
    # List of available colors for assignment
    available_colors = [
        "red", "blue", "green", "yellow", "orange", "purple", "pink", "brown",
        "black", "gray", "cyan", "magenta", "teal", "indigo", "violet",
        "maroon", "olive", "lime", "navy", "beige", "silver", "gold", "coral", "lavender",
        "turquoise", "salmon"
    ]

    # Ensure there are enough colors to assign
    if len(available_colors) < len(skills):
        raise ValueError("Not enough colors available for all skills.")

    # Shuffle the colors for random assignment
    if seed is None:
        random.shuffle(available_colors)
    else:
        random.Random(seed).shuffle(available_colors)

    # Pair the i-th skill with the i-th shuffled color; zip stops at the
    # shorter sequence, so surplus colors are simply left unused.
    return dict(zip(skills, available_colors))

In [7]:
# Build the combined table: allCombined() stacks the get_similarities() result
# of every project, so `data` holds one row per contributor per project.
data = allCombined()

## **Assigning colors to skills**

In [8]:
# Extract the distinct skills named in the 'project_skills_required' column.
# Every contributor row of a project repeats the same string, so the distinct
# strings are taken first and each one is parsed only once.
unique_role_lists = data['project_skills_required'].dropna().unique()

# Each string is a Python list literal; flatten them into one set of skills.
# Sorting gives a stable order (plain set order can change between runs).
skills = sorted({
    skill
    for role_list in unique_role_lists
    for skill in ast.literal_eval(role_list)
})

# Assign a random unique color to each skill using the assign_random_colors function
skills_colors = assign_random_colors(skills)

## **DiGraph**

In [9]:
G = nx.DiGraph()

# Contributor nodes: one per distinct (name, knowledge summary) pair; the
# summary becomes the node's title attribute.
contributor_nodes = data[['name', 'Knowledge Summary']].dropna().drop_duplicates()
for _, row in contributor_nodes.iterrows():
    G.add_node(row['name'], title=row['Knowledge Summary'])

# Project nodes: one per distinct (title, summary) pair.
project_nodes = data[['project_title', 'project_summary']].dropna().drop_duplicates()
for _, row in project_nodes.iterrows():
    G.add_node(row['project_title'], title=row['project_summary'])

# Edges: for every skill a project requires, link its 3 best-scoring
# contributors to the project. Grouping once avoids string-built queries
# (which break on titles containing an apostrophe) and repeated filtering.
for project_title, project_rows in data.groupby('project_title', sort=False):
    # Every row of a project carries the same skills string, so take the first.
    project_skills = ast.literal_eval(project_rows['project_skills_required'].iloc[0])

    for skill in project_skills:
        score_col = skill + '_avg'
        best_score = project_rows[score_col].max()
        top_contributors = project_rows.sort_values(by=score_col, ascending=False).head(3)

        for contributor, score in zip(top_contributors['name'], top_contributors[score_col]):
            # Weight is the score relative to the best score for this skill
            # (the best contributor gets 1.0); guard against a zero maximum.
            weight = float(score / best_score) if best_score else 0.0
            # NOTE(review): a DiGraph keeps a single edge per (contributor,
            # project) pair, so when one contributor is in the top 3 for
            # several skills only the last skill's weight/color is kept.
            G.add_edge(contributor, project_title, weight=weight, color=skills_colors[skill])

nx.write_graphml(G, "user_project_graph.graphml")

In [10]:
# Read back the graph that was serialised to GraphML.
G = nx.read_graphml("user_project_graph.graphml")

# Settings for the Pyvis canvas, gathered in one place.
network_options = dict(
    height="800px",          # canvas height
    width="100%",            # canvas width
    directed=True,           # treat the graph as directed
    cdn_resources="remote",  # use remotely hosted resources
    notebook=True,           # notebook-compatible output
    filter_menu=True,        # enable the filter menu
)
nt = Network(**network_options)

# Copy the NetworkX nodes and edges into the Pyvis network.
nt.from_nx(G)

# Layout via force_atlas_2based with the spring strength set explicitly.
nt.force_atlas_2based(spring_strength=0.01)

# Show only the 'physics' option buttons in the page.
nt.show_buttons(filter_=['physics'])

# Write the visualisation to an HTML file.
nt.save_graph("graph.html")

## **Customize html**

In [11]:
# Function to create an HTML div element for a skill with its associated color
def create_div(skill, color):
    """Build one legend entry: a small colored square followed by the skill name.

    Parameters
    ----------
    skill : str
        Text shown next to the square. It is HTML-escaped, so characters such
        as '&', '<' and '>' are displayed literally instead of being parsed
        as markup.
    color : str
        CSS color used as the square's background. It is escaped as well,
        because it is placed inside a quoted attribute value.

    Returns
    -------
    str
        The HTML fragment for the entry.
    """
    from html import escape

    safe_skill = escape(str(skill), quote=True)
    safe_color = escape(str(color), quote=True)

    # Both values are substituted in a single pass, so placeholder-like text
    # inside a skill name can never be replaced by a later substitution.
    div = f'''
        <div style="display: flex; align-items: center; gap: 5px;">
            <div style="width: 10px; height: 10px; background-color: {safe_color}; display: inline-block;"></div>
            <span>{safe_skill}</span>
        </div>'''
    return div

# The legend container carries this id so that re-running the cell can detect
# an existing legend instead of inserting a second one.
LEGEND_ID = 'skill-legend'

# Open the existing graph HTML file for reading
with open('graph.html', 'r') as file:
    html_content = file.read()

if f'id="{LEGEND_ID}"' not in html_content:
    # One entry (colored square + label) per skill, in the order of skills_colors.
    legend_items = ' '.join(
        create_div(skill, color) for skill, color in skills_colors.items()
    )
    legend = (
        f'<div id="{LEGEND_ID}" style="display: flex; align-items: center; gap: 10px;"> '
        f'{legend_items}</div>'
    )

    # Insert the finished legend directly after the opening <body> tag.
    body_tag = '<body>'
    body_pos = html_content.find(body_tag)
    if body_pos == -1:
        raise ValueError("graph.html has no <body> tag; cannot insert the skill legend.")
    insert_at = body_pos + len(body_tag)
    html_content = html_content[:insert_at] + legend + html_content[insert_at:]

    # Write the modified HTML content back to the file
    with open('graph.html', 'w') as file:
        file.write(html_content)

## **exporting data**

In [12]:
# Collect one record per (project, skill, top contributor) and build the
# DataFrame once at the end; concatenating row by row re-copies the whole
# table on every iteration.
match_records = []

# Group once by project instead of filtering with a string-built query, which
# fails for titles that contain an apostrophe.
for project_title, project_rows in data.groupby('project_title', sort=False):
    # Every row of a project carries the same skills string, so take the first.
    project_skills = ast.literal_eval(project_rows['project_skills_required'].iloc[0])

    for skill in project_skills:
        score_col = skill + '_avg'
        # Take the top 3 contributors for the skill
        top_contributors = project_rows.sort_values(by=score_col, ascending=False).head(3)

        for contributor, score in zip(top_contributors['name'], top_contributors[score_col]):
            match_records.append({
                'project_title': project_title,  # Project title
                'name': contributor,             # Contributor name
                'skill': skill,                  # Skill being considered
                'score': score,                  # Average similarity score for the skill
            })

# Passing `columns` keeps the expected columns even when there are no records.
output = pd.DataFrame(match_records, columns=['project_title', 'name', 'skill', 'score'])


# **📊 Data Visualization**

In [13]:
# Display the interactive graph: load graph.html into an HTML display object,
# which is shown as the cell output because it is the cell's last expression.
IPython.display.HTML(filename="graph.html")

# **🚀 Interactive Widgets**

In [14]:
# Projects offered in the project dropdown; the first one is selected initially.
project_titles = output['project_title'].unique().tolist()
initial_project = project_titles[0]

# Start the skill dropdown with the skills of the initially selected project
# only, matching what update_dropdown2 shows after a project change.
initial_skills = output.loc[output['project_title'] == initial_project, 'skill'].unique().tolist()

# Create a dropdown widget for selecting skills
skill_widget = widgets.Dropdown(
    options=initial_skills,   # Skills required by the initially selected project
    value=initial_skills[0],  # Default value is the first of those skills
    description="skill",      # Label for the widget
)

# Create a dropdown widget for selecting projects
project_widget = widgets.Dropdown(
    options=project_titles,   # Unique project titles from the 'output' DataFrame
    value=initial_project,    # Default value is the first project title
    description="Project",    # Label for the widget
)

# Function to update the skill dropdown based on the selected project
def update_dropdown2(change):
    """Refresh the skill dropdown after the project selection changed.

    Parameters
    ----------
    change : dict
        Change notification from the project dropdown; change['new'] holds
        the newly selected project title.

    Side effects
    ------------
    Replaces the options of the notebook-level ``skill_widget`` with the
    skills listed for the selected project in ``output`` and clears its
    current selection. With no project selected, the options are emptied.
    """
    selected_project = change['new']

    if selected_project:
        # Compare values directly rather than building a query string, which
        # fails when the title contains an apostrophe.
        project_rows = output[output['project_title'] == selected_project]
        skill_widget.options = project_rows['skill'].unique().tolist()
        skill_widget.value = None  # Reset the skill selection
    else:
        skill_widget.options = []  # If no project is selected, clear the skill options

# Attach the update function to the project dropdown (triggered when the 'value' of project_widget changes)
project_widget.observe(update_dropdown2, names='value')

# Function to filter and display the dataframe based on selected project and skill
def show_df(project=project_widget, skill=skill_widget):
    """Display the rows of ``output`` for one project and one skill.

    Parameters
    ----------
    project : str
        Project title to show. The default is the project dropdown widget,
        which widgets.interact uses as the control for this argument.
    skill : str or None
        Skill to show. The default is the skill dropdown widget. When no
        skill is selected (None), no row matches and an empty table is shown.
    """
    # Boolean masks compare the values directly, so titles or skills that
    # contain quote characters work (a string-built query would fail on them).
    selected_rows = output[(output['project_title'] == project) & (output['skill'] == skill)]
    display(selected_rows)  # Display the filtered DataFrame

# Create an interactive widget for displaying the filtered dataframe
my_gadget = widgets.interact(show_df)


interactive(children=(Dropdown(description='Project', options=('AI-Powered Chatbot for Customer Support', 'Ima…