In [19]:
%pip install seaborn

/Users/chandraprakashbathula/.zshenv:.:1: no such file or directory: /Users/chandraprakashbathula/.cargo/env

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [24]:
import pandas as pd
import numpy as np
from collections import defaultdict

def analyze_common_tools():
    # File names
    files = [
        'NetworkandComputerSystemsAdministrators.csv',
        'ComputerProgrammers.csv',
        'ComputerNetworkArchitects.csv',
        'ComputerandInformationSystemsManagers.csv'
    ]
    
    # Dictionary to store tools and their occurrences
    tool_occurrences = defaultdict(set)
    job_roles = []
    
    # Process each file
    for file in files:
        job_role = file.replace('.csv', '')
        job_roles.append(job_role)
        
        df = pd.read_csv(file)
        for _, row in df.iterrows():
            tool_occurrences[row['Example']].add(job_role)
    
    # Create a list of tools that appear in multiple roles
    common_tools = []
    for tool, roles in tool_occurrences.items():
        if len(roles) > 1:  # Tool appears in more than one role
            common_tools.append({
                'Tool': tool,
                'Frequency': len(roles),
                'Roles': '; '.join(roles)
            })
    
    # Convert to DataFrame and sort by frequency
    common_tools_df = pd.DataFrame(common_tools)
    common_tools_df = common_tools_df.sort_values('Frequency', ascending=False)
    
    return common_tools_df

# Execute analysis
results_df = analyze_common_tools()
print(f"Found {len(results_df)} common tools")
print("\nTop 10 most common tools:")
print(results_df.head(10))

# Save results for visualization
results_df.to_csv('common_tools.csv', index=False)

Found 244 common tools

Top 10 most common tools:
                                                Tool  Frequency  \
122                         Quest Erwin Data Modeler          4   
149                                               C#          4   
81                            Microsoft Visual Basic          4   
79                          Microsoft Azure software          4   
78                          Microsoft .NET Framework          4   
77   Integrated development environment IDE software          4   
198                                    Apache Tomcat          4   
75                                       Eclipse IDE          4   
74           Common business oriented language COBOL          4   
73                                                 C          4   

                                                                                                                              Roles  
122  NetworkandComputerSystemsAdministrators; ComputerandInformationSystemsMan

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

def analyze_and_visualize_tools():
    # File names
    files = [
        'NetworkandComputerSystemsAdministrators.csv',
        'ComputerProgrammers.csv',
        'ComputerNetworkArchitects.csv',
        'ComputerandInformationSystemsManagers.csv'
    ]
    
    # Dictionary to store tools and their occurrences
    tool_occurrences = defaultdict(set)
    category_occurrences = defaultdict(set)
    job_roles = []
    
    # Process each file
    for file in files:
        job_role = file.replace('.csv', '').replace('and', '&')
        job_roles.append(job_role)
        
        df = pd.read_csv(file)
        for _, row in df.iterrows():
            if pd.notna(row['Example']):  # Check if Example is not NaN
                tool_occurrences[row['Example']].add(job_role)
                category_occurrences[row['Category']].add(job_role)
    
    # Create DataFrames for analysis
    tools_data = []
    for tool, roles in tool_occurrences.items():
        tools_data.append({
            'Tool': tool,
            'Frequency': len(roles),
            'Roles': '; '.join(roles)
        })
    
    categories_data = []
    for category, roles in category_occurrences.items():
        categories_data.append({
            'Category': category,
            'Frequency': len(roles),
            'Roles': '; '.join(roles)
        })
    
    tools_df = pd.DataFrame(tools_data)
    categories_df = pd.DataFrame(categories_data)
    
    # Sort by frequency
    tools_df = tools_df.sort_values('Frequency', ascending=False)
    categories_df = categories_df.sort_values('Frequency', ascending=False)
    
    # Create visualizations
    plt.rcParams['figure.figsize'] = [15, 10]
    
    # 1. Scatter plot of tools
    plt.figure()
    plt.subplot(2, 2, 1)
    plt.scatter(tools_df['Frequency'], 
                np.random.normal(0, 0.1, size=len(tools_df)) + tools_df['Frequency'],
                alpha=0.5,
                c='blue')
    plt.title('Tool Distribution Across Roles', pad=20)
    plt.xlabel('Number of Roles Using Tool')
    plt.ylabel('Distribution')
    plt.grid(True, linestyle='--', alpha=0.7)
    
    # 2. Category frequency bar plot
    plt.subplot(2, 2, 2)
    top_categories = categories_df.head(10)
    bars = plt.barh(range(len(top_categories)), top_categories['Frequency'], color='skyblue')
    plt.yticks(range(len(top_categories)), top_categories['Category'], fontsize=8)
    plt.title('Top 10 Most Common Categories', pad=20)
    plt.xlabel('Number of Roles')
    
    # Add value labels on the bars
    for bar in bars:
        width = bar.get_width()
        plt.text(width, bar.get_y() + bar.get_height()/2, 
                f'{int(width)}', 
                ha='left', va='center', fontsize=8)
    
    # 3. Role overlap matrix
    role_matrix = np.zeros((len(job_roles), len(job_roles)))
    for roles in tool_occurrences.values():
        roles = list(roles)
        for i, role1 in enumerate(job_roles):
            for j, role2 in enumerate(job_roles):
                if role1 in roles and role2 in roles:
                    role_matrix[i][j] += 1
    
    plt.subplot(2, 2, 3)
    im = plt.imshow(role_matrix, cmap='YlOrRd')
    plt.colorbar(im)
    
    # Add text annotations to the heatmap
    for i in range(len(job_roles)):
        for j in range(len(job_roles)):
            plt.text(j, i, int(role_matrix[i, j]),
                    ha='center', va='center',
                    color='black' if role_matrix[i, j] < role_matrix.max()/2 else 'white')
    
    plt.xticks(range(len(job_roles)), [r.split('Computer')[0] for r in job_roles], 
               rotation=45, ha='right')
    plt.yticks(range(len(job_roles)), [r.split('Computer')[0] for r in job_roles])
    plt.title('Tool Sharing Between Roles', pad=20)
    
    # 4. Tool frequency distribution
    plt.subplot(2, 2, 4)
    n, bins, patches = plt.hist(tools_df['Frequency'], bins=range(1, 6), 
                               align='left', rwidth=0.8, color='skyblue')
    plt.title('Distribution of Tool Usage', pad=20)
    plt.xlabel('Number of Roles Using Tool')
    plt.ylabel('Number of Tools')
    
    # Add value labels on top of bars
    for i in range(len(n)):
        plt.text(bins[i], n[i], int(n[i]), ha='center', va='bottom')
    
    plt.tight_layout(pad=3.0)
    plt.savefig('tool_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Create detailed visualization for top tools
    plt.figure(figsize=(15, 8))
    top_tools = tools_df[tools_df['Frequency'] > 2].head(20)
    
    bars = plt.barh(range(len(top_tools)), top_tools['Frequency'], color='skyblue')
    plt.yticks(range(len(top_tools)), top_tools['Tool'], fontsize=8)
    plt.title('Top Tools Used Across Multiple Roles', pad=20)
    plt.xlabel('Number of Roles')
    
    # Add value labels on the bars
    for bar in bars:
        width = bar.get_width()
        plt.text(width, bar.get_y() + bar.get_height()/2, 
                f'{int(width)}', 
                ha='left', va='center')
    
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('top_tools.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    return tools_df, categories_df

# Execute analysis and create visualizations
tools_df, categories_df = analyze_and_visualize_tools()

print("\nTop 10 most common tools:")
print(tools_df.head(10)[['Tool', 'Frequency', 'Roles']])
print("\nTop 10 most common categories:")
print(categories_df.head(10)[['Category', 'Frequency']])


Top 10 most common tools:
                               Tool  Frequency  \
256                         Node.js          4   
124      IBM Power Systems software          4   
249  Hypertext markup language HTML          4   
117      Oracle Fusion Applications          4   
114              Microsoft Dynamics          4   
250                      JavaScript          4   
111        Oracle Fusion Middleware          4   
252                      LAMP Stack          4   
109        IBM InfoSphere DataStage          4   
108  Extensible markup language XML          4   

                                                                                                                          Roles  
256  Computer&InformationSystemsManagers; ComputerNetworkArchitects; ComputerProgrammers; Network&ComputerSystemsAdministrators  
124  Computer&InformationSystemsManagers; ComputerNetworkArchitects; ComputerProgrammers; Network&ComputerSystemsAdministrators  
249  Computer&InformationSystems

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage

def create_advanced_visualizations():
    # Read and process files
    files = [
        'NetworkandComputerSystemsAdministrators.csv',
        'ComputerProgrammers.csv',
        'ComputerNetworkArchitects.csv',
        'ComputerandInformationSystemsManagers.csv'
    ]
    
    # Create a matrix of tools vs roles
    tools_by_role = {}
    categories_by_role = {}
    all_tools = set()
    all_categories = set()
    
    for file in files:
        role = file.replace('.csv', '').split('Computer')[0].strip()
        df = pd.read_csv(file)
        
        tools = set(df['Example'].dropna())
        categories = set(df['Category'].dropna())
        
        all_tools.update(tools)
        all_categories.update(categories)
        
        tools_by_role[role] = tools
        categories_by_role[role] = categories
    
    # Create binary matrices
    roles = list(tools_by_role.keys())
    tools_list = sorted(list(all_tools))
    categories_list = sorted(list(all_categories))
    
    tool_matrix = pd.DataFrame(0, index=tools_list, columns=roles)
    category_matrix = pd.DataFrame(0, index=categories_list, columns=roles)
    
    # Fill matrices
    for role in roles:
        for tool in tools_by_role[role]:
            tool_matrix.loc[tool, role] = 1
        for category in categories_by_role[role]:
            category_matrix.loc[category, role] = 1
    
    # 1. Tool Usage Heat Map with top tools
    top_tools = tool_matrix.sum(axis=1).sort_values(ascending=False).head(30).index
    plt.figure(figsize=(12, 16))
    plt.imshow(tool_matrix.loc[top_tools], aspect='auto', cmap='YlOrRd')
    plt.colorbar(label='Present in Role')
    plt.xticks(range(len(roles)), roles, rotation=45, ha='right')
    plt.yticks(range(len(top_tools)), top_tools, fontsize=8)
    plt.title('Tool Usage Across Roles (Top 30 Tools)')
    plt.tight_layout()
    plt.savefig('tool_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 2. Category Heat Map
    plt.figure(figsize=(12, 16))
    plt.imshow(category_matrix, aspect='auto', cmap='YlOrRd')
    plt.colorbar(label='Present in Role')
    plt.xticks(range(len(roles)), roles, rotation=45, ha='right')
    plt.yticks(range(len(categories_list)), categories_list, fontsize=8)
    plt.title('Category Usage Across Roles')
    plt.tight_layout()
    plt.savefig('category_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Tool Co-occurrence Matrix for top tools
    tool_cooccurrence = tool_matrix.T.dot(tool_matrix)
    top_20_tools = tool_matrix.sum(axis=1).sort_values(ascending=False).head(20).index
    
    plt.figure(figsize=(15, 15))
    plt.imshow(tool_cooccurrence.loc[top_20_tools, top_20_tools], cmap='viridis')
    plt.colorbar(label='Co-occurrence Count')
    plt.xticks(range(len(top_20_tools)), top_20_tools, rotation=90, fontsize=8)
    plt.yticks(range(len(top_20_tools)), top_20_tools, fontsize=8)
    plt.title('Tool Co-occurrence Matrix (Top 20 Tools)')
    plt.tight_layout()
    plt.savefig('tool_cooccurrence.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 4. Scatter plot of tool prevalence vs category size
    tool_counts = tool_matrix.sum(axis=1)
    category_sizes = {}
    category_avg_prevalence = {}
    
    # Read one file to get category-tool relationships
    df = pd.read_csv(files[0])
    for category in all_categories:
        tools_in_category = df[df['Category'] == category]['Example'].dropna()
        if len(tools_in_category) > 0:
            category_sizes[category] = len(tools_in_category)
            category_avg_prevalence[category] = tool_counts[tools_in_category].mean()
    
    plt.figure(figsize=(15, 10))
    sizes = list(category_sizes.values())
    prevalences = list(category_avg_prevalence.values())
    
    plt.scatter(sizes, prevalences, alpha=0.6, s=100)
    
    # Annotate points
    for category, size in category_sizes.items():
        prevalence = category_avg_prevalence[category]
        if size > np.mean(sizes) or prevalence > np.mean(prevalences):
            plt.annotate(category, (size, prevalence), 
                        fontsize=8, alpha=0.7,
                        xytext=(5, 5), textcoords='offset points')
    
    plt.xlabel('Number of Tools in Category')
    plt.ylabel('Average Tool Usage Across Roles')
    plt.title('Category Analysis: Size vs Average Usage')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('category_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 5. Role Similarity Matrix
    role_similarity = tool_matrix.T.dot(tool_matrix)
    plt.figure(figsize=(10, 10))
    plt.imshow(role_similarity, cmap='YlOrRd')
    plt.colorbar(label='Number of Shared Tools')
    plt.xticks(range(len(roles)), roles, rotation=45, ha='right')
    plt.yticks(range(len(roles)), roles)
    plt.title('Role Similarity Based on Shared Tools')
    plt.tight_layout()
    plt.savefig('role_similarity.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 6. Tool Distribution Violin Plot
    plt.figure(figsize=(12, 6))
    tool_counts_by_role = tool_matrix.sum(axis=0)
    plt.violinplot([tool_matrix[role] for role in roles])
    plt.xticks(range(1, len(roles) + 1), roles, rotation=45, ha='right')
    plt.ylabel('Tool Usage (0 = Not Used, 1 = Used)')
    plt.title('Distribution of Tool Usage Across Roles')
    plt.tight_layout()
    plt.savefig('tool_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    return tool_matrix, category_matrix

# Execute and create visualizations
tool_matrix, category_matrix = create_advanced_visualizations()

print("Visualizations created:")
print("1. tool_heatmap.png - Heat map showing tool usage across roles")
print("2. category_heatmap.png - Heat map showing category usage across roles")
print("3. tool_cooccurrence.png - Tool co-occurrence matrix for top 20 tools")
print("4. category_analysis.png - Scatter plot of category size vs average usage")
print("5. role_similarity.png - Heat map showing role similarities")
print("6. tool_distribution.png - Violin plot showing tool usage distribution")

KeyError: "None of [Index(['Ruby on Rails', 'Microsoft SQL Server Reporting Services SSRS',\n       'Microsoft Visio', 'IBM Cognos Impromptu', 'NetSuite ERP',\n       'Microsoft SharePoint', 'IBM Domino', 'Puppet', 'Python',\n       'IBM InfoSphere DataStage', 'C++', 'C#', 'C', 'Qlik Tech QlikView',\n       'Quest Erwin Data Modeler', 'Hypertext markup language HTML',\n       'Enterprise JavaBeans', 'IBM Power Systems software',\n       'Blackboard software', 'Blackbaud The Raiser's Edge'],\n      dtype='object')] are in the [index]"

<Figure size 1500x1500 with 0 Axes>

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

def analyze_job_roles():
    # Read files
    files = [
        'NetworkandComputerSystemsAdministrators.csv',
        'ComputerProgrammers.csv',
        'ComputerNetworkArchitects.csv',
        'ComputerandInformationSystemsManagers.csv'
    ]
    
    # Data structures to store analysis
    all_tools = defaultdict(list)
    all_categories = defaultdict(list)
    role_data = {}
    
    # Process each file
    for file in files:
        role = file.replace('.csv', '').split('Computer')[0].strip()
        df = pd.read_csv(file)
        
        # Store processed data for each role
        role_data[role] = {
            'tools': df['Example'].dropna().tolist(),
            'categories': df['Category'].dropna().unique().tolist(),
            'raw_data': df
        }
        
        # Build tool and category frequency data
        for tool in role_data[role]['tools']:
            all_tools[tool].append(role)
        for category in role_data[role]['categories']:
            all_categories[category].append(role)
    
    # Convert to DataFrames for analysis
    tools_df = pd.DataFrame([
        {'Tool': tool, 'Frequency': len(roles), 'Roles': '; '.join(roles)}
        for tool, roles in all_tools.items()
    ]).sort_values('Frequency', ascending=False)
    
    categories_df = pd.DataFrame([
        {'Category': cat, 'Frequency': len(roles), 'Roles': '; '.join(roles)}
        for cat, roles in all_categories.items()
    ]).sort_values('Frequency', ascending=False)
    
    return tools_df, categories_df, role_data

def create_visualizations(tools_df, categories_df, role_data):
    # Set the style for all plots
    plt.style.use('seaborn')
    
    # 1. Tool Distribution Scatter Plot
    plt.figure(figsize=(15, 8))
    plt.scatter(range(len(tools_df)), tools_df['Frequency'], 
                alpha=0.5, c='blue', label='Tools')
    
    # Annotate top tools
    for i in range(10):
        plt.annotate(tools_df.iloc[i]['Tool'],
                    (i, tools_df.iloc[i]['Frequency']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=8)
    
    plt.title('Distribution of Tools Across Roles')
    plt.xlabel('Tool Rank')
    plt.ylabel('Number of Roles Using Tool')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()
    
    # 2. Category Distribution Scatter Plot
    plt.figure(figsize=(15, 8))
    plt.scatter(range(len(categories_df)), categories_df['Frequency'],
                alpha=0.5, c='green', label='Categories')
    
    # Annotate top categories
    for i in range(10):
        plt.annotate(categories_df.iloc[i]['Category'],
                    (i, categories_df.iloc[i]['Frequency']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=8)
    
    plt.title('Distribution of Categories Across Roles')
    plt.xlabel('Category Rank')
    plt.ylabel('Number of Roles Using Category')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()
    
    # 3. Role Similarity Heatmap
    roles = list(role_data.keys())
    similarity_matrix = np.zeros((len(roles), len(roles)))
    
    for i, role1 in enumerate(roles):
        for j, role2 in enumerate(roles):
            tools1 = set(role_data[role1]['tools'])
            tools2 = set(role_data[role2]['tools'])
            similarity_matrix[i, j] = len(tools1.intersection(tools2))
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(similarity_matrix, 
                annot=True, 
                fmt='d',
                xticklabels=roles,
                yticklabels=roles,
                cmap='YlOrRd')
    plt.title('Number of Shared Tools Between Roles')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
    
    # 4. Tools per Category Distribution
    category_tool_counts = {}
    role_usage = {}
    
    for role, data in role_data.items():
        df = data['raw_data']
        for category in data['categories']:
            tools_in_cat = df[df['Category'] == category]['Example'].dropna()
            if category not in category_tool_counts:
                category_tool_counts[category] = len(tools_in_cat)
                role_usage[category] = 1
            else:
                role_usage[category] += 1
    
    plt.figure(figsize=(15, 8))
    plt.scatter(list(category_tool_counts.values()),
                list(role_usage.values()),
                alpha=0.6)
    
    # Annotate interesting points
    for category, tools in category_tool_counts.items():
        roles = role_usage[category]
        if tools > np.mean(list(category_tool_counts.values())) or \
           roles > np.mean(list(role_usage.values())):
            plt.annotate(category,
                        (tools, roles),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=8)
    
    plt.xlabel('Number of Tools in Category')
    plt.ylabel('Number of Roles Using Category')
    plt.title('Category Analysis: Tool Count vs Role Coverage')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # 5. Top Tools Bar Plot
    plt.figure(figsize=(15, 8))
    top_n = 20
    plt.bar(range(top_n), tools_df['Frequency'].head(top_n), alpha=0.6)
    plt.xticks(range(top_n), tools_df['Tool'].head(top_n), rotation=45, ha='right')
    plt.title(f'Top {top_n} Most Common Tools')
    plt.xlabel('Tool Name')
    plt.ylabel('Number of Roles Using Tool')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    return tools_df.head(10), categories_df.head(10)

# Run the analysis
tools_df, categories_df, role_data = analyze_job_roles()
top_tools, top_categories = create_visualizations(tools_df, categories_df, role_data)

print("\nTop 10 Most Common Tools:")
print(top_tools[['Tool', 'Frequency', 'Roles']])
print("\nTop 10 Most Common Categories:")
print(top_categories[['Category', 'Frequency', 'Roles']])

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)