# Some utility functions to plot data

In [None]:
# This notebook defines functions used to plot the data from different perspectives.

In [3]:
import matplotlib.pyplot as plt

In [1]:
# Percentage of languages spoken by candidates

# Human-readable labels for the language flag columns. Each constant is named
# after the DataFrame column whose slice it labels in the language pie chart.
LANG_EN = 'English'
LANG_IT = 'Italian'
LANG_FR = 'French'
LANG_SP = 'Spanish'
LANG_DE = 'German'
LANG_CH = 'Chinese'
LANG_JP = 'Japanese'
LANG_IN = 'Indian'
LANG_RU = 'Russian'

# Label for the "LANG_OT" flag column.
LANG_OT = 'Other'


def plot_perc_lang_spoken(df):
    """Show a pie chart of the languages spoken by the candidates.

    For each language flag column the rows whose value equals 1 are counted.
    Slices are drawn from the largest count to the smallest, and the slices
    from the fifth position onward are progressively pulled out of the pie.
    The legend, anchored to the top-left corner of the figure, repeats every
    language together with its share of the summed counts.

    Note that the percentages are shares of the summed flag counts, not of
    the number of rows: a row with several flags set contributes to several
    slices.

    Args:
        df: DataFrame holding the flag columns ``LANG_EN``, ``LANG_IT``,
            ``LANG_FR``, ``LANG_SP``, ``LANG_DE``, ``LANG_CH``, ``LANG_JP``,
            ``LANG_IN``, ``LANG_RU`` and ``LANG_OT``.

    Raises:
        KeyError: If one of the flag columns is missing from ``df``.
        ValueError: If no flag equals 1 in any language column, since the
            shares cannot be computed from an all-zero total.
    """
    # (flag column in df, label shown on the chart)
    flag_labels = (
        ("LANG_EN", LANG_EN),
        ("LANG_IT", LANG_IT),
        ("LANG_FR", LANG_FR),
        ("LANG_SP", LANG_SP),
        ("LANG_DE", LANG_DE),
        ("LANG_CH", LANG_CH),
        ("LANG_JP", LANG_JP),
        ("LANG_IN", LANG_IN),
        ("LANG_RU", LANG_RU),
        ("LANG_OT", LANG_OT),
    )
    lang_count = [
        {"lang": label, "count": int((df[column] == 1).sum())}
        for column, label in flag_labels
    ]

    # Largest slice first; sorted() is stable, so ties keep the order above.
    sorted_lang_count = sorted(lang_count, key=lambda x: x["count"], reverse=True)

    labels = [d['lang'] for d in sorted_lang_count]
    sizes = [d['count'] for d in sorted_lang_count]

    total = sum(sizes)
    if total == 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # raised halfway through drawing.
        raise ValueError(
            "Cannot plot language percentages: no language flag is set to 1."
        )

    # One offset per slice, in sorted order: the four largest slices stay in
    # place and the smaller ones are pulled out further and further.
    explode = (0, 0, 0, 0, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.pie(
        sizes,
        labels=labels,
        autopct='%1.1f%%',
        shadow=False,
        explode=explode,
        startangle=90,
        textprops={'fontsize': 10, 'fontweight': 'bold'},
    )
    ax.legend(
        loc='upper left',
        labels=['%s, %1.1f%%' % (l, (float(s) / total) * 100)
                for l, s in zip(labels, sizes)],
        prop={'size': 12},
        bbox_to_anchor=(0.0, 1),
        bbox_transform=fig.transFigure,
    )
    plt.tight_layout()
    # No further plt.figure() call here: creating a new figure before show()
    # would display an additional, empty figure next to the pie chart.
    plt.show()


In [2]:
# Horizontal bar chart highlighting the different skills owned by candidates.
def plotHorizontalPlotHardSkills(title,skills,values):
    """Show a horizontal bar chart with one annotated bar per skill.

    Args:
        title: Text used both as the plot title (left-aligned) and as the
            semi-transparent caption near the bottom-right of the figure.
        skills: Bar labels, passed to ``barh`` as the y positions.
        values: Bar lengths, one per entry of ``skills``.
    """
    figure, axes = plt.subplots(figsize=(16, 9))
    axes.barh(skills, values)

    # Strip the frame and the tick marks so that only bars and grid remain.
    for side in ('top', 'bottom', 'left', 'right'):
        axes.spines[side].set_visible(False)
    axes.xaxis.set_ticks_position('none')
    axes.yaxis.set_ticks_position('none')

    # Keep the tick labels slightly away from the axes.
    axes.xaxis.set_tick_params(pad=5)
    axes.yaxis.set_tick_params(pad=10)

    axes.grid(color='black', linestyle='-.', linewidth=0.8, alpha=0.2)

    # Flip the y axis so the first entry of `skills` is drawn at the top.
    axes.invert_yaxis()

    # Write each bar's value just past the end of the bar.
    for bar in axes.patches:
        bar_length = bar.get_width()
        plt.text(
            bar_length + 0.2,
            bar.get_y() + 0.5,
            str(round(bar_length, 2)),
            fontsize=14,
            fontweight='bold',
            color='black',
        )

    axes.set_title(title, loc='left')

    # Repeat the title as a watermark-like caption in figure coordinates.
    figure.text(
        0.9, 0.15, title,
        fontsize=16, color='black', ha='right', va='top', alpha=0.7,
    )

    plt.show()

In [None]:
# Preparing the horizontal bar chart (plotHorizontalPlotHardSkills) for the skill categories of IT profiles.
def plot_skills(df):
    """Plot how many candidates own at least one skill in each IT category.

    A row counts for a category when any of the category's flag columns
    equals 1; the count is taken over the ``full_name`` column, so rows with
    a missing ``full_name`` are not counted. The resulting counts are handed
    to ``plotHorizontalPlotHardSkills`` under the title 'Skills Profiles'.

    Args:
        df: DataFrame with a ``full_name`` column and one flag column per
            skill listed in the category table below.
    """
    # (category label shown on the chart, flag columns belonging to it)
    categories = (
        ("Back-End", ("cpp", ".net", "python", "java", "ruby", "react", "php")),
        ("Front-End", ("html", "css", "javascript")),
        ("FrameWork-Middle", ("angular", "rabbitmq", "kubernetes")),
        ("O.S.", ("windows", "linux", "android", "apple")),
        ("Database", ("sql", "nosql", "mongodb", "oracle", "neo4j")),
        ("AI", ("ai", "tensorflow", "nlp")),
        ("Network", ("networking", "security", "cloud")),
        ("General IT", ("test", "team")),
    )

    skills = []
    values = []
    for category, flag_columns in categories:
        # OR together the "flag == 1" masks of every column in the category.
        owns_any = df[flag_columns[0]] == 1
        for column in flag_columns[1:]:
            owns_any = owns_any | (df[column] == 1)

        skills.append(category)
        values.append(df[owns_any]["full_name"].count())

    plotHorizontalPlotHardSkills('Skills Profiles', skills, values)


In [3]:
# Drawing a 2D scatter plot of two data dimensions
def drawScatterPlot(dataDim1,dataDim2,titlePlot,titleX,titleY):
    """Show a 2D scatter plot on a light grey background.

    Args:
        dataDim1: Values plotted along the x axis.
        dataDim2: Values plotted along the y axis.
        titlePlot: Title of the plot.
        titleX: Label of the x axis (rendered in bold).
        titleY: Label of the y axis (rendered in bold).
    """
    # Create the axes first so the background colour applies to the scatter.
    plt.axes().set_facecolor("#e3e3e3")

    plt.scatter(x=dataDim1, y=dataDim2, edgecolors='black')

    plt.title(titlePlot)
    plt.xlabel(titleX, fontweight='bold')
    plt.ylabel(titleY, fontweight='bold')

    plt.show()

In [1]:
# Plotting clusters using a scatter plot
def plot_clusters(data,labels,centroids,title,titleX,titleY):
    """Show clustered points as a scatter plot coloured by cluster label.

    Args:
        data: Object indexed positionally through ``.iloc``; its first two
            columns are used as the x and y coordinates.
        labels: Per-point values passed to ``scatter`` as colours; the legend
            entries are derived from them.
        centroids: Optional 2-D array (it must expose ``.size`` and support
            ``[:, k]`` indexing, e.g. a NumPy array) whose first two columns
            are drawn as larger red markers. Skipped when ``None`` or empty.
        title: Title of the plot.
        titleX: Label of the x axis (rendered in bold).
        titleY: Label of the y axis (rendered in bold).
    """
    axes = plt.axes()
    axes.set_facecolor("#e3e3e3")

    plt.title(title)
    plt.xlabel(titleX, fontweight='bold')
    plt.ylabel(titleY, fontweight='bold')

    points = plt.scatter(
        x=data.iloc[:, 0],
        y=data.iloc[:, 1],
        c=labels,
        s=50,
        edgecolors='black',
    )

    has_centroids = centroids is not None and centroids.size > 0
    if has_centroids:
        plt.scatter(
            x=centroids[:, 0],
            y=centroids[:, 1],
            c="red",
            s=75,
            edgecolors='black',
        )

    # One legend entry per distinct label, placed to the right of the axes.
    handles, names = points.legend_elements()
    plt.legend(
        handles,
        names,
        title="Clusters",
        loc='center left',
        bbox_to_anchor=(1, 0.5),
    )
    plt.show()

In [None]:
def plotPercentagesLanguagesForEachCluster(df,raggr_name):
    """Plot per-cluster language counts and return them as percentages.

    The rows of ``df`` are grouped by ``raggr_name`` and the ``Total`` and
    language flag columns are summed per group. The summed language counts
    are drawn as a grouped bar chart, then each count is expressed as a
    percentage of the group's ``Total``, rounded to two decimals.

    Args:
        df: DataFrame with the grouping column(s), a ``Total`` column and the
            flag columns ``LANG_EN``, ``LANG_IN``, ``LANG_OT``, ``LANG_SP``,
            ``LANG_FR``, ``LANG_DE`` and ``LANG_CH``.
        raggr_name: Column name (or anything accepted by
            ``DataFrame.groupby``) identifying the cluster of each row.

    Returns:
        DataFrame indexed by group with the columns ``Total`` and
        ``<language>%`` for every language listed above. Groups whose
        ``Total`` sums to 0 get pandas' division-by-zero result (inf/NaN).
    """
    lang_columns = ["LANG_EN", "LANG_IN", "LANG_OT", "LANG_SP",
                    "LANG_FR", "LANG_DE", "LANG_CH"]

    # Select the needed columns BEFORE summing, and aggregate only once:
    # summing every column of df is wasted work and fails on columns that
    # cannot be summed (e.g. datetimes) even though they are never used.
    grouped_df = df.groupby(raggr_name)[["Total"] + lang_columns].sum()

    grouped_df[lang_columns].plot(kind="bar", figsize=(15, 8))

    pct_columns = []
    for column in lang_columns:
        pct_column = column + "%"
        grouped_df[pct_column] = (
            (grouped_df[column] * 100) / grouped_df["Total"]
        ).round(decimals=2)
        pct_columns.append(pct_column)

    return grouped_df[["Total"] + pct_columns]

In [None]:
def plotPercentagesSkillsForEachCluster(df,raggr_name):
    """Plot per-cluster skill counts and return them as percentages.

    The rows of ``df`` are grouped by ``raggr_name`` and the ``Total`` and
    skill category columns are summed per group. The summed category counts
    are drawn as a grouped bar chart, then each count is expressed as a
    percentage of the group's ``Total``, rounded to two decimals.

    Args:
        df: DataFrame with the grouping column(s), a ``Total`` column and the
            category columns ``Backend``, ``Frontend``, ``MiddleWare``,
            ``OS``, ``Database``, ``AI``, ``Networks`` and ``General``.
        raggr_name: Column name (or anything accepted by
            ``DataFrame.groupby``) identifying the cluster of each row.

    Returns:
        DataFrame indexed by group with the columns ``Total`` and
        ``<category>%`` for every category listed above, ``Backend%``
        included. Groups whose ``Total`` sums to 0 get pandas'
        division-by-zero result (inf/NaN).
    """
    skill_columns = ["Backend", "Frontend", "MiddleWare", "OS",
                     "Database", "AI", "Networks", "General"]

    # Select the needed columns BEFORE summing, and aggregate only once:
    # summing every column of df is wasted work and fails on columns that
    # cannot be summed (e.g. datetimes) even though they are never used.
    grouped_df = df.groupby(raggr_name)[["Total"] + skill_columns].sum()

    grouped_df[skill_columns].plot(kind="bar", figsize=(15, 8))

    # Every plotted category gets a percentage column. "Backend" used to be
    # plotted but left out of the returned percentages.
    pct_columns = []
    for column in skill_columns:
        pct_column = column + "%"
        grouped_df[pct_column] = (
            (grouped_df[column] * 100) / grouped_df["Total"]
        ).round(decimals=2)
        pct_columns.append(pct_column)

    return grouped_df[["Total"] + pct_columns]