# Visualize Results

In [1]:
import math
import random

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.interpolate import splrep, splev
%matplotlib inline

## 1. Plot Contributors
Input: '/data2/zihe/data/OSS-census/contributor_by_win/*.csv' (copied to './contributor/data/')<br>
Output: Active contributors by gender by window, with the female ratio among all and among core contributors, one figure per language: './contributor/graph/'

In [66]:
def plot_contributors(lang):
    """Plot quarterly active contributors by gender for ``lang`` and save it.

    Reads ``./contributor/data/<lang>.csv`` (columns used: ``win``,
    ``female_all``, ``male_all``, ``all_all``, ``female_core``, ``male_core``)
    and keeps the rows whose ``win`` is at most 45.  The left axis shows
    overlaid bars, in thousands, for all, female + male and female
    contributors; the right axis shows the female ratio F/(F+M) among all
    contributors and among core contributors.

    The figure is written to ``./contributor/graph/<lang>_contributor.pdf``
    and ``.png``.  Nothing is returned.

    Args:
        lang: Language name; used for the CSV file name, the figure title and
            the output file names.
    """
    max_win = 45

    def female_ratio(female, male, min_total):
        # A zero male count is treated as one so an empty window never
        # divides 0 by 0.
        total = female + male.replace(0, 1)
        # Windows with too few gender-identified people are plotted as 0.
        return (female / total).mask(total <= min_total, 0)

    # Import data; malformed CSV rows are skipped silently.
    dat = pd.read_csv('./contributor/data/' + lang + '.csv', on_bad_lines='skip', index_col=False)
    dat = dat[dat['win'] <= max_win]

    # Change window to date.  Window n ends 3*n months after the start of
    # 2008 (window 1 -> March 2008); windows ending in March are labelled
    # with the bare year.  The year comes from the zero-based month so that a
    # window ending in December keeps its own year ("2008-12", not "2009-12").
    wins = []
    for win in dat["win"]:
        year_offset, month_index = divmod(3 * int(win) - 1, 12)
        year = 2008 + year_offset
        month = month_index + 1
        if month == 3:
            wins.append("{}".format(year))
        else:
            wins.append("{}-{}".format(year, month))

    # rcParams only apply to figures created afterwards, so set the size
    # before creating this figure.
    plt.rcParams["figure.figsize"] = (10, 7)
    fig, ax = plt.subplots()

    # Plot information ('#' must be escaped inside the math-text title).
    title = r"C\#" if lang == "C#" else lang
    ax.set_title("Active Contributors in " + r"$\bf{" + title + "}$" + " Public Projects", fontsize=18)
    ax.set_xlabel('Time (quarter)', fontsize=16)
    ax.set_ylabel('Number of Contributors (thousand)', fontsize=16, labelpad=5.0)

    # Plot contributor number bars.  They are drawn over one another, tallest
    # group first, so each colour shows only the part not covered by the next.
    everyone_k = dat["all_all"] / 1000
    identified_k = (dat["female_all"] + dat["male_all"]) / 1000
    female_k = dat["female_all"] / 1000
    ax.bar(wins, everyone_k, label="All unknown", color="blanchedalmond")
    ax.bar(wins, identified_k, label="All male", color="#f29d4b")
    ax.bar(wins, female_k, label="All female", color="#de2d26")
    ax.legend(loc=(0.02, 0.80), fontsize=12, frameon=False)

    # Plot ratio lines for female among all and among core contributors.
    ax2 = ax.twinx()
    ratio_all = female_ratio(dat["female_all"], dat["male_all"], 5)
    ax2.plot(wins, ratio_all, color="darkblue", marker="o", markerfacecolor='white',
             markeredgecolor='darkblue', markeredgewidth=1, label="Among all")
    ratio_core = female_ratio(dat["female_core"], dat["male_core"], 10)
    ax2.plot(wins, ratio_core, color="darkblue", marker="s",
             markeredgecolor='darkblue', markeredgewidth=1, label="Among core")
    ax2.set_ylabel('Female Ratio = F/(F+M)', fontsize=16, labelpad=25.0).set_rotation(270)

    # Set x ticks: every fourth bar, by bar position on the categorical axis.
    tick_positions = np.arange(0, len(wins), 4)
    ax.set_xticks(tick_positions)
    ax2.set_xticks(tick_positions)

    # Set y ticks: left axis rounded up to a multiple of 60 (thousand) and
    # split into six intervals.
    upper = math.ceil(max(dat["all_all"]) / 60000) * 60
    ax.set_yticks(np.arange(0, upper * 7 / 6, upper / 6))
    ax.set_ylim(0, upper)
    ax2.set_yticks(np.arange(0, 0.19, 0.03))
    ax2.set_ylim(0, 0.18)

    # No frame
    for axes in (ax, ax2):
        for side in ('top', 'bottom', 'left', 'right'):
            axes.spines[side].set_visible(False)

    ax2.legend(loc=(0.75, 0.85), fontsize=12, frameon=True, edgecolor="white")

    # Save figure, then close it so repeated calls do not accumulate figures.
    fig.savefig("./contributor/graph/" + lang + "_contributor.pdf", facecolor='white', transparent=False)
    fig.savefig("./contributor/graph/" + lang + "_contributor.png", facecolor='white', transparent=False)
    plt.close(fig)

#plot_contributors("HTML")

In [67]:
# Render and save the contributor figure for every entry in the list.
langs = [
    "JavaScript",
    "Python",
    "Java",
    "Go",
    "Ruby",
    "C++",
    "TypeScript",
    "PHP",
    "C#",
    "C",
    "HTML",
    "CSS",
    "Jupyter",
    "Shell",
    "Objective-C",
    "All",
]
for lang in langs:
    plot_contributors(lang)

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

## 2. Plot Commits
Input: '/data2/zihe/data/OSS-census/commit_by_win/*.csv' (copied to './commit/data/')<br>
Output: Commit count by gender by window: './commit/graph'

In [68]:
def plot_commits(lang):
    """Plot quarterly commit counts by gender for ``lang`` and save the figure.

    Reads ``./commit/data/<lang>.csv`` (columns used: ``win``,
    ``female_commit``, ``male_commit``, ``all_commit``) and keeps the rows
    whose ``win`` is at most 45.  The left axis shows overlaid bars, in
    thousands, for all, female + male and female commits; the right axis
    shows the female commit ratio F/(F+M).

    The figure is written to ``./commit/graph/<lang>_commit.pdf`` and
    ``.png``.  Nothing is returned.

    Args:
        lang: Language name; used for the CSV file name, the figure title and
            the output file names.
    """
    max_win = 45

    # Import data; malformed CSV rows are skipped silently.
    dat = pd.read_csv('./commit/data/' + lang + '.csv', on_bad_lines='skip', index_col=False)
    dat = dat[dat['win'] <= max_win]

    # Change window to date.  Window n ends 3*n months after the start of
    # 2008 (window 1 -> March 2008); windows ending in March are labelled
    # with the bare year.  The year comes from the zero-based month so that a
    # window ending in December keeps its own year ("2008-12", not "2009-12").
    wins = []
    for win in dat["win"]:
        year_offset, month_index = divmod(3 * int(win) - 1, 12)
        year = 2008 + year_offset
        month = month_index + 1
        if month == 3:
            wins.append("{}".format(year))
        else:
            wins.append("{}-{}".format(year, month))

    # rcParams only apply to figures created afterwards, so set the size
    # before creating this figure.
    plt.rcParams["figure.figsize"] = (10, 7)
    fig, ax = plt.subplots()

    # Plot information ('#' must be escaped inside the math-text title).
    title = r"C\#" if lang == "C#" else lang
    ax.set_title("Commits in " + r"$\bf{" + title + "}$" + " Public Projects", fontsize=18)
    ax.set_xlabel('Time (quarter)', fontsize=16)
    ax.set_ylabel('Commit Numbers (thousand)', fontsize=16, labelpad=5.0)

    # Plot commit number bars.  They are drawn over one another, tallest
    # group first, so each colour shows only the part not covered by the next.
    everyone_k = dat["all_commit"] / 1000
    identified_k = (dat["female_commit"] + dat["male_commit"]) / 1000
    female_k = dat["female_commit"] / 1000
    ax.bar(wins, everyone_k, label="unknown", color="blanchedalmond")
    ax.bar(wins, identified_k, label="male", color="#f29d4b")
    ax.bar(wins, female_k, label="female", color="#de2d26")
    ax.legend(loc=(0.02, 0.80), fontsize=12, frameon=False)

    # Plot ratio line for female in all commits.  A zero male count is
    # treated as one so an empty window never divides 0 by 0, and windows
    # with at most 1000 gender-identified commits are plotted as 0.
    ax2 = ax.twinx()
    identified = dat["female_commit"] + dat["male_commit"].replace(0, 1)
    ratio = (dat["female_commit"] / identified).mask(identified <= 1000, 0)
    ax2.plot(wins, ratio, color="darkblue", marker="o", markerfacecolor='white',
             markeredgecolor='darkblue', markeredgewidth=1, label="female ratio")
    ax2.set_ylabel('Female Commit Ratio = F/(F+M)', fontsize=16, labelpad=25.0).set_rotation(270)

    # Set x ticks: every fourth bar, by bar position on the categorical axis.
    tick_positions = np.arange(0, len(wins), 4)
    ax.set_xticks(tick_positions)
    ax2.set_xticks(tick_positions)

    # Set y ticks: left axis rounded up to a multiple of 600 (thousand) and
    # split into six intervals.
    upper = math.ceil(max(dat["all_commit"]) / 600000) * 600
    ax.set_yticks(np.arange(0, upper * 7 / 6, upper / 6))
    ax.set_ylim(0, upper)
    ax2.set_yticks(np.arange(0, 0.36, 0.05))
    ax2.set_ylim(0, 0.35)

    # No frame
    for axes in (ax, ax2):
        for side in ('top', 'bottom', 'left', 'right'):
            axes.spines[side].set_visible(False)

    ax2.legend(loc=(0.75, 0.85), fontsize=12, frameon=True, edgecolor="white")

    # Save figure, then close it so repeated calls do not accumulate figures.
    fig.savefig("./commit/graph/" + lang + "_commit.pdf", facecolor='white', transparent=False)
    fig.savefig("./commit/graph/" + lang + "_commit.png", facecolor='white', transparent=False)
    plt.close(fig)
    
#plot_commits("JavaScript")

In [69]:
# Render and save the commit figure for every entry in the list.
langs = [
    "JavaScript",
    "Python",
    "Java",
    "Go",
    "Ruby",
    "C++",
    "TypeScript",
    "PHP",
    "C#",
    "C",
    "HTML",
    "CSS",
    "Jupyter",
    "Shell",
    "Objective-C",
    "All",
]
for lang in langs:
    plot_commits(lang)

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

## 3. Plot Projects
Input: '/data2/zihe/data/OSS-census/proj_by_win/full.csv' (copied to './project/data/')<br>
Output: All active project count by window: './project/graph/'

In [98]:
def proj_count(lang):
    """Plot quarterly active public projects for ``lang`` and save the figure.

    Reads ``./project/data/full.csv`` (columns used: ``win``, ``<lang>_all``
    and ``<lang>_fem``) and keeps the rows whose ``win`` is at most 45.  The
    left axis shows overlaid bars, in thousands, for all projects and for the
    ``<lang>_fem`` projects (legend "Has female"); the right axis shows
    their ratio ``<lang>_fem / <lang>_all``.

    The figure is written to ``./project/graph/<lang>_project.pdf`` and
    ``.png``.  Nothing is returned.

    Args:
        lang: Language name; selects the CSV columns and is used in the
            figure title and the output file names.
    """
    max_win = 45

    # Import data; malformed CSV rows are skipped silently.
    dat = pd.read_csv('./project/data/full.csv', on_bad_lines='skip', index_col=False)
    dat = dat[dat['win'] <= max_win]
    all_projects = dat[lang + "_all"]
    female_projects = dat[lang + "_fem"]

    # Change window to date.  Window n ends 3*n months after the start of
    # 2008 (window 1 -> March 2008); windows ending in March are labelled
    # with the bare year.  The year comes from the zero-based month so that a
    # window ending in December keeps its own year ("2008-12", not "2009-12").
    new_x = []
    for win in dat["win"]:
        year_offset, month_index = divmod(3 * int(win) - 1, 12)
        year = 2008 + year_offset
        month = month_index + 1
        if month == 3:
            new_x.append("{}".format(year))
        else:
            new_x.append("{}-{}".format(year, month))

    # rcParams only apply to figures created afterwards, so set the size
    # before creating this figure.
    plt.rcParams["figure.figsize"] = (10, 7)
    fig, ax = plt.subplots()

    # Plot information ('#' is escaped inside the math-text title, as the
    # other plotting functions in this notebook do).
    title = r"C\#" if lang == "C#" else lang
    ax.set_title("Active Public Projects in " + r"$\bf{" + title + "}$" + " Ecosystem", fontsize=18)
    ax.set_xlabel('Time (quarter)', fontsize=16)
    ax.set_ylabel('Active Public Project Numbers (thousand)', fontsize=16, labelpad=5.0)

    # Plot left y axis; the second bar series is drawn over the first.
    ax.bar(new_x, all_projects / 1000, label="All", color="#f29d4b")
    ax.bar(new_x, female_projects / 1000, label="Has female", color="#de2d26")

    # Plot ratio line for projects with female contributors among all
    # projects.  Windows with at most 5 projects are plotted as 0, which also
    # covers the 0/0 case.
    ax2 = ax.twinx()
    ratio = (female_projects / all_projects).mask(all_projects <= 5, 0)
    ax2.plot(new_x, ratio, color="darkblue", marker="o", markerfacecolor='white',
             markeredgecolor='darkblue', markeredgewidth=1, label="female ratio")
    # This is a project ratio, not a commit ratio.
    ax2.set_ylabel('Ratio of Projects with Female Contributors', fontsize=16, labelpad=25.0).set_rotation(270)

    # Set x ticks: every fourth bar, by bar position on the categorical axis.
    tick_positions = np.arange(0, len(new_x), 4)
    ax.set_xticks(tick_positions)
    ax2.set_xticks(tick_positions)

    # Set y ticks: left axis rounded up to a multiple of 50 (thousand) and
    # split into five intervals.
    upper = math.ceil(max(all_projects) / 50000) * 50
    ax.set_yticks(np.arange(0, upper * 6 / 5, upper / 5))
    ax.set_ylim(0, upper)
    ax2.set_yticks(np.arange(0, 0.36, 0.05))
    ax2.set_ylim(0, 0.35)

    # No frame
    for axes in (ax, ax2):
        for side in ('top', 'bottom', 'left', 'right'):
            axes.spines[side].set_visible(False)

    # One legend for both axes: a legend created on the twin axis alone would
    # list only the ratio line and omit the bar series.
    bar_handles, bar_labels = ax.get_legend_handles_labels()
    line_handles, line_labels = ax2.get_legend_handles_labels()
    ax2.legend(bar_handles + line_handles, bar_labels + line_labels,
               loc="upper left", fontsize=12, frameon=False)

    # Save figure, then close it so repeated calls do not accumulate figures.
    fig.savefig("./project/graph/" + lang + "_project.pdf", facecolor='white', transparent=False)
    fig.savefig("./project/graph/" + lang + "_project.png", facecolor='white', transparent=False)
    plt.close(fig)

In [99]:
# Render and save the project figure for every entry in the list.
langs = [
    "JavaScript",
    "Python",
    "Java",
    "Go",
    "Ruby",
    "C++",
    "TypeScript",
    "PHP",
    "C#",
    "C",
    "HTML",
    "CSS",
    "Jupyter",
    "Shell",
    "Objective-C",
    "All",
]
for lang in langs:
    proj_count(lang)

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

In [101]:
# Bar chart of newly created projects per quarter, read from
# './project/data/proj_create_win.csv' (columns used: 'win', 'new_all').
# Malformed CSV rows are skipped silently.
dat = pd.read_csv('./project/data/proj_create_win.csv', on_bad_lines='skip', index_col=False)
max_win = 45
dat = dat[(dat['win'] > 0) & (dat['win'] <= max_win)]

# rcParams only apply to figures created afterwards, so set the size before
# creating this figure.
plt.rcParams["figure.figsize"] = (10, 7)

# Plot information
fig, ax = plt.subplots()
ax.set_title("Number of Newly Created Projects on GHTorrent", fontsize=18)
ax.set_xlabel('Time (quarter)', fontsize=16)
ax.set_ylabel('Number of Projects', fontsize=16, labelpad=5.0)

# No Frame
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(False)

# Change window to date.  Window n ends 3*n months after the start of 2008
# (window 1 -> March 2008); windows ending in March are labelled with the
# bare year.  The year comes from the zero-based month so that a window
# ending in December keeps its own year ("2008-12", not "2009-12").
new_x = []
for win in dat["win"]:
    year_offset, month_index = divmod(3 * int(win) - 1, 12)
    year = 2008 + year_offset
    month = month_index + 1
    if month == 3:
        new_x.append("{}".format(year))
    else:
        new_x.append("{}-{}".format(year, month))
x = new_x  # both names were defined by the original cell; keep them in sync

y = dat["new_all"]
ax.bar(new_x, y, label="All newly created", color="#f29d4b")
ax.legend(loc="upper left", fontsize=12, frameon=False)

# Set ticks: every fourth bar, by bar position on the categorical axis.
ax.set_xticks(np.arange(0, len(new_x), 4))

# Original Graph.  For a log-scaled variant, call ax.set_yscale('log') before
# saving under a different file name.
fig.savefig("./project/graph/All_New_Created_Project_Orig.pdf", facecolor='white', transparent=False)
fig.savefig("./project/graph/All_New_Created_Project_Orig.png", facecolor='white', transparent=False)

# Close the figure so it is released instead of lingering as an empty canvas.
plt.close(fig)

<Figure size 720x504 with 0 Axes>

## 4. Plot Ties
Input: '/data2/zihe/data/network/gender_homophily/*.csv' (copied to './tie/data')<br>
Output: Tie distribution count by window: './tie/graph'

In [198]:
def plot_ties(lang):
    """Plot the share of female-related ties per quarter for ``lang``.

    Reads ``./tie/data/<lang>.csv`` (columns used: ``win``, ``m_m``, ``f_m``,
    ``f_f``) and keeps the rows whose ``win`` is at most 45.  The left axis
    shows, as a percentage of all m_m + f_m + f_f ties, the female-related
    ties (f_m + f_f) with the f_f ties drawn over them; the right axis shows
    the ratio f_f / (f_f + f_m).

    Percentages use the true tie total.  A window with no ties at all is
    plotted as 0 rather than dividing by zero.

    The figure is written to ``./tie/graph/<lang>_tie.pdf`` and ``.png``.
    Nothing is returned.

    Args:
        lang: Language name; used for the CSV file name, the figure title and
            the output file names.
    """
    max_win = 45

    # Import data; malformed CSV rows are skipped silently.
    dat = pd.read_csv('./tie/data/' + lang + '.csv', on_bad_lines='skip', index_col=False)
    dat = dat[dat['win'] <= max_win]

    # Change window to date.  Window n ends 3*n months after the start of
    # 2008 (window 1 -> March 2008); windows ending in March are labelled
    # with the bare year.  The year comes from the zero-based month so that a
    # window ending in December keeps its own year ("2008-12", not "2009-12").
    wins = []
    for win in dat["win"]:
        year_offset, month_index = divmod(3 * int(win) - 1, 12)
        year = 2008 + year_offset
        month = month_index + 1
        if month == 3:
            wins.append("{}".format(year))
        else:
            wins.append("{}-{}".format(year, month))

    # rcParams only apply to figures created afterwards, so set the size
    # before creating this figure.
    plt.rcParams["figure.figsize"] = (10, 7)
    fig, ax = plt.subplots()

    # Plot information ('#' must be escaped inside the math-text title).
    title = r"C\#" if lang == "C#" else lang
    ax.set_title("Gender-Identifiable Tie Count in " + r"$\bf{" + title + "}$" + " Public Projects", fontsize=18)
    ax.set_xlabel('Time (quarter)', fontsize=16)
    ax.set_ylabel('Female-Related Tie Percentage', fontsize=16, labelpad=5.0)

    # Plot tie percentage bars.  Only windows whose total is really zero are
    # guarded: substituting 1 for a zero m_m count would shrink the
    # percentages of windows that do have female ties.
    female_ties = dat["f_m"] + dat["f_f"]
    total = dat["m_m"] + female_ties
    safe_total = total.where(total > 0)  # NaN where there are no ties
    f_m = (female_ties / safe_total * 100).fillna(0)
    f_f = (dat["f_f"] / safe_total * 100).fillna(0)
    ax.bar(wins, f_m, label="female-male", color="#f29d4b")
    ax.bar(wins, f_f, label="female-female", color="#de2d26")
    ax.legend(loc=(0.02, 0.85), fontsize=12, frameon=False)

    # Plot ratio of female-female ties among female-related ties; windows
    # without any female-related tie are plotted as 0.
    ax2 = ax.twinx()
    ratio = (dat["f_f"] / female_ties.where(female_ties > 0)).fillna(0)
    ax2.plot(wins, ratio, color="darkblue", marker="o", markerfacecolor='white',
             markeredgecolor='darkblue', markeredgewidth=3, label="Among all")
    ax2.set_ylabel('Female Ratio = f-f / (f-f + f-m)', fontsize=16, labelpad=25.0).set_rotation(270)

    # Set x ticks: every fourth bar, by bar position on the categorical axis.
    tick_positions = np.arange(0, len(wins), 4)
    ax.set_xticks(tick_positions)
    ax2.set_xticks(tick_positions)

    # Set y ticks: left axis rounded up to a multiple of 30 (percent), never
    # below 30 so the tick step cannot be zero.
    upper = max(30, math.ceil(f_m.max() / 30) * 30)
    ax.set_yticks(np.arange(0, upper * 7 / 6, upper / 6))
    ax.set_ylim(0, upper)
    ax2.set_yticks(np.arange(0, 0.31, 0.03))
    ax2.set_ylim(0, 0.30)

    ax2.legend(loc=(0.6, 0.85), fontsize=12, frameon=False)

    # No frame
    for axes in (ax, ax2):
        for side in ('top', 'bottom', 'left', 'right'):
            axes.spines[side].set_visible(False)

    # Save figure, then close it so repeated calls do not accumulate figures.
    fig.savefig("./tie/graph/" + lang + "_tie.pdf", facecolor='white', transparent=False)
    fig.savefig("./tie/graph/" + lang + "_tie.png", facecolor='white', transparent=False)
    plt.close(fig)


In [199]:
# Render and save the tie figure for every entry in the list.
langs = [
    "JavaScript",
    "Python",
    "Java",
    "Go",
    "Ruby",
    "C++",
    "TypeScript",
    "PHP",
    "C#",
    "C",
    "HTML",
    "CSS",
    "Jupyter",
    "Shell",
    "Objective-C",
]
for lang in langs:
    plot_ties(lang)

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>

## 5. Plot Names
Input: '/data2/zihe/data/OSS-census/top_names.csv' (copied to './name/')<br>
Output: Most common developer names by gender: './name'

In [16]:
def plot_name(gender, top_n=15):
    """Plot the first ``top_n`` developer names for ``gender`` and save it.

    Reads ``./name/<gender>.csv`` (columns used: ``name``, ``count``) and
    draws one bar per name for the first ``top_n`` rows, in file order.  The
    file is presumably sorted by descending count; verify against the data.
    If the file has fewer rows, all of them are plotted.

    The figure is written to ``./name/<gender>_name.png`` and ``.pdf``.
    Nothing is returned.

    Args:
        gender: ``"Female"`` or ``"Male"``; used for the CSV file name, the
            title, the output file names and the y-axis tick spacing.
        top_n: Number of names to plot.  Defaults to 15 to match the title
            wording; the title follows this value.
    """
    # Import data; malformed CSV rows are skipped silently.
    dat = pd.read_csv('./name/' + gender + '.csv', on_bad_lines='skip', index_col=False)
    top = dat.head(top_n)
    name = top["name"].tolist()
    freq = top["count"].tolist()

    # rcParams only apply to figures created afterwards, so set the size
    # before creating this figure.
    plt.rcParams["figure.figsize"] = (10, 7)
    fig, ax = plt.subplots()
    ax.bar(name, freq, color="#f29d4b")

    ax.set_title(str(top_n) + " Most Common " + r"$\bf{" + gender + "}$" + " Developer Names in GHTorrent", fontsize=18)
    ax.set_xlabel('Name', fontsize=16)
    ax.set_ylabel('Number of Appearance', fontsize=16, labelpad=5.0)

    plt.xticks(rotation=30)
    # Ten tick intervals; the female counts use a ten times finer step.
    ran = 800 if gender == "Female" else 8000
    ax.set_yticks(np.arange(0, 10 * ran + 1, ran))

    # No Frame
    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_visible(False)

    # Save figure, then close it so repeated calls do not accumulate figures.
    fig.savefig("./name/" + gender + "_name.png", facecolor='white', transparent=False)
    fig.savefig("./name/" + gender + "_name.pdf", facecolor='white', transparent=False)
    plt.close(fig)
    
# Render and save the name chart for each gender, female first.
for gender in ("Female", "Male"):
    plot_name(gender)

<Figure size 720x504 with 0 Axes>

<Figure size 720x504 with 0 Axes>