In [None]:
from collections import Counter

import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Use matplotlib's "dark_background" style for the plots in this notebook
# (avoids burning my eyes at night).
plt.style.use("dark_background")

In [None]:
# Path to the survey results CSV (relative path, resolved against the
# current working directory).
FILE = "data/survey_results_public.csv"
# Load the whole CSV into a single DataFrame; the cells below read from so_df.
so_df = pd.read_csv(FILE)

# List the column names so the available survey fields are visible.
print(so_df.keys())
# Summary statistics of the frame; this is the cell's final expression, so
# the notebook renders its result.
so_df.describe()

# Debug aid (disabled): peek at the first three rows.
# print(so_df[:3])

In [None]:
# get popularity of different programming languages

# Survey columns related to programming languages:
# LanguageHaveWorkedWith, LanguageWantToWorkWith, LanguageAdmired, LanguageDesired

# draw horizontal bar plot
# https://seaborn.pydata.org/examples/part_whole_bars.html

# draw as strip chart
# https://seaborn.pydata.org/generated/seaborn.stripplot.html#seaborn.stripplot

def get_langs(dataset, key="LanguageHaveWorkedWith"):
    """Count how often each language occurs in a semicolon-separated column.

    Parameters
    ----------
    dataset : mapping-like
        Any object supporting ``dataset.keys()`` and ``dataset[key]`` where
        ``dataset[key]`` is iterable (for example a pandas DataFrame).
    key : str
        Name of the column to tally. Every string entry of that column is
        split on ``';'`` and each piece is counted once per occurrence.

    Returns
    -------
    dict
        ``{language: count}`` ordered by descending count; languages with
        equal counts keep the order in which they were first seen.

    Raises
    ------
    AssertionError
        If ``key`` is not one of ``dataset.keys()``.
    """
    assert key in dataset.keys(), "unknown column: %r" % (key,)

    lang_count = Counter()
    for response in dataset[key]:
        # Only string entries are tallied; anything else (such as a NaN or
        # None standing in for a missing answer) is skipped. isinstance()
        # also accepts str subclasses, which an exact type comparison drops.
        if isinstance(response, str):
            lang_count.update(response.split(';'))

    # most_common() with no argument returns every (item, count) pair sorted
    # by count, highest first, using a stable sort.
    return dict(lang_count.most_common())

def visualize_langs(langs, langs2, label1 = "condition1", label2 = "condition2"):
    """Draw two language-count mappings as overlaid strip plots on one figure.

    A new 10x15 inch figure is created. Each language gets one row on the
    y axis; its count in ``langs`` is drawn as a light-blue dot and its
    count in ``langs2`` as a red dot on the x axis.

    Parameters
    ----------
    langs : dict
        ``{language: count}`` for the first condition. Its key order sets
        the row order of the plot.
    langs2 : dict
        ``{language: count}`` for the second condition. Languages that are
        not in ``langs`` are appended after the rows of ``langs``.
    label1, label2 : str
        Names of the two conditions, used for the legend entries and the
        title.

    Returns
    -------
    None
        The figure is left as matplotlib's current figure.
    """
    DOT_COLOR1 = "lightblue"
    DOT_COLOR2 = "red"
    BG_COLOR   = "black"

    counts1 = pd.DataFrame(list(langs.items()), columns=['Languages', 'Count'])
    counts2 = pd.DataFrame(list(langs2.items()), columns=['Languages', 'Count'])

    # The two mappings are usually sorted differently. Passing one explicit
    # category order to both calls pins every language to the same row for
    # both series instead of relying on how seaborn merges categories.
    lang_order = list(langs) + [lang for lang in langs2 if lang not in langs]

    _, ax = plt.subplots(figsize=(10, 15))

    # Label each series with the caller's condition names (they were
    # previously hard-coded to the first use case).
    sb.stripplot(x='Count', y='Languages', data=counts1, order=lang_order,
                 size=5, color=DOT_COLOR1, label=label1, jitter=True, ax=ax)
    sb.stripplot(x='Count', y='Languages', data=counts2, order=lang_order,
                 size=5, color=DOT_COLOR2, label=label2, jitter=True, ax=ax)

    # Build the legend from two custom handles so each condition shows up
    # exactly once. linestyle='None' suppresses the line that would
    # otherwise be drawn through the legend marker; color still sets the
    # marker edge colour.
    legend_handles = [
        plt.Line2D(
            [0], [0], marker='o', color=BG_COLOR, linestyle='None',
            label=label, markerfacecolor=dot_color, markersize=10)
        for label, dot_color in ((label1, DOT_COLOR1), (label2, DOT_COLOR2))
    ]
    ax.legend(handles=legend_handles, loc="center right")

    ax.grid(axis='x', linestyle='--', alpha=0.75)
    ax.set_title("%s vs %s" % (label1, label2))

# Tally the default column ("LanguageHaveWorkedWith") and the
# "LanguageWantToWorkWith" column, then plot the two tallies together.
l1 = get_langs( so_df )
l2 = get_langs( so_df, "LanguageWantToWorkWith" )
visualize_langs(l1,l2, label1="have worked with", label2="want to work with")

# Same comparison for "LanguageAdmired" against "LanguageWantToWorkWith".
# NOTE(review): l4 reads the same column as l2, so this repeats that tally.
l3 = get_langs( so_df, "LanguageAdmired")
l4 = get_langs( so_df, "LanguageWantToWorkWith")
visualize_langs(l3, l4, label1="admired", label2="want to work with")

# determine extrinsic vs intrinsic motivation
def get_difference(dict1, dict2):
    """Subtract the counts in ``dict2`` from the counts in ``dict1`` per key.

    Parameters
    ----------
    dict1 : dict
        ``{key: number}``; its keys (and their order) define the result.
    dict2 : dict
        ``{key: number}`` to subtract. A key of ``dict1`` that is absent
        here is treated as 0 instead of raising ``KeyError``.

    Returns
    -------
    dict
        ``{key: dict1[key] - dict2.get(key, 0)}`` for every key of
        ``dict1``. Keys that exist only in ``dict2`` are not included.
    """
    return {key: count - dict2.get(key, 0) for key, count in dict1.items()}
    
# Per language: "want to work with" count (l2) minus "have worked with"
# count (l1).
motiv_diff = get_difference(l2, l1)
print(motiv_diff)

# determine level of hype
# Per language: "admired" count (l3) minus "want to work with" count (l4).
hype = get_difference(l3, l4)
print(hype)


In [None]:
# Count how many times each distinct value occurs in the "MainBranch"
# column and print the tally.
# NOTE(review): the variable name suggests employment status, but the column
# read is "MainBranch" — confirm what that column actually holds.
employment_status = Counter(so_df["MainBranch"])
print(employment_status)

# Peek at the first three entries of the "ConvertedCompYearly" column.
print(so_df["ConvertedCompYearly"][:3])