In [1]:
%matplotlib inline
import json, pandas as pd

In [2]:
# Alias -> canonical-name lookup, read from stemming.txt.
# Each non-blank line is a comma-separated group: the first entry is the
# canonical skill name, every following entry is an alias mapped onto it.
stemming = {}
with open("stemming.txt", 'r') as stemming_f:
    for stem_group in stemming_f:
        stem_tokens = [token.strip() for token in stem_group.split(",")]
        stem_target = stem_tokens[0]
        if not stem_target:
            # Blank line, or a group without a canonical name: nothing to map to.
            continue
        for stem_token in stem_tokens[1:]:
            if stem_token:
                # Lookups below always use the lower-cased skill name, so the
                # keys must be lower-case as well or they can never match.
                stemming[stem_token.lower()] = stem_target


def process_skill_name(name):
    """Return the canonical form of a skill name.

    The name is lower-cased and, if it is a known alias in ``stemming``,
    replaced by its canonical target; otherwise the lower-cased name itself
    is returned.
    """
    lowered = name.lower()
    return stemming.get(lowered, lowered)

In [3]:
# Raw per-job skill data, one JSON document for hard skills and one for soft skills.
with open("personal_track/skills_hard.json", 'r') as hard_skills_file:
    raw_hard_skills = json.load(hard_skills_file)

with open("personal_track/skills_soft.json", 'r') as soft_skills_file:
    raw_soft_skills = json.load(soft_skills_file)

In [4]:
# Flatten the raw hard/soft skill dictionaries into one column-oriented table:
# one entry per (job, skill mention), with the skill name normalised through
# process_skill_name so that aliases collapse onto the same name.
clean_jsf = {"job": [], "job_rating": [], "skill_type": [], "skill_name": [], "skill_freq": []}

for skill_type, skills in {"hard": raw_hard_skills, "soft": raw_soft_skills}.items():
    for job, job_data in skills.items():
        for skill_name, frequency in job_data["tasks_and_frequencies"].items():
            clean_jsf["job"].append(job)
            clean_jsf["job_rating"].append(job_data["rating"])
            clean_jsf["skill_type"].append(skill_type)
            clean_jsf["skill_name"].append(process_skill_name(skill_name))
            clean_jsf["skill_freq"].append(int(frequency))

job_df = pd.DataFrame(clean_jsf).astype(
    {"job": "category", "skill_name": "category", "skill_type": "category"}
)

# Merge duplicate (job, skill) rows created by the name normalisation.
# observed=True keeps only combinations that actually occur; without it the
# categorical keys expand to the full job x skill Cartesian product, which
# then has to be thrown away again by dropna().
job_df = (
    job_df.groupby(["job", "skill_name"], observed=True)
    .agg({"skill_freq": "sum", "job_rating": "first", "skill_type": "first"})
    .reset_index()
    .dropna()  # still drops rows whose rating is missing
)

# Recommending category weights

In [5]:
from CurriculumVitae import categories

In [6]:
# Constant offset per CV category; it is added to each job's summed category
# frequency when the export is built (categories not listed here get 0).
cat_bias = {
    "Writing": 1,
    "Code": 1,
    "Project": 0.1,
    "Soft": 2,
}

In [7]:
# Invert `categories` (category -> skills) into normalised skill name -> set of
# categories, so that a skill listed under several categories keeps all of them.
cat_inverse = {}
for category_name, member_skills in categories.items():
    for raw_skill in member_skills:
        canonical_skill = process_skill_name(raw_skill)
        if canonical_skill not in cat_inverse:
            cat_inverse[canonical_skill] = set()
        cat_inverse[canonical_skill].add(category_name)

In [8]:
# Attach CV categories to every (job, skill) row and split the skill's
# frequency evenly between the categories it belongs to.
jcw = job_df.copy()

# Skills that appear in no category are tagged "UNK". The categories are sorted
# so that the row order produced by explode() is the same on every run
# (iterating a set of strings is not order-stable between interpreter sessions).
jcw["skill_category"] = pd.Series(
    [sorted(cat_inverse.get(skill_name, {"UNK"})) for skill_name in jcw["skill_name"]],
    index=jcw.index,
    dtype=object,
)

# Column-wise division instead of a row-by-row DataFrame.apply.
jcw["skill_freq"] = jcw["skill_freq"] / jcw["skill_category"].map(len)

# One row per (job, skill, category).
jcw = jcw.explode("skill_category")

In [9]:
# Sum the split skill frequencies per (job, category), drop the "UNK" bucket,
# and make skill_category a categorical whose categories are exactly the keys
# of `categories`. Built as a single chain so no column is ever assigned on a
# filtered slice (the pattern that triggers SettingWithCopyWarning).
job_category_weights = (
    jcw.groupby(["job", "skill_category"])
    .agg({"skill_freq": "sum"})
    .reset_index()
    .loc[lambda frame: frame["skill_category"] != "UNK"]
    .assign(
        skill_category=lambda frame: pd.Categorical(
            frame["skill_category"], categories=list(categories.keys())
        )
    )
)

In [10]:
# jcw.loc[jcw["job"] == "AFRY"].sort_values("skill_freq", ascending=False)

In [11]:
# Build one export record per job: {"name": job, "exports": {category: weight}},
# where weight is the job's summed frequency for that category (0 if the job
# has no row for it) plus the category's bias.
jobcat_args = []
all_categories = job_category_weights["skill_category"].cat.categories
for job in job_category_weights["job"].cat.categories:
    rows_for_job = job_category_weights.loc[job_category_weights["job"] == job]
    exports = {}
    for cat in all_categories:
        matching_freqs = rows_for_job.loc[rows_for_job["skill_category"] == cat, "skill_freq"].values
        # Take the first matching frequency, or 0 when the category is absent.
        base_weight = matching_freqs[0] if len(matching_freqs) else 0
        exports[cat] = base_weight + cat_bias.get(cat, 0)

    jobcat_args.append({"name": job, "exports": exports})

In [12]:
# Persist the per-job category weights as JSON.
with open("personal_track/jobs_cats.json", 'w+') as export_file:
    json.dump(jobcat_args, export_file)

# Skill weights and integrated-skills checklist

In [None]:
from IPython.display import display, Markdown

In [None]:
# Total skill mentions per job (kept as its own frame for inspection).
job_total_skills = job_df.groupby(["job"]).agg({"skill_freq": "sum"})

# Share of each skill within its job. transform("sum") broadcasts the per-job
# total back onto every row, which replaces a row-wise apply that returned a
# one-element Series per row (and therefore assigned a DataFrame to the column).
job_df["skill_share"] = (
    job_df["skill_freq"] / job_df.groupby("job", observed=True)["skill_freq"].transform("sum")
)


def rate_weight(r):
    """Return the weight of a job (row or whole frame): its rating squared."""
    return r["job_rating"]**2


job_df["skill_weighted"] = job_df["skill_share"] * rate_weight(job_df)

# Sum the rating-weighted shares per skill, highest first. observed=True keeps
# only (skill_type, skill_name) pairs that actually occur instead of padding
# the result with zero-weight rows for every combination of the two categoricals.
skill_weights = (
    job_df.groupby(["skill_type", "skill_name"], observed=True)
    .agg({"skill_weighted": "sum"})
    .reset_index()
    .sort_values("skill_weighted", ascending=False, ignore_index=True)
)
# Weight relative to the top skill, and as a fraction of the total weight.
skill_weights["skill_of_max"] = skill_weights["skill_weighted"] / skill_weights["skill_weighted"].max()
skill_weights["skill_normed"] = skill_weights["skill_weighted"] / skill_weights["skill_weighted"].sum()

In [None]:
def check_entry(skill, integrated=None):
    """Render one skill as a two-line Markdown checklist entry.

    Parameters
    ----------
    skill : mapping
        Must provide ``"skill_name"`` and ``"skill_normed"`` (a fraction that
        is shown as a percentage with two decimals).
    integrated : container of str, optional
        Names counted as already integrated. When omitted, the notebook-level
        ``integrated_skills`` is used, looked up at call time.

    Returns
    -------
    str
        A checkbox line with the skill name, followed by a tab-indented
        sub-item with the percentage. Integrated skills are ticked and struck
        through (``~~``); the others are unticked and bold (``**``).
    """
    if integrated is None:
        integrated = integrated_skills
    is_integrated = skill["skill_name"] in integrated
    style = '~~' if is_integrated else '**'
    tick = 'X' if is_integrated else ' '
    check = f"- [{tick}] {style}{skill['skill_name']}{style}"
    score = f"- {style}{100*skill['skill_normed']:.2f}%{style}"
    return f"{check}\n\t{score}"
    
    

In [None]:
# Skills already integrated, one name per line in integrated_skills.txt.
with open("integrated_skills.txt", 'r') as int_s:
    listed_skills = [line.strip() for line in int_s]

# skill_weights holds names that went through process_skill_name, so each
# listed name is stored both as written and in its normalised form; otherwise
# an entry with different casing or a known alias would never match.
# Blank lines are skipped.
integrated_skills = set()
for listed_skill in listed_skills:
    if listed_skill:
        integrated_skills.add(listed_skill)
        integrated_skills.add(process_skill_name(listed_skill))

# Fraction of the total normalised skill weight covered by integrated skills.
coverage = skill_weights.loc[skill_weights["skill_name"].isin(integrated_skills), "skill_normed"].sum()

# Markdown checklist of every skill carrying more than 0.1% of the total weight.
checklist = "\n".join(
    check_entry(skill)
    for _, skill in skill_weights.loc[skill_weights["skill_normed"] > 0.001].iterrows()
)
# Markdown(f"# Integrated Skills\nCoverage: {coverage*100:.2f}%\n{checklist}")