In [12]:
from info.all_info import get_clean_records_for_india, get_raw_records

import pandas as pd
import altair as alt
import numpy as np

In [10]:
# Load the cleaned India records and keep only the columns this notebook uses.
df = get_clean_records_for_india()
df = df[["href", "post_title", "date", "post", "company",
         "title", "yoe", "salary_total", "location"]]

# prep formatted columns
# Drop rows whose salary_total is -1 (presumably a "missing" sentinel -- TODO
# confirm against info.all_info). The explicit .copy() makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
df = df.loc[df["salary_total"] != -1, :].copy()
# 1_00_000 == 100_000, i.e. one lakh written with Indian digit grouping.
df["lpa"] = df["salary_total"] / 1_00_000
# pd.cut bins are right-closed: (-0.1, 1], (1, 3], ... The -0.1 lower edge
# keeps yoe == 0 inside the "0-1" bucket; values above 100 become NaN.
df["Years of Experience (bucket)"] = pd.cut(
    df["yoe"],
    bins=[-0.1, 1, 3, 6, 9, 15, 100],
    labels=["0-1", "1-3", "3-6", "6-9", "9-15", "15+"],
)
# Reassign rather than mutate in place so the cell reads as a plain pipeline.
df = df.rename(columns={"yoe": "Years of Experience"})
df["company"] = df["company"].str.upper()
df["title"] = df["title"].str.upper()

# salary percentiles
# One vectorised call yields the 75th / 95th / 99th percentiles of "lpa".
p75, p95, p99 = np.percentile(df["lpa"], [75, 95, 99])
# Single-row frame; consumed by the percentile rules of the salary chart.
pdf = pd.DataFrame({'p75': [p75], 'p95': [p95], 'p99': [p99]})

2021-08-05 12:34:23.257 | INFO     | info.all_info:get_raw_records:99 - n records: 4134


In [14]:
# Colour constants for the charts in this notebook.
# NOTE(review): DARK_BACKGROUND and DARK_BAR are not referenced in the cells
# visible here -- presumably used by a dark-themed chart elsewhere; confirm.
DARK_BACKGROUND = "#22272D"
# CSS colour name; used as the bar fill of the salary-distribution histogram.
LIGHT_BAR = "mediumseagreen"
DARK_BAR = "#F9A089"

In [27]:
# Display the prepared frame as a visual sanity check of the derived columns.
df

Unnamed: 0,href,post_title,date,post,company,title,Years of Experience,salary_total,location,lpa,Years of Experience (bucket)
1,https://leetcode.com/discuss/compensation/1089...,Phonepe | SDE | Bangalore,2021/03/02,Education: B.Tech in ECE\nYears of Experience:...,PHONEPE,SDE 1,0.000000,3050000.0,bangalore,30.50,0-1
2,https://leetcode.com/discuss/compensation/1231...,Cure.fit | SDE 1 | Bangalore,2021/05/26,Education: B.Tech from Tier-1\nYears of Experi...,CURE.FIT,SDE 1,0.000000,2050000.0,bangalore,20.50,0-1
3,https://leetcode.com/discuss/compensation/9176...,Sprinklr | SE2 | Gurgaon,2020/10/31,Education: BTech in Computer Science from Deem...,SPRINKLR,PRODUCT ENGINEER 1,0.000000,2500000.0,delhi,25.00,0-1
6,https://leetcode.com/discuss/compensation/1205...,HashedIn | SDE 1 | Bangalore,2021/05/13,Education: Tier 2 Engineering College (MCA)\nY...,DELOITTE,SDE 1,2.000000,1100000.0,bangalore,11.00,1-3
9,https://leetcode.com/discuss/compensation/6079...,Ninjacart | SDE2 | Bangalore,2020/05/02,Education: Bachelor's in I.T. from a tier-1 un...,NINJACART,SDE 2,4.500000,2850000.0,bangalore,28.50,3-6
...,...,...,...,...,...,...,...,...,...,...,...
4117,https://leetcode.com/discuss/compensation/1330...,Innominds | New grad | Hyderabad,2021/07/11,Education: B.Tech(IT) from NITs(Tier1 college)...,INNOMINDS,DATA ENGINEER 1,0.000000,700000.0,hyderabad,7.00,0-1
4125,https://leetcode.com/discuss/compensation/6474...,Cashfree | Product Engineer 2 | Bangalore,2020/05/23,Education: Tier 4 college\nYears of Experience...,CASHFREE,,5.000000,3079000.0,bangalore,30.79,3-6
4126,https://leetcode.com/discuss/compensation/9055...,Amazon | SDE I | Bangalore,2020/10/22,Education : Master's in CS from Tier I institu...,AMAZON,SDE 1,1.166667,2225000.0,bangalore,22.25,1-3
4129,https://leetcode.com/discuss/compensation/4772...,Databricks | Senior Cloud Engineer | Bengaluru,2020/01/11,Education: B.E from a top tier university in I...,DATABRICKS,,6.166667,4982000.0,hyderabad,49.82,6-9


In [26]:
# Spot-check the derived "lpa" column (salary_total / 1_00_000).
df["lpa"]

1       30.50
2       20.50
3       25.00
6       11.00
9       28.50
        ...  
4117     7.00
4125    30.79
4126    22.25
4129    49.82
4131    22.46
Name: lpa, Length: 1705, dtype: float64

In [None]:
# The original statement was cut off after "as" (a SyntaxError); completed
# with seaborn's conventional alias.
# NOTE(review): seaborn is not used in the cells visible here -- move this to
# the top import cell if it is needed, otherwise delete the cell.
import seaborn as sns

In [25]:
# salary distribution ----------------------------------------------------------
# Histogram of "lpa" with dashed vertical rules at the p75 / p95 / p99 values
# stored in `pdf`.
_x_lpa = alt.X('lpa', bin=alt.Bin(maxbins=20), title="₹ LPA")
_y_count = alt.Y('count()', axis=alt.Axis(title="Count of Records"))
bar = (
    alt.Chart(df)
    .mark_bar(size=23)
    .encode(x=_x_lpa, y=_y_count, color=alt.value(LIGHT_BAR))
    .properties(width=900, height=350)
)

# p75 gets the emphasised (black, thicker) rule; p95 and p99 share a grey style.
_grey_rule = dict(color='grey', strokeDash=[2, 2])
rule1 = (
    alt.Chart(pdf)
    .mark_rule(color='black', strokeDash=[2, 2], size=1.5)
    .encode(x="p75:Q")
)
rule2 = alt.Chart(pdf).mark_rule(**_grey_rule).encode(x="p95:Q")
rule3 = alt.Chart(pdf).mark_rule(**_grey_rule).encode(x="p99:Q")

# Layer the histogram first so the rules are drawn on top of the bars.
final_bar = bar + rule1 + rule2 + rule3

final_bar