# Visualization

In [197]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
import plotly.graph_objs as go1

In [198]:
# Load the H-1B dataset. low_memory=False lets pandas infer each column's dtype
# from the whole file rather than chunk by chunk, which avoids the
# "Columns (9) have mixed types" DtypeWarning and gives that column one
# consistent dtype.
h1b_df = pd.read_csv("h1b_df.csv", low_memory=False)


Columns (9) have mixed types.Specify dtype option on import or set low_memory=False.



In [199]:
# Show the column names of the loaded frame.
h1b_df.columns

Index(['Unnamed: 0', 'CASE_NUMBER', 'DECISION_DATE', 'CASE_STATUS', 'REFILE',
       'EMPLOYER_NAME', 'EMPLOYER_STATE', 'PW_SOC_TITLE', 'PW_LEVEL_9089',
       'PW_AMOUNT_9089', 'PW_UNIT_OF_PAY_9089', 'JOB_INFO_JOB_TITLE',
       'JOB_INFO_EDUCATION', 'JOB_INFO_EXPERIENCE',
       'JOB_INFO_EXPERIENCE_NUM_MONTHS', 'COUNTRY_OF_CITIZENSHIP',
       'CLASS_OF_ADMISSION', 'Year'],
      dtype='object')

In [200]:
# Drop the leftover "Unnamed: 0" column. Reassigning (instead of inplace=True)
# together with errors="ignore" keeps this cell safe to re-run once the column
# is already gone.
h1b_df = h1b_df.drop(columns="Unnamed: 0", errors="ignore")

In [201]:
# Preview the first rows of the frame.
h1b_df.head()

Unnamed: 0,CASE_NUMBER,DECISION_DATE,CASE_STATUS,REFILE,EMPLOYER_NAME,EMPLOYER_STATE,PW_SOC_TITLE,PW_LEVEL_9089,PW_AMOUNT_9089,PW_UNIT_OF_PAY_9089,JOB_INFO_JOB_TITLE,JOB_INFO_EDUCATION,JOB_INFO_EXPERIENCE,JOB_INFO_EXPERIENCE_NUM_MONTHS,COUNTRY_OF_CITIZENSHIP,CLASS_OF_ADMISSION,Year
0,A-16270-56093,2017-01-03,Denied,N,Union General Hospital,GA,Medical and Clinical Laboratory Technologists,Level I,19.84,Hour,Medical Technologist,Bachelor's,Y,36.0,PHILIPPINES,H-1B,2017
1,A-16281-60165,2017-01-03,Denied,N,TYCO ELECTRONICS CORPORATION,PA,Computer Systems Analysts,Level III,94598.0,Year,IT Auditor,Bachelor's,Y,24.0,INDIA,H-1B,2017
2,A-16267-55773,2017-01-03,Denied,N,WAYNSYS INC,NJ,Network and Computer Systems Administrators*,Level I,61734.0,Year,NETWORK ENGINEER/ADMINISTRATOR,Bachelor's,N,,INDONESIA,H-1B,2017
3,A-16202-34492,2017-01-03,Denied,N,"PRUTECH SOLUTIONS, INC",NJ,Computer Systems Analysts,Level IV,116709.0,Year,Sr. Business Systems Analyst I,Master's,N,,INDIA,H-1B,2017
4,A-15265-20526,2017-01-03,Denied,N,CLOUD SHERPAS INC.,GA,Computer Systems Analysts,Level IV,100422.0,Year,Senior Technical Consultant,Bachelor's,Y,60.0,INDIA,H-1B,2017


In [202]:
# Count the certified cases in each year and draw the yearly totals as a line.
certified_mask = h1b_df["CASE_STATUS"].eq("Certified")
h1b_df_c = (
    h1b_df.loc[certified_mask]
    .groupby("Year")["CASE_STATUS"]
    .count()
    .reset_index()
)
fig = px.line(
    h1b_df_c,
    x="Year",
    y="CASE_STATUS",
    title="CERTIFIED CASES FROM 2017-2021",
)
fig.show()

The plot shows an upward trend in certified H-1B cases from 2017 to 2021. The number of H-1B approvals dropped during the Trump administration and has increased significantly in the recent past.

In [217]:
# One pie chart per year showing the share of each case status.
yearly_groups = list(h1b_df.groupby("Year"))

# Grid size is derived from the number of years instead of being hard-coded.
cols = 3
rows = max(1, -(-len(yearly_groups) // cols))  # ceiling division

# Pie traces need 'domain'-type subplot cells; build an independent dict per cell.
specs = [[{"type": "domain"} for _ in range(cols)] for _ in range(rows)]
fig = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[str(year) for year, _ in yearly_groups],
    specs=specs,
)

for position, (year, year_df) in enumerate(yearly_groups):
    # Take labels and values from the SAME value_counts() result so each label is
    # always paired with its own count. (A separately sorted label list lines up
    # with value_counts() only when alphabetical order happens to match count order.)
    status_counts = year_df["CASE_STATUS"].value_counts().sort_index()
    fig.add_trace(
        go.Pie(
            labels=status_counts.index.tolist(),
            values=status_counts.tolist(),
            showlegend=False,
            textposition="inside",
            textinfo="label+percent",
        ),
        # Fill the grid row by row; the column index must wrap on `cols`.
        row=position // cols + 1,
        col=position % cols + 1,
    )

fig.update_layout(title="Certified vs Denied Every Year", title_x=0.5)
fig.show()

The subplots compare certified and denied cases from 2017 to 2021, with one pie chart per year. 2018 had the highest share of denied H-1B cases: about 1 out of every 4 applications was denied. While 2018 was the year with the highest denial rate, 2020 and 2021 had the highest approval rates. This might be due to various factors such as presidential elections, changes in government, etc.

In [218]:
# Ten employer states with the most certified cases, shown as a bar chart.
is_certified = h1b_df["CASE_STATUS"] == "Certified"
h1b_df_c = h1b_df[is_certified]
h1b_df_c = (
    h1b_df_c.groupby("EMPLOYER_STATE")["CASE_NUMBER"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)
fig = px.bar(
    h1b_df_c,
    x="EMPLOYER_STATE",
    y="CASE_NUMBER",
    title="States with highest certified cases",
)
fig.show()

California has always been a hub for the software industry, providing job opportunities for the majority of immigrants. It has many companies that hire a huge number of employees every year. As a result, California has the highest number of certified H-1B cases, leading by a large margin. The next state is Texas, with less than half of California's approvals. Texas has always been a hub for foreigners, especially Indians; the majority of Indians living in the US settle in either Texas or California. Other states in the top 10 include New York, New Jersey, Washington, Virginia, etc.

In [205]:
# Employers with the most cases, shown as a pie chart.
# The aggregate is built before anything reads it (it used to be referenced one
# line before its definition, which fails on a fresh kernel), and the number of
# rows kept now matches the "Top 10" title.
TOP_EMPLOYERS = 10
h1b_df_e = (
    h1b_df.groupby("EMPLOYER_NAME")["CASE_NUMBER"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(TOP_EMPLOYERS)
)
# `names` already supplies the slice labels; px.pie's `labels` argument is a
# column-name -> display-name mapping, so no list of employer names is passed.
fig = px.pie(
    h1b_df_e,
    names="EMPLOYER_NAME",
    values="CASE_NUMBER",
    title=f"Top {TOP_EMPLOYERS} companies sponsoring H1B",
)
fig.show()

Tech giants such as Microsoft, Google, Facebook, and Amazon top the list of H-1B sponsors. Along with these product-based companies, service-based consulting companies that sponsor H-1Bs include Tata Consultancy Services, Infosys, Cognizant, etc. All of these established software companies have high H-1B sponsorship. If an individual gets a job at one of these companies, the probability of their H-1B being sponsored is very high.

In [206]:
#Removing commas from salary and converting to float types
# Cast to str first: the column is read with mixed types, and `.str.replace` on a
# mixed object column returns NaN for entries that are already numeric, silently
# discarding those wages. Casting first also keeps this cell re-runnable after
# the column has become float (missing values round-trip as "nan" -> NaN).
h1b_df["PW_AMOUNT_9089"] = (
    h1b_df["PW_AMOUNT_9089"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype("float")
)

In [207]:
#Filtering data based upon salary ranges
SALARY_EDGES = (70000, 80000, 90000, 100000)


def split_by_salary(cases, edges=SALARY_EDGES, column="PW_AMOUNT_9089"):
    """Split ``cases`` into consecutive wage brackets.

    Brackets are left-closed / right-open: ``< edges[0]``,
    ``[edges[0], edges[1])``, ..., ``>= edges[-1]``. Every row with a non-null
    wage therefore lands in exactly one bracket; strict ``>`` / ``<`` on both
    sides would drop wages that sit exactly on an edge (e.g. 80000).
    Rows with a missing wage are excluded from every bracket.

    Returns a list of ``len(edges) + 1`` DataFrames, lowest bracket first.
    """
    wage = cases[column]
    lower_bounds = (None,) + tuple(edges)
    upper_bounds = tuple(edges) + (None,)
    brackets = []
    for low, high in zip(lower_bounds, upper_bounds):
        in_bracket = wage.notna()
        if low is not None:
            in_bracket &= wage >= low
        if high is not None:
            in_bracket &= wage < high
        brackets.append(cases[in_bracket])
    return brackets


h1b_df_c = h1b_df[h1b_df["CASE_STATUS"] == "Certified"]
h1b_df_d = h1b_df[h1b_df["CASE_STATUS"] == "Denied"]

# NOTE(review): PW_UNIT_OF_PAY_9089 contains non-annual units (e.g. "Hour" with a
# wage of 19.84 in the preview above); such wages are compared against annual
# thresholds here -- confirm whether they should be annualized first.
c_70, c_70_80, c_80_90, c_90_100, c_100 = split_by_salary(h1b_df_c)
d_70, d_70_80, d_80_90, d_90_100, d_100 = split_by_salary(h1b_df_d)


In [219]:
# Stacked bar chart: number of certified and denied cases in each salary bracket.
sal = ["<70k", "70k-80k", "80k-90k", "90k-100k", ">100k"]
c = [len(bracket) for bracket in (c_70, c_70_80, c_80_90, c_90_100, c_100)]
d = [len(bracket) for bracket in (d_70, d_70_80, d_80_90, d_90_100, d_100)]

fig = go.Figure()
fig.add_trace(go.Bar(name="Certified", x=sal, y=c))
fig.add_trace(go.Bar(name="Denied", x=sal, y=d))
fig.update_layout(barmode="stack")
fig.show()

As seen in the earlier plot, the highest H-1B sponsorship corresponds to renowned, established companies like Amazon, Microsoft, Google, Apple, etc. These companies offer some of the best salaries in the industry, and almost every employee working at them receives a salary greater than 100k. Hence, H-1B sponsorship for salaries greater than 100k is much higher than for salaries below 100k. To distinguish between the various salary levels, the dataset has been filtered once per salary range.

In [221]:
# Case counts per education level, largest first. The first two slices stay in
# place and the following five are pulled out of the pie.
h1b_job = (
    h1b_df.groupby("JOB_INFO_EDUCATION")["CASE_NUMBER"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
education_pie = go.Pie(
    labels=h1b_job["JOB_INFO_EDUCATION"].tolist(),
    values=h1b_job["CASE_NUMBER"].tolist(),
    pull=[0, 0] + [0.2] * 5,
)
fig = go.Figure(data=[education_pie])
fig.show()

More than 90% of H-1B applicants have earned either a Master's or a Bachelor's degree. Many immigrants dream of pursuing higher education in the USA because of its top-notch facilities and faculty, hence the high ratio. Other educational backgrounds include Doctorate, High School, etc.; however, their proportion is minimal compared to that of Master's and Bachelor's. The Master's category leads with 50.4%, while the Bachelor's category accounts for nearly 42% of applications.

In [210]:
# Ten most frequent SOC job titles, drawn as a donut chart.
h1b_title = (
    h1b_df.groupby("PW_SOC_TITLE")["CASE_NUMBER"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

fig = px.pie(
    h1b_title,
    values="CASE_NUMBER",
    names="PW_SOC_TITLE",
    title="Job Titles Statistics",
    hole=.3,
)
fig.show()

To verify whether H-1B sponsors really tend to be software giants, let us plot a graph of job titles. If most of the job titles relate to the software industry, our insight holds true. As expected, job titles corresponding to the software industry lead the race for H-1B approvals. The majority of applicants are Software Developers. Other job titles include Systems Engineers, Data Analysts, Statisticians, etc. A few other industries represented include the Electronics, Mechanical, and Accounting sectors.

In [211]:
# Number of cases per country of citizenship, largest first.
h1b_coun = (
    h1b_df.groupby("COUNTRY_OF_CITIZENSHIP")["CASE_NUMBER"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

In [212]:
#Using pycountry method to map ISO codes of each country
import pycountry
def do_fuzzy_search(country):
    """Return the alpha-3 code of the first pycountry fuzzy match for ``country``.

    If the fuzzy search raises any exception, ``np.nan`` is returned instead so
    the function can be applied to a whole column without aborting.
    """
    try:
        matches = pycountry.countries.search_fuzzy(country)
    except Exception:
        return np.nan
    # Kept outside the try block: only the search itself is guarded.
    return matches[0].alpha_3

In [213]:
# Attach an ISO code to every country name; names that cannot be resolved get NaN.
country_names = h1b_coun["COUNTRY_OF_CITIZENSHIP"]
h1b_coun["iso_code"] = country_names.apply(do_fuzzy_search)
h1b_coun

Unnamed: 0,COUNTRY_OF_CITIZENSHIP,CASE_NUMBER,iso_code
0,INDIA,136661,IND
1,CHINA,26567,CHN
2,CANADA,4848,CAN
3,SOUTH KOREA,2609,KOR
4,PHILIPPINES,1759,PHL
...,...,...,...
176,QATAR,1,QAT
177,TURKS AND CAICOS ISLANDS,1,TCA
178,MONTSERRAT,1,MSR
179,SOLOMON ISLANDS,1,SLB


In [214]:
# World map coloured by the number of cases per country of citizenship.
# NOTE(review): per the plotly docs, autocolorscale=True selects plotly's default
# palette, so the 'Blues' colorscale requested here likely has no effect --
# confirm which of the two settings is intended.
case_map = go.Choropleth(
    locations=h1b_coun["iso_code"],
    z=h1b_coun["CASE_NUMBER"],
    text=h1b_coun["COUNTRY_OF_CITIZENSHIP"],
    colorscale="Blues",
    autocolorscale=True,
    reversescale=True,
    marker_line_color="darkgray",
    marker_line_width=1,
)
fig = go.Figure(data=case_map)
fig.show()

The data also includes each applicant's country of citizenship, so we can draw insights about where these immigrants are coming from. According to the data, India is the leading country by a very large margin, followed by China. Both India and China have very large populations, so we can assume that the number of people who dream of studying abroad is correspondingly higher. Other top countries include Canada, South Korea, etc. A choropleth map has been plotted to visualize the results.