# Data Wrangling

https://www.cbp.gov/newsroom/stats

https://www.ice.gov/detain/detention-management

https://www.uscis.gov/tools/reports-and-studies/immigration-and-citizenship-data

https://ohss.dhs.gov/topics/immigration#other-resources 


# Ideas
- Write about H1B, H2A, H2B, or other visas
- Write about detained individual counts at the border
- Write about detained individuals within the US
- Forecast immigration data
- Forecast impacts of policy changes

- Combine company visa information with their stock information
- Investigate if there are any correlations between stock information and visa information
- Can we make price movement predictions with visa information of publicly listed companies



In [1]:
# Imports
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from yfinance import ticker
import json
from io import StringIO
import re

In [None]:
# Gets a list of all listed companies from SEC's EDGAR database
#"https://www.sec.gov/files/company_tickers.json"

with open('data/company_tickers.json', 'r') as f:
    data = json.load(f)
#companies = [entry["ticker"] for entry in response.values()]
#print(companies[:10])  # Print first 10 tickers

{'0': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'},
 '1': {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'},
 '2': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'},
 '3': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'},
 '4': {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'},
 '5': {'cik_str': 1326801, 'ticker': 'META', 'title': 'Meta Platforms, Inc.'},
 '6': {'cik_str': 1318605, 'ticker': 'TSLA', 'title': 'Tesla, Inc.'},
 '7': {'cik_str': 1730168, 'ticker': 'AVGO', 'title': 'Broadcom Inc.'},
 '8': {'cik_str': 1067983,
  'ticker': 'BRK-B',
  'title': 'BERKSHIRE HATHAWAY INC'},
 '9': {'cik_str': 104169, 'ticker': 'WMT', 'title': 'Walmart Inc.'},
 '10': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'},
 '11': {'cik_str': 19617, 'ticker': 'JPM', 'title': 'JPMORGAN CHASE & CO'},
 '12': {'cik_str': 1403161, 'ticker': 'V', 'title': 'VISA INC.'},
 '13': {'cik_str': 884394, 'ticker': 'SPY', 'title': 'S

In [3]:

h1b_2009_path = 'data/h1b_2016_2009.csv'
h1b_2017_path = 'data/h1b_2024_2017.csv'

h2a_2015_path = 'data/h2a_2019_2015.csv'
h2a_2020_path = 'data/h2a_2024_2020.csv'

h2b_2015_path = 'data/h2b_2019_2015.csv'
h2b_2020_path = 'data/h2b_2025_2020.csv'

#Had to convert all encoding to UTF8 via notepad++
def read_tsv(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return pd.read_csv(StringIO(f.read()), delimiter="\t")

# Read all files using the function
df_h1b_a = read_tsv(h1b_2009_path)
df_h1b_b = read_tsv(h1b_2017_path)

df_h2a_a = read_tsv(h2a_2015_path)
df_h2a_b = read_tsv(h2a_2020_path)

df_h2b_a = read_tsv(h2b_2015_path)
df_h2b_b = read_tsv(h2b_2020_path)


  return pd.read_csv(StringIO(f.read()), delimiter="\t")
  return pd.read_csv(StringIO(f.read()), delimiter="\t")
  return pd.read_csv(StringIO(f.read()), delimiter="\t")
  return pd.read_csv(StringIO(f.read()), delimiter="\t")


In [4]:
df_h1b = pd.concat([df_h1b_a, df_h1b_b], axis=0)

df_h2a = pd.concat([df_h2a_a, df_h2a_b], axis=0)

df_h2b = pd.concat([df_h2b_a, df_h2b_b], axis=0)


In [5]:
print(df_h1b.columns)
print(df_h2a.columns)
print(df_h2b.columns)

Index(['Line by line', 'Fiscal Year   ', 'Employer (Petitioner) Name',
       'Tax ID', 'Industry (NAICS) Code', 'Petitioner City',
       'Petitioner State', 'Petitioner Zip Code', 'Initial Approval',
       'Initial Denial', 'Continuing Approval', 'Continuing Denial'],
      dtype='object')
Index(['Index()', 'Action Fiscal Year', 'Employer (Petitioner) Name', 'Tax ID',
       'Industry', 'Occupation (SOC) Code', 'Petitioner City',
       'Petitioner State', 'Petitioner Zip Code', 'Worksite State',
       'Consular_processed', 'Wage Rate Band', 'New Employment Approval',
       'New Employment Denial', 'Continuation Approval', 'Continuation Denial',
       'Change with Same Employer Approval',
       'Change with Same Employer Denial', 'New Concurrent Approval',
       'New Concurrent Denial', 'Change of Employer Approval',
       'Change of Employer Denial', 'Amended Approval', 'Amended Denial'],
      dtype='object')
Index(['Index()', 'Cap Fiscal Year', 'Cap Type', 'Employer (Petiti

In [6]:
titles = [company['title'].upper() for company in data.values()]

In [7]:
def remove_meta_characters(input_string):
    # Define a regex pattern to match all meta characters
    pattern = r'[^\w\s]'
    # Substitute meta characters with an empty string
    cleaned_string = re.sub(pattern, '', input_string)
    return cleaned_string

In [8]:
pattern = r'[^\w\s]'
df_h1b['employer'] = df_h1b['Employer (Petitioner) Name'].str.upper()
df_h2a['employer'] = df_h2a['Employer (Petitioner) Name'].str.upper()
df_h2b['employer'] = df_h2b['Employer (Petitioner) Name'].str.upper()

In [9]:
print('H1B Crossover Count ' + str(sum(df_h1b["employer"].isin(titles))))
print('H2A Crossover Count ' + str(sum(df_h2a["employer"].isin(titles))))
print('H2B Crossover Count ' + str(sum(df_h2b["employer"].isin(titles))))

H1B Crossover Count 5245
H2A Crossover Count 53
H2B Crossover Count 4


In [24]:
df_h1b_listed = df_h1b[ df_h1b["employer"].isin(titles) ]
df_h1b_listed['type'] = 'h1b'

df_h2a_listed = df_h2a[ df_h2a["employer"].isin(titles) ]
df_h2a_listed['type'] = 'h2a'

df_h2b_listed = df_h2b[ df_h2b["employer"].isin(titles) ]
df_h2b_listed['type'] = 'h2b'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h1b_listed['type'] = 'h1b'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h2a_listed['type'] = 'h2a'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h2b_listed['type'] = 'h2b'


In [25]:
print(df_h1b.columns)
print(df_h2a.columns)
print(df_h2b.columns)

Index(['Line by line', 'Fiscal Year   ', 'Employer (Petitioner) Name',
       'Tax ID', 'Industry (NAICS) Code', 'Petitioner City',
       'Petitioner State', 'Petitioner Zip Code', 'Initial Approval',
       'Initial Denial', 'Continuing Approval', 'Continuing Denial',
       'employer'],
      dtype='object')
Index(['Index()', 'Action Fiscal Year', 'Employer (Petitioner) Name', 'Tax ID',
       'Industry', 'Occupation (SOC) Code', 'Petitioner City',
       'Petitioner State', 'Petitioner Zip Code', 'Worksite State',
       'Consular_processed', 'Wage Rate Band', 'New Employment Approval',
       'New Employment Denial', 'Continuation Approval', 'Continuation Denial',
       'Change with Same Employer Approval',
       'Change with Same Employer Denial', 'New Concurrent Approval',
       'New Concurrent Denial', 'Change of Employer Approval',
       'Change of Employer Denial', 'Amended Approval', 'Amended Denial',
       'employer'],
      dtype='object')
Index(['Index()', 'Cap Fisca

In [26]:
df_visas = pd.concat([df_h1b_listed, df_h2a_listed, df_h2b_listed])
df_visas

Unnamed: 0,Line by line,Fiscal Year,Employer (Petitioner) Name,Tax ID,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Initial Denial,...,Change of Employer Approval,Change of Employer Denial,Amended Approval,Amended Denial,Cap Fiscal Year,Cap Type,Work Site State,Consular_Processed,Hourly Wage,ETA Case Number
547,548,2016.0,ABBOTT LABORATORIES,8440.0,,NORTH CHICAGO,IL,60064.0,1,0,...,,,,,,,,,,
548,549,2016.0,ABBOTT LABORATORIES,8440.0,31-33 - Manufacturing,ABBOT PARK,IL,60064.0,0,0,...,,,,,,,,,,
549,550,2016.0,ABBOTT LABORATORIES,8440.0,31-33 - Manufacturing,ABBOTT PARK,IL,60064.0,2,0,...,,,,,,,,,,
550,551,2016.0,ABBOTT LABORATORIES,8440.0,31-33 - Manufacturing,NORTH CHICAGO,IL,60064.0,0,0,...,,,,,,,,,,
694,695,2016.0,ACADIA PHARMACEUTICALS INC,6651.0,"54 - Professional, Scientific, and Technical S...",SAN DIEGO,CA,92130.0,1,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69693,,,CHS INC,1095.0,,FREDERICK,OK,73542.0,,,...,0,0.0,0.0,0.0,,,,,,
3876,,,UTG INC,7892.0,72 - Accommodation and Food Services,Stanford,KY,40484.0,,,...,0.0,0.0,0.0,0.0,2024.0,1st Half,KY,Yes,$20.00 and more,H40023186166872
9245,,,GREEN LEAF INNOVATIONS INC,3426.0,56 - Administrative and Support and Waste Mana...,Raleigh,NC,27605.0,,,...,11.0,0.0,0.0,0.0,2024.0,Exempt,NC,No,$14.00 - $15.99,H40023276407455
9246,,,GREEN LEAF INNOVATIONS INC,3426.0,56 - Administrative and Support and Waste Mana...,Raleigh,NC,27605.0,,,...,0.0,0.0,0.0,0.0,2024.0,Exempt,NC,Yes,$14.00 - $15.99,H40023276407455


h1b_drop_cols = ['Line by line', 'Employer (Petitioner) Name', 'Tax ID', 'Initial Denial', 'Continuing Denial', 'Initial Approval', 'Continuing Approval']
#Adding the columns does not work since they are currently strings, to_numeric not working due to commas
df_h1b_listed['tot'] = df_h1b_listed['Initial Approval'] + df_h1b_listed['Continuing Approval']
df_h1b_clean = df_h1b_listed.drop(h1b_drop_cols, axis = 1)
df_h1b_clean

In [31]:
df_visas.columns


Index(['Line by line', 'Fiscal Year   ', 'Employer (Petitioner) Name',
       'Tax ID', 'Industry (NAICS) Code', 'Petitioner City',
       'Petitioner State', 'Petitioner Zip Code', 'Initial Approval',
       'Initial Denial', 'Continuing Approval', 'Continuing Denial',
       'employer', 'type', 'Index()', 'Action Fiscal Year', 'Industry',
       'Occupation (SOC) Code', 'Worksite State', 'Consular_processed',
       'Wage Rate Band', 'New Employment Approval', 'New Employment Denial',
       'Continuation Approval', 'Continuation Denial',
       'Change with Same Employer Approval',
       'Change with Same Employer Denial', 'New Concurrent Approval',
       'New Concurrent Denial', 'Change of Employer Approval',
       'Change of Employer Denial', 'Amended Approval', 'Amended Denial',
       'Cap Fiscal Year', 'Cap Type', 'Work Site State', 'Consular_Processed',
       'Hourly Wage', 'ETA Case Number'],
      dtype='object')

In [32]:
drop_cols = ['Line by line', 'Employer (Petitioner) Name', 'Tax ID', 'Initial Denial', 'Continuing Denial', 'Index()', 'Consular_processed', 'New Employment Denial', 'Continuation Denial', 'Change with Same Employer Denial', 'New Concurrent Denial', 'Change of Employer Denial', 'Amended Denial', 'Consular_Processed', 'ETA Case Number']
df_visas_clean = df_visas.drop(drop_cols, axis = 1)

In [33]:
df_visas_clean

Unnamed: 0,Fiscal Year,Industry (NAICS) Code,Petitioner City,Petitioner State,Petitioner Zip Code,Initial Approval,Continuing Approval,employer,type,Action Fiscal Year,...,New Employment Approval,Continuation Approval,Change with Same Employer Approval,New Concurrent Approval,Change of Employer Approval,Amended Approval,Cap Fiscal Year,Cap Type,Work Site State,Hourly Wage
547,2016.0,,NORTH CHICAGO,IL,60064.0,1,1,ABBOTT LABORATORIES,h1b,,...,,,,,,,,,,
548,2016.0,31-33 - Manufacturing,ABBOT PARK,IL,60064.0,0,1,ABBOTT LABORATORIES,h1b,,...,,,,,,,,,,
549,2016.0,31-33 - Manufacturing,ABBOTT PARK,IL,60064.0,2,41,ABBOTT LABORATORIES,h1b,,...,,,,,,,,,,
550,2016.0,31-33 - Manufacturing,NORTH CHICAGO,IL,60064.0,0,5,ABBOTT LABORATORIES,h1b,,...,,,,,,,,,,
694,2016.0,"54 - Professional, Scientific, and Technical S...",SAN DIEGO,CA,92130.0,1,1,ACADIA PHARMACEUTICALS INC,h1b,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69693,,,FREDERICK,OK,73542.0,,,CHS INC,h2a,2020.0,...,16,0,0,0,0,0.0,,,,
3876,,72 - Accommodation and Food Services,Stanford,KY,40484.0,,,UTG INC,h2b,,...,1,0.0,0.0,0.0,0.0,0.0,2024.0,1st Half,KY,$20.00 and more
9245,,56 - Administrative and Support and Waste Mana...,Raleigh,NC,27605.0,,,GREEN LEAF INNOVATIONS INC,h2b,,...,0,0.0,0.0,0.0,11.0,0.0,2024.0,Exempt,NC,$14.00 - $15.99
9246,,56 - Administrative and Support and Waste Mana...,Raleigh,NC,27605.0,,,GREEN LEAF INNOVATIONS INC,h2b,,...,15,0.0,0.0,0.0,0.0,0.0,2024.0,Exempt,NC,$14.00 - $15.99
