# US Immigration Analytics
### Data Engineering Capstone Project

#### Introduction
This notebook contains some analytics queries performed on the data.

#### Conceptual Data Model

![Conceptual Data Model](images/conceptual_data_model.png)

In [1]:
import boto3
import configparser
import pandas as pd
import json
import os

#### Read Configuration

In [3]:
config = configparser.ConfigParser()

#Normally this file should be in ~/.aws/credentials
config.read_file(open('aws/credentials.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

BUCKET_NAME            = config.get("S3", "BUCKET_NAME")

DWH_ENDPOINT           = config.get("RedShift","DWH_ENDPOINT")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME","BUCKET_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME,BUCKET_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole
9,BUCKET_NAME,capstone-staging-area


#### Connect to Redshift

In [4]:
# Make sure you can connect to the cluster
%load_ext sql
from time import time
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
#print(conn_string)
%sql $conn_string

'Connected: dwhuser@dwh'

#### Run Analysis Queries

In [5]:
from sqlalchemy import create_engine
import pandas as pd
# Test connection to Redshift using Pandas
engine = create_engine('postgresql://'+DWH_DB_USER+':'+DWH_DB_PASSWORD+'@'+DWH_ENDPOINT+':'+DWH_PORT+'/'+DWH_DB)
data_frame = pd.read_sql('SELECT * FROM airport_codes LIMIT 2', engine)
data_frame

Unnamed: 0,ident,name,iso_country,iso_region,municipality,gps_code,iata_code,local_code
0,00A,Total Rf Heliport,US,US-PA,Bensalem,00A,,00A
1,00CL,Williams Ag Airport,US,US-CA,Biggs,00CL,,00CL


#### What is the most target state for people immigrating from Egypt?

In [10]:
pdf1 = pd.read_sql("""
                    SELECT t2.state_name, count(*)
                    FROM i94_immigration t1
                    INNER JOIN states t2 ON t1.i94addr = t2.state_code 
                    INNER JOIN code_country t3 ON NVL(t1.i94cit,t1.i94res) = t3.code
                    WHERE UPPER(t3.country) = 'EGYPT'
                    GROUP BY t2.state_name
                    ORDER BY count(*) DESC
                    LIMIT 5
                    """, engine)
pdf1

Unnamed: 0,state_name,count
0,NEW YORK,22588
1,CALIFORNIA,15562
2,FLORIDA,10068
3,NEW JERSEY,5441
4,TEXAS,4391


#### What is the gender of most of the immigrants?

In [12]:
pdf2 = pd.read_sql("""
                    SELECT gender, count(*)
                    FROM i94_immigration
                    GROUP BY gender
                    ORDER BY count(*) DESC
                    """, engine)
pdf2

Unnamed: 0,gender,count
0,M,18504505
1,F,18176125
2,,4079983
3,U,18906
4,X,11010


#### What are the different reasons for immigrants to go to US with respect to their gender?

In [16]:
pdf3 = pd.read_sql("""
                    SELECT t1.gender,t2.visa_category, count(*)
                    FROM i94_immigration t1
                    INNER JOIN visa_category t2 ON t1.i94visa = t2.code
                    GROUP BY t1.gender,t2.visa_category
                    ORDER BY t1.gender,t2.visa_category
                    """, engine)
pdf3

Unnamed: 0,gender,visa_category,count
0,F,Business,1056725
1,F,Pleasure,16405098
2,F,Student,714302
3,M,Business,3519930
4,M,Pleasure,14126980
5,M,Student,857595
6,U,Business,647
7,U,Pleasure,18224
8,U,Student,35
9,X,Business,215


#### How most of the immigrants arrive at US?

In [18]:
pdf4 = pd.read_sql("""
                    SELECT t2.mode, count(*)
                    FROM i94_immigration t1
                    INNER JOIN port_mode t2 ON t1.i94mode = t2.code
                    GROUP BY t2.mode
                    ORDER BY count(*) DESC
                    """, engine)
pdf4

Unnamed: 0,mode,count
0,Air,39166088
1,Land,1095001
2,Sea,387184
3,Not reported,142256


#### Which state most of the immigrants arrive at?

In [21]:
pdf5 = pd.read_sql("""
                    SELECT t2.port, count(*)
                    FROM i94_immigration t1
                    INNER JOIN ports t2 ON t1.i94port = t2.code
                    GROUP BY t2.port
                    ORDER BY count(*) DESC
                    LIMIT 5
                    """, engine)
pdf5

Unnamed: 0,port,count
0,NEW YORK,6678555
1,MIAMI,5122889
2,LOS ANGELES,4602847
3,SAN FRANCISCO,2309621
4,HONOLULU,2249967


#### What states targeted by immigrants who live in countries with average tempreture above 21?

In [32]:
pdf6 = pd.read_sql("""
                        SELECT t4.state_name, count(*)
                        FROM i94_immigration t1
                        INNER JOIN code_country t2 ON NVL(t1.i94cit,t1.i94res) = t2.code
                        INNER JOIN world_temp t3 ON UPPER(t2.country) = UPPER(t3.country) AND t3.avgtemp > 21
                        INNER JOIN states t4 ON t1.i94addr = t4.state_code 
                        GROUP BY t4.state_name
                        ORDER BY count(*) DESC
                        LIMIT 5
                    """, engine)
pdf6

Unnamed: 0,state_name,count
0,CALIFORNIA,196054996
1,FLORIDA,188196898
2,NEW YORK,126054016
3,TEXAS,76178157
4,NEW JERSEY,53648489


#### What states targeted by immigrants who live in countries with average tempreture less than 21?

In [33]:
pdf7 = pd.read_sql("""
                        SELECT t4.state_name, count(*)
                        FROM i94_immigration t1
                        INNER JOIN code_country t2 ON NVL(t1.i94cit,t1.i94res) = t2.code
                        INNER JOIN world_temp t3 ON UPPER(t2.country) = UPPER(t3.country) AND t3.avgtemp < 21
                        INNER JOIN states t4 ON t1.i94addr = t4.state_code 
                        GROUP BY t4.state_name
                        ORDER BY count(*) DESC
                        LIMIT 5
                    """, engine)
pdf7

Unnamed: 0,state_name,count
0,CALIFORNIA,512496489
1,NEW YORK,337520921
2,HAWAII,306942969
3,FLORIDA,303806932
4,TEXAS,89181042


#### What is the most week in the year that has the most immigrants arriving?

In [36]:
pdf8 = pd.read_sql("""
                        SELECT t2.week, count(*)
                        FROM i94_immigration t1
                        INNER JOIN i94_dates t2 ON t1.arrdate = t2.full_date
                        GROUP BY t2.week
                        ORDER BY count(*) DESC
                        LIMIT 4
                    """, engine)
pdf8

Unnamed: 0,week,count
0,31,1012313
1,29,982410
2,28,978018
3,30,975758


#### What is the most month in the year that has the most immigrants arriving?

In [37]:
pdf8 = pd.read_sql("""
                        SELECT t2.month, count(*)
                        FROM i94_immigration t1
                        INNER JOIN i94_dates t2 ON t1.arrdate = t2.full_date
                        GROUP BY t2.month
                        ORDER BY count(*) DESC
                        LIMIT 4
                    """, engine)
pdf8

Unnamed: 0,month,count
0,7,4265031
1,8,4103570
2,9,3733786
3,10,3649136


#### Which airport most of the immigrants arrive at?

In [40]:
pdf9 = pd.read_sql("""
                        SELECT t2.name, count(*)
                        FROM i94_immigration t1
                        INNER JOIN airport_codes t2 ON t1.airline = NVL(t2.iata_code,t2.local_code)
                        GROUP BY t2.name
                        ORDER BY count(*) DESC
                        LIMIT 4
                    """, engine)
pdf9

Unnamed: 0,name,count
0,Darke County Airport,321739
1,Timbuktu Airport,87316
2,AraÃ§atuba Airport,45836
3,Arufi Airstrip,45836


#### How many immigrants arrive at states with the highest population?

In [50]:
pdf10 = pd.read_sql("""
                        SELECT t2.state, count(*)
                        FROM i94_immigration t1
                        INNER JOIN (
                                SELECT state_code,state,sum(total_population) as state_total_population
                                FROM us_cities_demographics
                                GROUP BY state_code,state
                                ORDER BY sum(total_population) desc
                                LIMIT 3
                        ) t2 ON t1.i94addr = t2.state_code
                        GROUP BY t2.state
                        ORDER BY count(*) DESC
                        LIMIT 3
                    """, engine)
pdf10

Unnamed: 0,state,count
0,New York,6764396
1,California,6531491
2,Texas,1690521


#### How many immigrants arrive at states with the lowest population?

In [51]:
pdf11 = pd.read_sql("""
                        SELECT t2.state, count(*)
                        FROM i94_immigration t1
                        INNER JOIN (
                                SELECT state_code,state,sum(total_population) as state_total_population
                                FROM us_cities_demographics
                                GROUP BY state_code,state
                                ORDER BY sum(total_population) asc
                                LIMIT 3
                        ) t2 ON t1.i94addr = t2.state_code
                        GROUP BY t2.state
                        ORDER BY count(*) DESC
                        LIMIT 3
                    """, engine)
pdf11

Unnamed: 0,state,count
0,Maine,49921
1,Delaware,40636
2,Montana,33805
