# Scrape Visa Requirement -- US Citizen

## 1. Dependencies

In [3]:
from io import StringIO

import pandas as pd
from matplotlib import pyplot as plt

from splinter import Browser
from bs4 import BeautifulSoup as bs

import pymongo

In [5]:
# Ask the shell where the chromedriver binary lives on PATH; the printed
# path is the value hard-coded as `executable_path` when the browser is launched.
!which chromedriver

/usr/local/bin/chromedriver


# 2. Scrape Data

## 2.1 Launch Driver

In [6]:
# Start a visible (non-headless) Chrome session through chromedriver.
# The window stays on an empty page until a URL is visited.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

## 2.2 Scrape Wiki Page

In [7]:
# Point the browser at the Wikipedia page that tabulates visa requirements
# for United States citizens.
url = 'https://en.wikipedia.org/wiki/Visa_requirements_for_United_States_citizens'
browser.visit(url)

In [94]:
# Parse every <table> in the rendered page into a DataFrame.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated (pandas >= 2.1); file-like input works on
# every pandas version.
tables = pd.read_html(StringIO(browser.html))

# The first table on the page is the per-country visa requirement table.
visa_df = tables[0]

In [95]:
# Drop the unnecessary columns and rename the rest to snake_case.
#
# The frame is rebuilt from the raw table (`tables[0]`) without inplace=True,
# so the raw table is never mutated and this cell can be re-run safely.
# The previous in-place drop raised KeyError on a second run because
# 'Allowed stay' had already been removed.
col_to_drop = ['Allowed stay']
visa_df = (
    tables[0]
    .drop(columns=col_to_drop)
    .rename(columns={'Country': 'country',
                     'Visa requirement': 'visa_requirement',
                     'Notes (excluding departure fees)': 'notes'})
)
visa_df.head()

Unnamed: 0,country,visa_requirement,notes
0,Afghanistan,Visa required[2][3],Visitors born in Afghanistan do not require a ...
1,Albania,Visa not required[5][6],
2,Algeria,Visa required[8][9],Persons may be denied entry if entering with a...
3,Andorra,Visa not required[10],
4,Angola,eVisa[13][14][15],Visitors who have been granted an online pre-v...


In [96]:
# Strip the Wikipedia citation markers (e.g. "Visa required[2][3]") by keeping
# only the text before the first '['.
#
# The vectorized .str accessor addresses the column by name rather than by
# position, and leaves missing values as NaN instead of raising
# AttributeError the way a per-row `.split` call does on a float.
visa_df['visa_requirement'] = (
    visa_df['visa_requirement'].str.split('[', n=1).str[0]
)

In [97]:
# Tally how many countries fall under each raw requirement label.
visa_df.loc[:, 'visa_requirement'].value_counts()

Visa not required                  107
Visa required                       30
Visa on arrival                     19
eVisa / Visa on arrival             18
eVisa                               11
Travel restricted                    1
Entry Permit on arrival              1
Visitor's Permit on arrival          1
Visitor's permit on arrival          1
eVisa / Tourist card on arrival      1
Tourist Card required                1
Electronic Travel Authority          1
Online Visa                          1
Name: visa_requirement, dtype: int64

In [98]:
# Collapse the raw requirement labels into a few categories:
#   * any label containing one of `auto_visa_key_words` -> "eVisa/Visa on arrival"
#   * any remaining label listed in `restrict_list`     -> "Travel restricted"
#   * everything else is left unchanged.
auto_visa_key_words = ["on arrival", 'eVisa', 'Electronic', 'Online']
restrict_list = ["Tourist Card required", "Travel restricted"]

# Build both masks from the current labels before anything is overwritten.
requirement = visa_df['visa_requirement']
is_auto_visa = pd.Series(False, index=visa_df.index)
for word in auto_visa_key_words:
    # regex=False -> plain substring match; na=False -> missing labels never match.
    is_auto_visa = is_auto_visa | requirement.str.contains(word, regex=False, na=False)

# Only rows that were not already classified as eVisa / on-arrival can be restricted.
is_restricted = requirement.isin(restrict_list) & ~is_auto_visa

# Assign through .loc with boolean masks. The earlier chained form
# `visa_df['visa_requirement'][i] = ...` triggers SettingWithCopyWarning and
# is a silent no-op when pandas Copy-on-Write is enabled.
visa_df.loc[is_auto_visa, 'visa_requirement'] = "eVisa/Visa on arrival"
visa_df.loc[is_restricted, 'visa_requirement'] = "Travel restricted"

In [99]:
# Re-count the labels to check the result of the category normalization.
visa_df.loc[:, 'visa_requirement'].value_counts()

Visa not required        107
eVisa/Visa on arrival     54
Visa required             30
Travel restricted          2
Name: visa_requirement, dtype: int64

In [87]:
# Preview the first five rows of the cleaned table.
visa_df.head(n=5)

Unnamed: 0,country,visa_requirement,notes
0,Afghanistan,Visa required,Visitors born in Afghanistan do not require a ...
1,Albania,Visa not required,
2,Algeria,Visa required,Persons may be denied entry if entering with a...
3,Andorra,Visa not required,
4,Angola,eVisa/Visa on arrival,Visitors who have been granted an online pre-v...
