## Christian Campbell - Milestone 3

In [1]:
import requests
import re
import ssl
from bs4 import BeautifulSoup
import urllib.request, urllib.parse
from urllib.error import HTTPError, URLError
import socket
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page holding the per-state violent crime rate table scraped in this notebook.
# Named crime_rate_url (not crime_rate) so it is not confused with the parsed table
# that a later cell stores under the name crime_rate.
crime_rate_url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_violent_crime_rate"
# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(crime_rate_url, timeout=30)

In [3]:
def status_check(r):
    """Report whether an HTTP response succeeded.

    Args:
        r: An object exposing a ``status_code`` attribute.

    Returns:
        1 (after printing "success!") when ``r.status_code`` equals 200,
        otherwise -1 (after printing "Failed!").
    """
    succeeded = r.status_code == 200
    print("success!" if succeeded else "Failed!")
    return 1 if succeeded else -1

In [4]:
# Confirm the page request returned HTTP 200 before parsing it (prints the result and returns 1 or -1).
status_check(response)

success!


1

In [5]:
def encoding_check(r):
    """Return the character encoding recorded on response ``r``.

    Args:
        r: An object exposing an ``encoding`` attribute.

    Returns:
        The value of ``r.encoding``, unchanged.
    """
    reported_encoding = r.encoding
    return reported_encoding

# Display the encoding reported for the downloaded page.
encoding_check(response)

'UTF-8'

In [6]:
def decode_content(r, encoding):
    """Decode the raw bytes of a response body into text.

    Args:
        r: An object exposing a bytes ``content`` attribute.
        encoding: Name of the codec to decode with. May be ``None`` or an
            empty string, in which case UTF-8 is used.

    Returns:
        The decoded body as ``str``.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen codec.
        LookupError: If ``encoding`` names an unknown codec.
    """
    # bytes.decode(None) raises TypeError, so fall back to UTF-8 when no
    # encoding was reported for the response.
    return r.content.decode(encoding or "utf-8")

# Decode the downloaded bytes with the encoding reported for the response,
# then parse the resulting HTML text with Python's built-in parser.
page_encoding = encoding_check(response)
contents = decode_content(response, page_encoding)
soup = BeautifulSoup(contents, features="html.parser")

In [7]:
# Grab the first <table> on the page whose class list includes "wikitable".
crime_rate = soup.find("table", class_="wikitable")
crime_rate

<table class="wikitable sortable sticky-header static-row-numbers sort-under col1left" style="text-align:right">
<tbody><tr>
<th>Location</th>
<th>Violent <br/> crime</th>
<th>Homicide</th>
<th>Rape</th>
<th>Robbery</th>
<th><abbr title="Aggravated assault">Ag. <br/> Assault</abbr>
</th></tr>
<tr class="static-row-numbers-norank">
<td><span class="flagicon" style="padding-left:25px;"> </span><b>United States</b></td>
<td>380.7</td>
<td>6.3</td>
<td>40.0</td>
<td>66.1</td>
<td>268.2
</td></tr>
<tr class="static-row-numbers-norank">
<td style="text-align:left"><span class="flagicon" style="display:inline-block;width:25px;text-align:left"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="1000" data-file-width="2000" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/d4/Flag_of_the_District_of_Columbia.svg/23px-Flag_of_the_District_of_Columbia.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/t

In [8]:
# Gather the text of every header cell (<th>) in the table, row by row, in document order.
row_headers = [
    header_cell.text
    for table_row in crime_rate.find_all("tr")
    for header_cell in table_row.find_all("th")
]
row_headers

['Location', 'Violent  crime', 'Homicide', 'Rape', 'Robbery', 'Ag.  Assault\n']

In [9]:
# Build one list of cell texts per table row, skipping the first <tr> (the header row).
table_values = [
    [data_cell.text for data_cell in table_row.find_all("td")]
    for table_row in crime_rate.find_all("tr")[1:]
]
table_values

[['\xa0United States', '380.7', '6.3', '40.0', '66.1', '268.2\n'],
 ['\xa0District of Columbia', '812.3', '29.3', '41.5', '357.5', '383.9\n'],
 ['\xa0New Mexico', '780.5', '12.0', '54.6', '110.6', '603.3\n'],
 ['\xa0Alaska', '758.9', '9.5', '134.0', '75.1', '540.2\n'],
 ['\xa0Arkansas', '645.3', '10.2', '76.0', '39.7', '519.4\n'],
 ['\xa0Louisiana', '628.6', '16.1', '43.0', '67.3', '502.1\n'],
 ['\xa0Tennessee', '621.6', '8.6', '38.2', '67.1', '507.6\n'],
 ['\xa0California', '499.5', '5.7', '37.4', '123.5', '332.8\n'],
 ['\xa0Colorado', '492.5', '6.4', '63.4', '72.6', '350.1\n'],
 ['\xa0South Carolina', '491.3', '11.2', '38.2', '40.6', '401.3\n'],
 ['\xa0Missouri', '488.0', '10.1', '48.9', '54.8', '374.2\n'],
 ['\xa0Michigan', '461.0', '6.9', '64.8', '36.6', '352.7\n'],
 ['\xa0Nevada', '454.0', '6.8', '58.9', '86.1', '302.3\n'],
 ['\xa0Texas', '431.9', '6.7', '50.0', '70.5', '304.7\n'],
 ['\xa0Arizona', '431.5', '6.8', '44.1', '70.1', '310.5\n'],
 ['\xa0New York', '429.3', '4.0', '29.5

In [10]:
# Assemble the scraped rows into a DataFrame, using the scraped header texts as column names.
og_crime_rate = pd.DataFrame(data=table_values, columns=row_headers)
og_crime_rate

Unnamed: 0,Location,Violent crime,Homicide,Rape,Robbery,Ag. Assault\n
0,United States,380.7,6.3,40.0,66.1,268.2\n
1,District of Columbia,812.3,29.3,41.5,357.5,383.9\n
2,New Mexico,780.5,12.0,54.6,110.6,603.3\n
3,Alaska,758.9,9.5,134.0,75.1,540.2\n
4,Arkansas,645.3,10.2,76.0,39.7,519.4\n
5,Louisiana,628.6,16.1,43.0,67.3,502.1\n
6,Tennessee,621.6,8.6,38.2,67.1,507.6\n
7,California,499.5,5.7,37.4,123.5,332.8\n
8,Colorado,492.5,6.4,63.4,72.6,350.1\n
9,South Carolina,491.3,11.2,38.2,40.6,401.3\n


##### 1.

In [11]:
## In this first step, I'll be removing the "\n" text from all the values in the far right column.

In [12]:
# Remove every "\n" (as before) and also trim leading/trailing whitespace from each value.
# The scraped Location values start with a non-breaking space ("\xa0", visible in the
# table_values output); it is invisible when the frame is displayed but would stop
# Location from matching in a later merge. In a str pattern, \s also matches "\xa0".
step1_crime_rate = og_crime_rate.replace(r'\n|^\s+|\s+$', '', regex=True)
step1_crime_rate

Unnamed: 0,Location,Violent crime,Homicide,Rape,Robbery,Ag. Assault\n
0,United States,380.7,6.3,40.0,66.1,268.2
1,District of Columbia,812.3,29.3,41.5,357.5,383.9
2,New Mexico,780.5,12.0,54.6,110.6,603.3
3,Alaska,758.9,9.5,134.0,75.1,540.2
4,Arkansas,645.3,10.2,76.0,39.7,519.4
5,Louisiana,628.6,16.1,43.0,67.3,502.1
6,Tennessee,621.6,8.6,38.2,67.1,507.6
7,California,499.5,5.7,37.4,123.5,332.8
8,Colorado,492.5,6.4,63.4,72.6,350.1
9,South Carolina,491.3,11.2,38.2,40.6,401.3


##### 2.

In [13]:
# In this second step, I'll be removing the \n value from the Ag. Assault header to make it more readable.

In [14]:
# Replace the scraped "Ag.  Assault\n" header (two spaces, trailing newline) with a readable label.
# rename() without inplace returns a new frame, so step1_crime_rate stays exactly as the
# previous cell displayed it and this cell can be re-run safely.
# NOTE(review): "Aggrevated" is a misspelling of "Aggravated"; it is kept because the
# step 5 rename looks the column up by this exact spelling.
step2_crime_rate = step1_crime_rate.rename(columns={"Ag.  Assault\n": "Aggrevated Assault"})
step2_crime_rate

Unnamed: 0,Location,Violent crime,Homicide,Rape,Robbery,Aggrevated Assault
0,United States,380.7,6.3,40.0,66.1,268.2
1,District of Columbia,812.3,29.3,41.5,357.5,383.9
2,New Mexico,780.5,12.0,54.6,110.6,603.3
3,Alaska,758.9,9.5,134.0,75.1,540.2
4,Arkansas,645.3,10.2,76.0,39.7,519.4
5,Louisiana,628.6,16.1,43.0,67.3,502.1
6,Tennessee,621.6,8.6,38.2,67.1,507.6
7,California,499.5,5.7,37.4,123.5,332.8
8,Colorado,492.5,6.4,63.4,72.6,350.1
9,South Carolina,491.3,11.2,38.2,40.6,401.3


##### 3.

In [15]:
# In this third step, I'll be deleting the first row, labelled United States. Since I've chosen to deal with the individual
# states, I don't need that United States row.

In [16]:
# Drop the national summary row by its label rather than by position, so a real state is
# never removed if the source table's row order changes or the summary row is absent.
# strip() is applied for the comparison only, because the scraped names may carry
# surrounding whitespace (e.g. a leading non-breaking space).
is_national_total = step2_crime_rate["Location"].str.strip() == "United States"
step3_crime_rate = step2_crime_rate.loc[~is_national_total].copy()
step3_crime_rate

Unnamed: 0,Location,Violent crime,Homicide,Rape,Robbery,Aggrevated Assault
1,District of Columbia,812.3,29.3,41.5,357.5,383.9
2,New Mexico,780.5,12.0,54.6,110.6,603.3
3,Alaska,758.9,9.5,134.0,75.1,540.2
4,Arkansas,645.3,10.2,76.0,39.7,519.4
5,Louisiana,628.6,16.1,43.0,67.3,502.1
6,Tennessee,621.6,8.6,38.2,67.1,507.6
7,California,499.5,5.7,37.4,123.5,332.8
8,Colorado,492.5,6.4,63.4,72.6,350.1
9,South Carolina,491.3,11.2,38.2,40.6,401.3
10,Missouri,488.0,10.1,48.9,54.8,374.2


##### 4.

In [17]:
# In this fourth step I'll be sorting the Location column in ascending alphabetical order.

In [18]:
# Order the rows A-Z by Location; each row keeps its original index label.
rows_by_location = step3_crime_rate.sort_values(by=["Location"], ascending=True)
step4_crime_rate = rows_by_location.copy()
step4_crime_rate

Unnamed: 0,Location,Violent crime,Homicide,Rape,Robbery,Aggrevated Assault
19,Alabama,409.1,10.9,29.6,34.5,334.1
3,Alaska,758.9,9.5,134.0,75.1,540.2
14,Arizona,431.5,6.8,44.1,70.1,310.5
4,Arkansas,645.3,10.2,76.0,39.7,519.4
7,California,499.5,5.7,37.4,123.5,332.8
8,Colorado,492.5,6.4,63.4,72.6,350.1
49,Connecticut,150.0,3.8,18.1,44.9,83.3
22,Delaware,383.5,4.8,22.0,57.0,299.8
1,District of Columbia,812.3,29.3,41.5,357.5,383.9
39,Florida,258.9,5.0,30.2,33.6,190.1


##### 5.

In [19]:
# For the fifth step, I'm going to add "rate" to each header. This will differentiate the columns in this table
# from the other column headers when I eventually merge my tables in milestone 5.

In [20]:
# Append "rate" to every measure column. "Violent  crime" keeps its two spaces because
# that is how the header was scraped.
# rename() without inplace returns a new frame, so step4_crime_rate stays exactly as the
# previous cell displayed it and this cell can be re-run safely.
rate_column_names = {
    "Violent  crime": "Violent crime rate",
    "Homicide": "Homicide rate",
    "Rape": "Rape rate",
    "Robbery": "Robbery rate",
    "Aggrevated Assault": "Aggrevated Assault rate",
}
final_crime_rate = step4_crime_rate.rename(columns=rate_column_names)
final_crime_rate

Unnamed: 0,Location,Violent crime rate,Homicide rate,Rape rate,Robbery rate,Aggrevated Assault rate
19,Alabama,409.1,10.9,29.6,34.5,334.1
3,Alaska,758.9,9.5,134.0,75.1,540.2
14,Arizona,431.5,6.8,44.1,70.1,310.5
4,Arkansas,645.3,10.2,76.0,39.7,519.4
7,California,499.5,5.7,37.4,123.5,332.8
8,Colorado,492.5,6.4,63.4,72.6,350.1
49,Connecticut,150.0,3.8,18.1,44.9,83.3
22,Delaware,383.5,4.8,22.0,57.0,299.8
1,District of Columbia,812.3,29.3,41.5,357.5,383.9
39,Florida,258.9,5.0,30.2,33.6,190.1


##### Short Paragraph

In [21]:
"""
I'd argue that none of the transformations I performed violate any ethical guidelines or raise any ethical concerns. As was
the case with the previous milestone, the data that was scraped from the website was already well polished, so I struggled to
find things to transform.
"""

"\nI'd argue that none of the transformations I performed violate or raise any ethical implications. As was the case with the \nprevious milestone, the data that was scraped from the website was pretty much well polished. I struggled to find things to\ntransform. \n"

In [22]:
import pandas as pd
import sqlite3 as sql

In [23]:
# Open a connection to the SQLite database file 'website_crime_data'.
conn = sql.connect('website_crime_data')

In [None]:
# Write the cleaned table to SQLite. if_exists="replace" lets this cell be re-run:
# with the default ("fail"), to_sql raises ValueError once the table already exists.
final_crime_rate.to_sql("website_crime_data", conn, index=False, if_exists="replace")

In [None]:
# Close the database connection opened above.
conn.close()

In [None]:
# plotly.express is imported here because no earlier cell in this notebook imports it,
# so px would otherwise be undefined on a fresh kernel.
import plotly.express as px

# Plot from final_crime_rate, the cleaned frame built in this notebook (the name
# web_data is never defined here). The rates were scraped as text, so convert them
# to numbers so the y-axis is numeric rather than categorical.
violent_crime_plot_data = final_crime_rate.assign(
    **{"Violent crime rate": pd.to_numeric(final_crime_rate["Violent crime rate"], errors="coerce")}
)
fig = px.bar(violent_crime_plot_data, x='Location', y='Violent crime rate', title='Violent Crime Rate by Location')
fig.show()