In [1]:
# This code sets up display options, imports, etc.
!pip install matplotlib_inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats

import plotly.express as px
import plotly.io as pio

# set up plotting defaults
# Show matplotlib figures inline in the notebook, rendered as SVG.
%matplotlib inline
set_matplotlib_formats("svg")
# Seaborn style is applied first; the rcParams override below comes after it.
sns.set_style("whitegrid")
# Default matplotlib figure size as (width, height) in inches.
plt.rcParams["figure.figsize"] = (4, 3)
# Default template used for plotly figures.
pio.templates.default = "simple_white"

# display options for numpy and pandas
# numpy: summarize arrays with more than 20 elements, print 2 decimals, and
# suppress scientific notation for small values.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# pandas: keep displayed tables small (at most 7 rows and 8 columns).
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
# Use the fully qualified option name. The bare "precision" pattern matches
# more than one option in recent pandas releases (for example
# "styler.format.precision") and raises OptionError.
pd.set_option("display.precision", 2)

# Lab 1: Pandas and Data Cleaning

**Data Science Bootcamp with Python, Pandas, and Plotly**

Feb 21, 2023

## Working with data tables in `pandas`

First, let's go over how to take notes.

### Subsetting and Filtering: NBA Salaries

In [2]:
import pandas as pd
import numpy as np

Let's load in salaries for NBA players in the '21-'22 season.

In [3]:
def data(csv):
    """Build the raw-download URL for a file in the DS-100 textbook datasets folder.

    `csv` is the file's path relative to `content/datasets/` in the
    DS-100/textbook GitHub repository, e.g. 'nba-2022.csv'.
    """
    datasets_url = 'https://github.com/DS-100/textbook/blob/master/content/datasets'
    return f'{datasets_url}/{csv}?raw=true'

# Show the URL that the NBA salaries file resolves to.
data('nba-2022.csv')

In [133]:
# Load the salary table from the URL built by data(). The original `...`
# placeholder made the set_index call fail, because Ellipsis has no such method.
# The table is indexed by the 'Player' column.
nba = pd.read_csv(data('nba-2022.csv')).set_index('Player')
nba

### What questions can we answer with this data?

- Fill in this cell with your answer.

### Aggregating: Dog Breed Data

- Fill in this cell with your answer.

In [8]:
# TODO: replace the `...` placeholder with code that loads the dog breed data.
# The file to read is not shown anywhere in this notebook.
dogs = ...
dogs

### Putting it all Together: Baby Names

#### Lilith, Lilibet … Lucifer? How Baby Names Went to ‘L’

https://archive.is/NpORG

### What are the claims made in the article?

- Fill in this cell with your answer.

In [10]:
# TODO: replace the `...` placeholder with code that loads the baby names data.
# The file to read is not shown anywhere in this notebook.
baby = ...
baby

In [11]:
# TODO: replace the `...` placeholder. Presumably this is the table of names
# mentioned in the article linked above; confirm the source before filling in.
nyt = ...
nyt

### Recap: Data table manipulation

- Fill in this cell with your notes.

## Data Cleaning

### CO2 Measurements on Mauna Loa

<https://www.esrl.noaa.gov/gmd/ccgg/trends/>

In [12]:
# Download the raw Mauna Loa CO2 file next to the notebook. This is done in
# Python instead of `!wget ... 2> /dev/null` so that a failed download raises
# an error here rather than going unnoticed, and so that no external `wget`
# binary is required.
from urllib.request import urlretrieve

urlretrieve(data('co2_mm_mlo.txt'), 'co2_mm_mlo.txt');

In [13]:
# Peek at the start of the downloaded file with the shell's `head` command.
!head co2_mm_mlo.txt

In [14]:
from pathlib import Path

# Read the whole file as text and break it on newline characters, giving one
# list entry per line of the raw file.
lines = (
    Path('co2_mm_mlo.txt')
    .read_text()
    .split('\n')
)
# Number of entries produced by the split.
len(lines)

In [50]:
lines[:6]

In [16]:
lines[69:79]

In [17]:
...

### Missing Data

In [18]:
...

In [21]:
...

### You Try:

Get all the recordings with missing CO2 data (the rows where the `Avg` column is less than 0).

In [22]:
...

In [65]:
# Don't worry about understanding this code for now, just look at the plots
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One row of three panels, one per way of handling the missing readings.
fig = make_subplots(rows=1, cols=3,
    subplot_titles=("Missing Dropped",
                    "Missing Set to NaN", "Missing Interpolated"))

# Panel 1: keep only rows with a positive 'Avg', then take the first 8 of them.
co2_small = (co2[co2['Avg'] > 0])[:8]

fig.add_trace(
    go.Scatter(x=co2_small['Mo'], y=co2_small['Avg'],
               mode='lines+markers'),
    row=1, col=1)

# Panels 2 and 3: replace the -99.99 values with NaN, then take the first 10 rows.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
co2_na = (co2.replace(-99.99, np.nan))[:10]

# Panel 2 plots the 'Avg' column, which now contains NaN gaps.
fig.add_trace(
    go.Scatter(x=co2_na['Mo'], y=co2_na['Avg'], mode='lines+markers'),
    row=1, col=2)

# Panel 3 plots the 'Int' column over the same rows.
fig.add_trace(
    go.Scatter(x=co2_na['Mo'], y=co2_na['Int'], mode='lines+markers'),
    row=1, col=3)

# Same tick positions (3 through 12) on every panel. With no row/col given,
# update_xaxes applies to all x-axes of the figure.
fig.update_xaxes(tickvals=np.arange(3, 13, 1))

fig.update_layout(height=350, margin=dict(t=30, b=0), showlegend=False)

fig.show()

### Granularity

In [24]:
co2

#### You Try:

Find the average CO2 for each year, then plot the average CO2 vs. year.

In [51]:
...

In [52]:
# TODO: replace the `...` placeholder with the figure for the exercise above.
# Until then the next line fails, because Ellipsis has no update_layout method.
fig = ...
fig.update_layout(showlegend=False)
fig

### Data Cleaning Principles

- Check data quality
- Look for missing values
- Transform data
- Modify structure

## Case Study: Restaurant Safety Violations

https://data.sfgov.org/Health-and-Social-Services/Restaurant-Scores-LIVES-Standard/pyih-qa8i

In [27]:
# Load the three restaurant-inspection tables through the data() URL helper.

# Businesses: this file is read with an explicit ISO-8859-1 encoding.
bus = pd.read_csv(
    data('SF_Restaurant_Inspections/businesses.csv'),
    encoding='ISO-8859-1',
)

# Inspections and violations are read with pandas' default settings.
insp = pd.read_csv(
    data('SF_Restaurant_Inspections/inspections.csv'),
)
viol = pd.read_csv(
    data('SF_Restaurant_Inspections/violations.csv'),
)

In [99]:
bus

### Check Data Quality

#### You Try:

Write Python code to explore the `'postal_code'` column in `bus`. Find at least two issues.

In [98]:
...

In [54]:
...

In [101]:
...

In [105]:
...

### Look for Missing Data

#### You Try:

How many rows in `bus` have missing values for `'postal_code'`?

In [76]:
...

### Transform Data

In [108]:
insp 

https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes

In [121]:
# TODO: replace the `...` placeholder. Presumably this holds the parsed
# inspection dates, given the strptime format-code link above; confirm.
dates = ...
dates

In [128]:
...

#### You Try:

Use `vio2016` to create a dataframe called `num_vios` that has one row per inspection. It should have three columns:

- `business_id`: The business ID.
- `timestamp`: The date of the inspection.
- `num_vio`: The total number of violations the inspection recorded on that date.

The resulting dataframe should have 4806 rows.

In [30]:
...

In [130]:
...

In [131]:
...

In [132]:
...

### Do certain kinds of violations impact safety score more than others?

#### You Try:

Display the top 15 most common violations.

In [53]:
...

In [35]:
from IPython.display import display

# Raise the row limit to 15 for this display only, so the full table of the
# 15 most frequent violation descriptions is shown.
with pd.option_context("display.max_rows", 15):
    display(
        vio2016['description']
        .value_counts()
        .head(15)
        .to_frame()
    )

In [36]:
def make_vio_categories(vio):
    """Flag each violation row with the categories its description matches.

    Returns a new frame with `business_id` and `timestamp` from `vio`, plus
    one column per category. Each category column is the result of a
    case-sensitive regex search on `vio['description']`.
    """
    descriptions = vio['description']

    def matches(pattern):
        # Regex search of every description for `pattern`.
        return descriptions.str.contains(pattern)

    # Insertion order of this dict fixes the order of the new columns.
    category_flags = {
        'high_risk': matches(r"high risk"),
        'clean': matches(r"clean|sanit"),
        # Needs both "surface" and a "food" preceded by a non-word character.
        'food_surface': matches(r"surface") & matches(r"\Wfood"),
        'vermin': matches(r"vermin"),
        'storage': matches(r"thaw|cool|therm|storage"),
        'permit': matches(r"certif|permit"),
        'non_food_surface': matches(r"wall|ceiling|floor|surface"),
        'human': matches(r"hand|glove|hair|nail"),
    }

    return vio[['business_id', 'timestamp']].assign(**category_flags)

In [37]:
# The violations with the new categories.
# `vio_ctg` is short for `violation categories`.
# TODO: replace the `...` placeholder. Presumably this is the result of calling
# make_vio_categories on the 2016 violations; confirm before filling in.
vio_ctg = ...
vio_ctg

In [38]:
# Need to change granularity
# TODO: replace the `...` placeholder. Presumably the result has one row per
# (business_id, timestamp), since left_join_features merges on those two keys;
# confirm before filling in.
vio_counts = ...
vio_counts

In [39]:
# Names of the violation-category columns used as features.
feature_names = [
    'high_risk', 'clean', 'food_surface', 'vermin',
    'storage', 'permit', 'non_food_surface', 'human',
]


def left_join_features(ins):
    """Left-join the notebook-level `vio_counts` table onto the inspections.

    Only `business_id`, `timestamp` and `score` are kept from `ins`. Rows are
    matched on `business_id` and `timestamp`, and every row of `ins` is kept.
    """
    join_keys = ['business_id', 'timestamp']
    scores = ins[join_keys + ['score']]
    return scores.merge(vio_counts, on=join_keys, how='left')


def zero_features_for_perfect_scores(ins):
    """Return a copy of `ins` with the feature columns set to 0 where score is 100.

    The input frame is not modified.
    """
    result = ins.copy()
    is_perfect = result['score'] == 100
    result.loc[is_perfect, feature_names] = 0
    return result

In [40]:
# Join the violation counts onto the 2016 inspections, then zero the feature
# columns for inspections that scored 100.
ins_and_vios = zero_features_for_perfect_scores(
    left_join_features(ins2016)
)
# Preview the first three rows.
ins_and_vios.head(3)

In [41]:
...

In [42]:
...