In [1]:
# This code sets up display options, imports, etc.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats

import plotly.express as px
import plotly.io as pio

# set up plotting defaults
# Render matplotlib figures inline in the notebook, using the SVG format.
%matplotlib inline
set_matplotlib_formats("svg")
# seaborn "whitegrid" style and a small default matplotlib figure size (width, height).
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (4, 3)
# Default template applied to every plotly figure created below.
pio.templates.default = "simple_white"

# display options for numpy and pandas
# numpy: summarize arrays with more than 20 elements, print 2 decimal digits,
# and suppress scientific notation for small values.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# pandas: keep displayed tables compact (at most 7 rows and 8 columns).
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
# Use the fully qualified option key. The abbreviated "precision" is ambiguous on
# newer pandas (it also matches "styler.format.precision") and raises OptionError.
pd.set_option("display.precision", 2)

# Lab 1: Pandas and Data Cleaning

**Data Science Bootcamp with Python, Pandas, and Plotly**

Feb 21, 2023

## Working with data tables in `pandas`

### Subsetting and Filtering: NBA Salaries

In [2]:
import pandas as pd
import numpy as np

The code below loads salaries for NBA players in the 2021–22 season.

In [52]:
def data(csv):
    """Build the raw-download URL for a file in the DS-100 textbook datasets folder.

    `csv` is the file name (or relative path) inside `content/datasets/`;
    the returned string can be passed directly to `pd.read_csv`.
    """
    url_template = 'https://github.com/DS-100/textbook/blob/master/content/datasets/{}?raw=true'
    return url_template.format(csv)

# Show the URL generated for the NBA salaries file.
data('nba-2022.csv')

In [54]:
# Read the NBA salaries CSV from the URL built by `data` into a DataFrame.
salaries = pd.read_csv(data('nba-2022.csv'))
# Display the table (truncated by the pandas display options set at the top).
salaries

### What questions can we answer with this data?

- Fill in this cell with your answer.

### Aggregating: Dog Breed Data

- Fill in this cell with your answer.

In [55]:
# Placeholder: `...` is Python's Ellipsis object — presumably to be replaced
# with code that loads the dog breed data (TODO fill in).
dogs = ...
dogs

### Putting it all Together: Baby Names

#### Lilith, Lilibet … Lucifer? How Baby Names Went to ‘L’

https://archive.is/NpORG

### What are the claims made in the article?

- Fill in this cell with your answer.

In [56]:
# Read babynames.csv from the textbook datasets URL and display it.
baby = pd.read_csv(data('babynames.csv'))
baby

In [57]:
# Read nyt_names.csv from the textbook datasets URL and display it.
# NOTE(review): presumably the names referenced in the NYT article linked above — confirm.
nyt = pd.read_csv(data('nyt_names.csv'))
nyt

### Recap: Data table manipulation

- Fill in this cell with your notes.

## Data Cleaning

### CO2 Measurements on Mauna Loa

Data source: <https://www.esrl.noaa.gov/gmd/ccgg/trends/>

In [48]:
# Download the CO2 data file next to the notebook as "co2_mm_mlo.txt".
# Pure-Python download: it needs no external `wget` binary, and network or HTTP
# errors raise an exception instead of being silently discarded.
from pathlib import Path
from urllib.request import urlopen

with urlopen("https://github.com/DS-100/textbook/blob/master/content/datasets/co2_mm_mlo.txt?raw=true") as resp:
    # Overwrites any existing copy, as `wget --output-document` did.
    Path("co2_mm_mlo.txt").write_bytes(resp.read())

In [49]:
# Preview the beginning of the downloaded file with the shell `head` command.
!head co2_mm_mlo.txt

In [12]:
from pathlib import Path

# Load the whole file as text and break it into a list of lines,
# then report how many entries the list has.
co2_text = Path('co2_mm_mlo.txt').read_text()
lines = co2_text.split('\n')
len(lines)

In [35]:
# First six lines of the raw file.
lines[:6]

In [14]:
# Lines at indices 69-78 — presumably where the descriptive header ends and
# the data rows begin (TODO confirm against the file).
lines[69:79]

In [15]:
# Parse the whitespace-delimited table. The first 72 lines of the file are skipped
# (presumably the descriptive header — verify if the file format changes), and
# column names are supplied explicitly because no header row is read.
# The separator is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
co2 = pd.read_csv('co2_mm_mlo.txt',
                  header=None, skiprows=72, sep=r'\s+',
                  names=['Yr', 'Mo', 'DecDate', 'Avg', 'Int', 'Trend', 'days'])
co2.head()

### Missing Data

In [20]:
# Line plot of the 'Avg' column against the decimal date.
px.line(co2, x='DecDate', y='Avg')

In [22]:
# Distribution of the 'days' column.
px.histogram(co2, x='days')

In [23]:
# 'days' plotted against year, to see how the column varies over time.
px.scatter(co2, x='Yr', y='days')

In [24]:
# Distribution of the 'Avg' column.
px.histogram(co2, x='Avg')

In [28]:
# Rows whose 'Avg' value is negative.
co2.loc[co2['Avg'] < 0]

In [41]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Side-by-side comparison of three treatments of the -99.99 values in 'Avg':
# dropping those rows, replacing them with NaN, and using the 'Int' column instead.
fig = make_subplots(rows=1, cols=3,
    subplot_titles=("Missing Dropped",
                    "Missing Set to NaN", "Missing Interpolated"))

# Panel 1: keep only rows with a positive 'Avg', then take the first 8 of them.
co2_small = (co2[co2['Avg'] > 0])[:8]

fig.add_trace(
    go.Scatter(x=co2_small['Mo'], y=co2_small['Avg'],
               mode='lines+markers'),
    row=1, col=1)

# Panels 2 and 3: replace every -99.99 with NaN and take the first 10 rows.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
co2_na = (co2.replace(-99.99, np.nan))[:10]

fig.add_trace(
    go.Scatter(x=co2_na['Mo'], y=co2_na['Avg'], mode='lines+markers'),
    row=1, col=2)

fig.add_trace(
    go.Scatter(x=co2_na['Mo'], y=co2_na['Int'], mode='lines+markers'),
    row=1, col=3)

# Same month ticks (3 through 12) on every panel; without row/col arguments
# update_xaxes applies to all x-axes of the figure.
fig.update_xaxes(tickvals=np.arange(3, 13, 1))

fig.update_layout(height=350, margin=dict(t=30))

fig.show()

### Granularity

In [42]:
# Display the table again to look at what each row represents.
co2

In [44]:
# Average the 'Int' column within each year and draw the yearly means as a line.
yearly_int = co2.groupby('Yr')['Int'].mean()
fig = px.line(yearly_int)
fig.update_layout(showlegend=False)
fig.show()

### Data Cleaning Principles

- Check data quality
- Look for missing values
- Transform data
- Modify structure

## Case Study: Restaurant Safety Violations

https://data.sfgov.org/Health-and-Social-Services/Restaurant-Scores-LIVES-Standard/pyih-qa8i

In [68]:
# businesses — read with an explicit ISO-8859-1 encoding
# (presumably because the file is not valid UTF-8; TODO confirm)
bus = pd.read_csv(data('SF_Restaurant_Inspections/businesses.csv'), encoding='ISO-8859-1')

# inspections
insp = pd.read_csv(data('SF_Restaurant_Inspections/inspections.csv'))

# violations
viol = pd.read_csv(data('SF_Restaurant_Inspections/violations.csv'))

In [69]:
# Display the businesses table.
bus

In [70]:
# Dates are parsed as year, month, day with no separators.
date_format = '%Y%m%d'

def parse_dates_and_years(df, column='date'):
    """Return a copy of `df` with parsed `timestamp` and `year` columns added.

    `df[column]` is parsed with `date_format`; `timestamp` holds the parsed
    datetimes and `year` their calendar year. The input frame is not modified.
    """
    return df.assign(
        timestamp=pd.to_datetime(df[column], format=date_format),
        year=lambda frame: frame['timestamp'].dt.year,
    )

def subset_2016(df):
    """Return the rows of `df` whose `year` column equals 2016."""
    in_2016 = df['year'] == 2016
    return df.loc[in_2016]

# inspections: add parsed timestamp/year columns, then keep the 2016 rows
ins2016 = subset_2016(parse_dates_and_years(insp))

# violations: same two steps
vio2016 = subset_2016(parse_dates_and_years(viol))

In [71]:
# Count the violation rows recorded for each (business, inspection timestamp) pair;
# the count lands in a column named 'num_vio'.
num_vios = (
    vio2016
    .groupby(['business_id', 'timestamp'])
    .size()
    .reset_index(name='num_vio')
)
num_vios

In [75]:
def left_join_vios(df):
    """Left-join the per-inspection violation counts in `num_vios` onto `df`.

    Rows are matched on `business_id` and `timestamp`; rows of `df` without a
    match are kept and get a missing `num_vio`.
    """
    join_keys = ['business_id', 'timestamp']
    return pd.merge(df, num_vios, how='left', on=join_keys)

def zero_vios_for_perfect_scores(df):
    """Return a copy of `df` where rows with a score of 100 have `num_vio` set to 0.

    The input frame is left untouched.
    """
    result = df.copy()
    is_perfect = result['score'] == 100
    result.loc[is_perfect, 'num_vio'] = 0
    return result

# Attach violation counts to the 2016 inspections, then set the count to 0
# for inspections that scored 100.
ins_with_counts = left_join_vios(ins2016)
ins_and_num_vios = zero_vios_for_perfect_scores(ins_with_counts)
ins_and_num_vios

In [76]:
# How many inspections still have no violation count after the join?
ins_and_num_vios['num_vio'].isna().sum()

In [78]:
# Strip plot of inspection score against the number of violations.
px.strip(ins_and_num_vios, x="num_vio", y="score")

In [83]:
# The 20 most frequent violation descriptions in 2016, as a one-column frame.
vio2016['description'].value_counts().head(20).to_frame()

In [88]:
from IPython.display import display
# Temporarily raise the displayed-row limit to 15 so all of the 15 most
# frequent descriptions are shown; the option reverts when the block exits.
with pd.option_context("display.max_rows", 15):
    display(vio2016['description'].value_counts().head(15).to_frame())

### Do certain kinds of violations impact safety score more than others?

In [90]:
def make_vio_categories(vio):
    """Flag each violation row with keyword-based boolean categories.

    Parameters
    ----------
    vio : DataFrame
        Must contain 'business_id', 'timestamp', and 'description' columns.

    Returns
    -------
    DataFrame
        The 'business_id' and 'timestamp' columns of `vio` plus one boolean
        column per category. A flag is True when the description matches the
        category's regular expression (case-sensitive search).
    """
    def has(term):
        # na=False: a missing description counts as "no match", so every flag
        # column stays strictly boolean and can be summed per inspection.
        return vio['description'].str.contains(term, na=False)

    return vio[['business_id', 'timestamp']].assign(
        high_risk        = has(r"high risk"),
        clean            = has(r"clean|sanit"),
        # "surface" together with "food" preceded by a non-word character
        food_surface     = (has(r"surface") & has(r"\Wfood")),
        vermin           = has(r"vermin"),
        storage          = has(r"thaw|cool|therm|storage"),
        permit           = has(r"certif|permit"),
        non_food_surface = has(r"wall|ceiling|floor|surface"),
        human            = has(r"hand|glove|hair|nail"),
    )

In [91]:
# 2016 violations tagged with the keyword categories
# (`vio_ctg` stands for "violation categories").
vio_ctg = make_vio_categories(vio2016)
vio_ctg

In [92]:
# Sum the category flags per (business, inspection timestamp), giving the
# number of violations of each category for every inspection.
per_inspection = vio_ctg.groupby(['business_id', 'timestamp'])
vio_counts = per_inspection.sum().reset_index()
vio_counts

In [93]:
# Names of the per-inspection violation-category count columns.
feature_names = [
    'high_risk', 'clean', 'food_surface', 'vermin',
    'storage', 'permit', 'non_food_surface', 'human',
]

def left_join_features(ins):
    """Left-join the category counts in `vio_counts` onto the inspection scores.

    Only `business_id`, `timestamp`, and `score` are taken from `ins`;
    inspections with no matching row in `vio_counts` keep missing counts.
    """
    join_keys = ['business_id', 'timestamp']
    scores = ins[join_keys + ['score']]
    return scores.merge(vio_counts, how='left', on=join_keys)

def zero_features_for_perfect_scores(ins):
    """Return a copy of `ins` with every feature column set to 0 where score is 100."""
    result = ins.copy()
    is_perfect = result['score'] == 100
    result.loc[is_perfect, feature_names] = 0
    return result

In [94]:
# Join the category counts onto the 2016 inspections, zero them out for
# inspections that scored 100, and preview the first three rows.
ins_with_features = left_join_features(ins2016)
ins_and_vios = zero_features_for_perfect_scores(ins_with_features)
ins_and_vios.head(3)

In [95]:
# Score distribution for each count of 'high_risk' violations.
px.box(ins_and_vios, x='high_risk', y='score')

In [96]:
# Score distribution for each count of 'vermin' violations.
px.box(ins_and_vios, x='vermin', y='score')