# Advanced Pandas

In [None]:
# known import statements
import pandas as pd
import sqlite3
import os

# new import statement
import numpy as np

In [None]:
# Get the Piazza data from 'piazza.db'

# File name of the SQLite database, looked up relative to the current working directory.
db_name = "piazza.db"
# Fail fast when the file is absent: sqlite3.connect() would otherwise create a new, empty database.
assert os.path.exists(db_name)
# Connection shared by every query in this notebook; it is closed in the last cell.
conn = sqlite3.connect(db_name)

def qry(sql):
    """Run the SQL text `sql` on the notebook-wide `conn` and return the rows as a DataFrame."""
    result = pd.read_sql(sql, conn)
    return result

# List the tables recorded in SQLite's schema catalog, then print the
# CREATE statement stored for the first one.
df = qry("""
    SELECT *
    FROM sqlite_master
    WHERE type='table'
""")
print(df["sql"].iloc[0])

In [None]:
# Load every row of the piazza table through the qry helper defined above,
# then preview the first five rows.
piazza_df = qry("""
    SELECT *
    FROM piazza
""")
piazza_df.head()

In [None]:
# Warmup 1: Set the student id column as the index


In [None]:
# Warmup 2a: Which 10 students post the most?


In [None]:
# Warmup 2b: Can you plot their number of posts as a bar graph? Be sure to label your axes!


In [None]:
# Warmup 2c: How about with their name rather than their student id?


In [None]:
# Warmup 3a: Which people had more than 10 answers? Include all roles.


In [None]:
# Warmup 3b: Plot this as a bar graph.


In [None]:
# Warmup 3c: Plot the contributions as a bar graph.


In [None]:
# Warmup 3d: Can you get this same data using SQL?
qry("""

""")

In [None]:
# Warmup 3e: What about their average # of days online as well?
qry("""

""")

In [None]:
# Warmup 3f: Can we do that in Pandas as well?


# Today's Learning Objectives: 

* Setting column as index for pandas `DataFrame`
* Identify, drop, or fill missing values (`np.nan`) using Pandas `isna`, `dropna`, and `fillna`
* Applying transformations to `DataFrame`:
  * Use `apply` on pandas `Series` to apply a transformation function
  * Use `replace` to replace all target values in Pandas `Series` and `DataFrame` rows / columns
* Filter, aggregate, group, and summarize information in a `DataFrame` with `groupby`
* Convert .groupby examples to SQL
* Solving the same question using SQL and pandas `DataFrame` manipulations:
  * filtering, grouping, and aggregation / summarization

In [None]:
# Sort by name... What do we notice?


### Not a Number

- `np.nan` is the floating point representation of Not a Number (the older `np.NaN` spelling was removed in NumPy 2.0)
- You do not need to know / learn the details about the `numpy` package 

### Replacing / modifying values within the `DataFrame`

Syntax: `df.replace(<TARGET>, <REPLACE>)`

Let's now replace the missing values (empty strings) with `np.nan`.

In [None]:
# Let's replace these empty strings with a special value.
# An empty string is still a valid string, so pandas does not treat it as missing;
# converting it to np.nan lets isna / dropna / fillna detect it.
piazza_df = piazza_df.replace("", np.nan)
piazza_df

In [None]:
# Sort by name again... What do we notice?


### Checking for missing values

Syntax: `Series.isna()`
- Returns a boolean Series

In [None]:
# Run isna() on the name column


In [None]:
# How many people are missing a name?


In [None]:
# How many people are missing an email?


In [None]:
# How many people are missing both a name and email?


In [None]:
# How many people are missing either a name or email?


In [None]:
# So... What do we do?
#  1. Drop those rows
#  2. Interpolate / Best Guess

In [None]:
# Option 1: Drop those rows.


In [None]:
# Option 2a: Interpolate / Best Guess


In [None]:
# Create a function to take an email (e.g. "calm_star@wisc.edu")
# and return the name (e.g. "calm star")
def parse_name_from_email(email):
    """Guess a person's name from the part of their email before the "@".

    Parameters:
        email: an address such as "calm_star@wisc.edu", or a missing value.

    Returns:
        The text before the first "@" with underscores turned into spaces
        (e.g. "calm star"), or np.nan when `email` is missing.
    """
    if pd.isna(email):
        return np.nan
    # Everything before the "@" is the username; underscores separate the name parts.
    username = email.split("@")[0]
    return username.replace("_", " ")

# Test your function!
parse_name_from_email("calm_star@wisc.edu")

### Review: `Pandas.Series.apply(...)`
Syntax: `Series.apply(<FUNCTION OBJECT REFERENCE>)`
- applies input function to every element of the Series.
- Returns a new `Series`

In [None]:
# Now, apply that function to each value in email!
# Pass the function object itself (no parentheses); apply calls it once per email.
piazza_df["guessed_name"] = piazza_df["email"].apply(parse_name_from_email)
piazza_df

In [None]:
# Create a function to take a name (e.g. "calm star")
# and return the email (e.g. "calm_star@wisc.edu")
def parse_email_from_name(name):
    """Guess a person's email address from their name.

    Parameters:
        name: a name such as "calm star", or a missing value.

    Returns:
        The name with spaces turned into underscores followed by "@wisc.edu"
        (e.g. "calm_star@wisc.edu"), or np.nan when `name` is missing.
    """
    # The name column contains missing values, so guard before calling str methods.
    if pd.isna(name):
        return np.nan
    return name.replace(" ", "_") + "@wisc.edu"

# Test your function!
parse_email_from_name("calm star")

In [None]:
# Now, apply that function to each value in name!
# Pass the function object itself (no parentheses); apply calls it once per name.
piazza_df["guessed_email"] = piazza_df["name"].apply(parse_email_from_name)
piazza_df

### `Pandas.DataFrame.apply(...)`
Syntax: `DataFrame.apply(<FUNCTION OBJECT REFERENCE>, axis=1)`
- `axis=1` means apply to each row.
- returns a new `Series`

In [None]:
# Keep the recorded name when there is one; fall back to the guess only when it is missing.
piazza_df["name"] = piazza_df.apply(
    lambda row: row["name"] if not pd.isna(row["name"]) else row["guessed_name"],
    axis=1,
)

In [None]:
# Keep the recorded email when there is one; fall back to the guess only when it is missing.
piazza_df["email"] = piazza_df.apply(
    lambda row: row["email"] if not pd.isna(row["email"]) else row["guessed_email"],
    axis=1,
)

In [None]:
# The guesses have been merged into name / email, so remove both helper columns in one call.
piazza_df = piazza_df.drop(columns=["guessed_name", "guessed_email"])

In [None]:
# How many rows are missing data now?
# dropna() keeps only the complete rows, so the rows that are missing data are
# the total row count minus the complete row count.
len(piazza_df) - len(piazza_df.dropna())

In [None]:
# Give a name of "anonymous" and email of "anonymous@wisc.edu"
# to anyone left with missing data.


### `Pandas.DataFrame.groupby(...)`

Syntax: `DataFrame.groupby(<COLUMN>)`
- Returns a `DataFrameGroupBy` object
- Need to apply aggregation functions to use the return value of `groupby`

In [None]:
# What does this return?
piazza_df.groupby("role")

In [None]:
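# Try getting the "mean" of this groupby object.
# Hint: in pandas 2.0 and later, pass numeric_only=True so that text columns
# (such as name and email) are skipped instead of raising a TypeError.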


In [None]:
# How many answers does the average instructor, student, and TA give?


In [None]:
# How would we write this in SQL?
qry("""

""")

In [None]:
# What is the total number of days spent online for instructors, students, and TAs?
# Order your answer from lowest to highest


In [None]:
# How would we write this in SQL?
qry("""

""")

In [None]:
# Of those individuals who spend less than 100 days online,
# how does their average number of posts compare to those that
# spend 100 days or more online? Do your analysis by role as well.


In [None]:
# How would we write this in SQL?
qry("""

""")

In [None]:
# What percentage of instructors, students, and TAs did not write a single answer,
# followup, or reply to a followup?


In [None]:
# How would we write this in SQL?
qry("""

""")

In [None]:
conn.close()