In [48]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator so the generated grades are the same
# on every run.
np.random.seed(123)

students = [
    'Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John',
    'Thomas', 'Marie', 'Albert', 'Richard', 'Isaac', 'Alan',
]

# Draw one array of random integer scores per subject, in the order
# math -> english -> reading. `high` is exclusive, so scores fall in 60..99.
# Every column handed to the DataFrame must have the same length, hence one
# score per student.
math_grades, english_grades, reading_grades = (
    np.random.randint(low=60, high=100, size=len(students))
    for _ in range(3)
)

# One row per student, one column per subject.
df = pd.DataFrame(
    dict(
        name=students,
        math=math_grades,
        english=english_grades,
        reading=reading_grades,
    )
)

In [49]:
# A DataFrame is composed of Series: each column is one Series.
# The DataFrame is more than a list of Series, though -- it layers its own
# functionality on top.
# A bare `df` as the last line of a cell displays the frame.
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [10]:
# df.column_name returns that specific column as a Series.
# Attribute access only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute or
# method; df["column_name"] always works.
df.name

0       Sally
1        Jane
2       Suzie
3       Billy
4         Ada
5        John
6      Thomas
7       Marie
8      Albert
9     Richard
10      Isaac
11       Alan
Name: name, dtype: object

In [11]:
# The DataFrame itself has its own methods and functionality.
# .info() prints the index range, each column's non-null count and dtype,
# and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [13]:
# .dtypes lists the data type of each column
df.dtypes

name       object
math        int64
english     int64
reading     int64
dtype: object

In [14]:
# .columns holds the column labels
df.columns

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [15]:
# .index holds the row labels (a RangeIndex here)
df.index

RangeIndex(start=0, stop=12, step=1)

In [19]:
# .describe provides summary stats on our numeric columns:
# count, mean, std, min, the 25%/50%/75% quartiles, and max
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


## Why DataFrames?
- Rectangular data consisting of columns and rows
- Dataframes enjoy functionality above/beyond series
- Dataframes are a container for series


In [50]:
# Display the dataframe again before working with its columns
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [51]:
# Two ways to access a specific column:
#   df.column_name
#   df["column_name"]
# .head() keeps the output short by showing only the first 5 rows
df.math.head()

0    62
1    88
2    94
3    98
4    77
Name: math, dtype: int64

In [52]:
# Bracket syntax returns the same Series as df.math
df["math"].head()

0    62
1    88
2    94
3    98
4    77
Name: math, dtype: int64

In [53]:
# Making a new column from scratch requires the ["column_name"] syntax:
# assigning to df.cohort would set a plain attribute instead of adding a column.
# A single scalar value is repeated for every row.
df["cohort"] = "Jemison"
df

Unnamed: 0,name,math,english,reading,cohort
0,Sally,62,85,80,Jemison
1,Jane,88,79,67,Jemison
2,Suzie,94,74,95,Jemison
3,Billy,98,96,88,Jemison
4,Ada,77,92,98,Jemison
5,John,79,76,93,Jemison
6,Thomas,82,64,81,Jemison
7,Marie,93,63,90,Jemison
8,Albert,92,62,87,Jemison
9,Richard,69,80,94,Jemison


In [57]:
# To make a new column:
# df["column_name"] = value
# A scalar value is repeated for every row.
df["campus"] = "San Antonio"

In [59]:
# Show the students who are making As in math
# df.math >= 90 compares every row and returns a Series of booleans
df.math >= 90

0     False
1     False
2      True
3      True
4     False
5     False
6     False
7      True
8      True
9     False
10     True
11     True
Name: math, dtype: bool

## Once you have your array/Series of booleans, you have the keys to the castle
- we can filter our results
- `df[df.column > 90]`

In [60]:
# SQL equivalent: SELECT * FROM df WHERE math >= 90
# The variable or expression that produces the array of booleans
# goes inside the brackets; only the rows marked True are kept
df[df.math >= 90]

Unnamed: 0,name,math,english,reading,cohort,campus
2,Suzie,94,74,95,Jemison,San Antonio
3,Billy,98,96,88,Jemison,San Antonio
7,Marie,93,63,90,Jemison,San Antonio
8,Albert,92,62,87,Jemison,San Antonio
10,Isaac,92,99,93,Jemison,San Antonio
11,Alan,92,62,72,Jemison,San Antonio


In [61]:
# What if we need to store math >= 90 as its own column?
# Let's build a new boolean column named math_honors
df["math_honors"] = df.math >= 90
df

Unnamed: 0,name,math,english,reading,cohort,campus,math_honors
0,Sally,62,85,80,Jemison,San Antonio,False
1,Jane,88,79,67,Jemison,San Antonio,False
2,Suzie,94,74,95,Jemison,San Antonio,True
3,Billy,98,96,88,Jemison,San Antonio,True
4,Ada,77,92,98,Jemison,San Antonio,False
5,John,79,76,93,Jemison,San Antonio,False
6,Thomas,82,64,81,Jemison,San Antonio,False
7,Marie,93,63,90,Jemison,San Antonio,True
8,Albert,92,62,87,Jemison,San Antonio,True
9,Richard,69,80,94,Jemison,San Antonio,False


In [62]:
# Same pattern for english: True where the english grade is 90 or above
df["english_honors"] = df.english >= 90
df

Unnamed: 0,name,math,english,reading,cohort,campus,math_honors,english_honors
0,Sally,62,85,80,Jemison,San Antonio,False,False
1,Jane,88,79,67,Jemison,San Antonio,False,False
2,Suzie,94,74,95,Jemison,San Antonio,True,False
3,Billy,98,96,88,Jemison,San Antonio,True,True
4,Ada,77,92,98,Jemison,San Antonio,False,True
5,John,79,76,93,Jemison,San Antonio,False,False
6,Thomas,82,64,81,Jemison,San Antonio,False,False
7,Marie,93,63,90,Jemison,San Antonio,True,False
8,Albert,92,62,87,Jemison,San Antonio,True,False
9,Richard,69,80,94,Jemison,San Antonio,False,False


In [64]:
# Same pattern for reading: True where the reading grade is 90 or above
df["reading_honors"] = df.reading >= 90
df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math_honors,english_honors,reading_honors
0,Sally,62,85,80,Jemison,San Antonio,False,False,False
1,Jane,88,79,67,Jemison,San Antonio,False,False,False
2,Suzie,94,74,95,Jemison,San Antonio,True,False,True
3,Billy,98,96,88,Jemison,San Antonio,True,True,False
4,Ada,77,92,98,Jemison,San Antonio,False,True,True


In [66]:
# What about creating an all_honors column?
# & combines the boolean columns row by row, so all_honors is True only
# where all three honors columns are True
df["all_honors"] = df.english_honors & df.math_honors & df.reading_honors
df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math_honors,english_honors,reading_honors,all_honors
0,Sally,62,85,80,Jemison,San Antonio,False,False,False,False
1,Jane,88,79,67,Jemison,San Antonio,False,False,False,False
2,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False
3,Billy,98,96,88,Jemison,San Antonio,True,True,False,False
4,Ada,77,92,98,Jemison,San Antonio,False,True,True,False


In [69]:
# Another way to produce our "all_honors" column: compare the grades directly.
# Each comparison needs its own parentheses because & binds tighter than >=
df["all_honors"] = (df.english >= 90) & (df.math >= 90) & (df.reading >= 90)
df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math_honors,english_honors,reading_honors,all_honors
0,Sally,62,85,80,Jemison,San Antonio,False,False,False,False
1,Jane,88,79,67,Jemison,San Antonio,False,False,False,False
2,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False
3,Billy,98,96,88,Jemison,San Antonio,True,True,False,False
4,Ada,77,92,98,Jemison,San Antonio,False,True,True,False


In [70]:
# Arithmetic between columns is element-wise: add the three grades for each
# row, then divide by 3 to get that student's average
df["overall_average"] = (df.math + df.english + df.reading) / 3
df.head()

Unnamed: 0,name,math,english,reading,cohort,campus,math_honors,english_honors,reading_honors,all_honors,overall_average
0,Sally,62,85,80,Jemison,San Antonio,False,False,False,False,75.666667
1,Jane,88,79,67,Jemison,San Antonio,False,False,False,False,78.0
2,Suzie,94,74,95,Jemison,San Antonio,True,False,True,False,87.666667
3,Billy,98,96,88,Jemison,San Antonio,True,True,False,False,94.0
4,Ada,77,92,98,Jemison,San Antonio,False,True,True,False,89.0


In [72]:
# To view only a specific set of columns, index the df with a list of names.
# The columns come back in the order given in the list.
columns = ["name", "math", "reading", "english"]
df[columns].head()

Unnamed: 0,name,math,reading,english
0,Sally,62,80,85
1,Jane,88,67,79
2,Suzie,94,95,74
3,Billy,98,88,96
4,Ada,77,98,92


In [73]:
# Double brackets select the same columns as above without a variable:
# the outer brackets index the df, the inner brackets are the list of names
df[["name", "math", "reading", "english"]]

Unnamed: 0,name,math,reading,english
0,Sally,62,80,85
1,Jane,88,67,79
2,Suzie,94,95,74
3,Billy,98,88,96
4,Ada,77,98,92
5,John,79,93,76
6,Thomas,82,81,64
7,Marie,93,90,63
8,Albert,92,87,62
9,Richard,69,94,80


In [77]:
# How to drop columns
cols_to_drop = ["campus", "cohort"]

# drop returns a new dataframe and leaves the original untouched, so we
# reassign df to keep the result.
# errors="ignore" skips labels that are already gone, so re-running this cell
# after the columns were dropped does not raise a KeyError.
df = df.drop(columns=cols_to_drop, errors="ignore")
df.head()

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
0,Sally,62,85,80,False,False,False,False,75.666667
1,Jane,88,79,67,False,False,False,False,78.0
2,Suzie,94,74,95,True,False,True,False,87.666667
3,Billy,98,96,88,True,True,False,False,94.0
4,Ada,77,92,98,False,True,True,False,89.0


In [79]:
# .tail peeks at the end of the df (the last 5 rows by default)
df.tail()

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
7,Marie,93,63,90,True,False,True,False,82.0
8,Albert,92,62,87,True,False,False,False,80.333333
9,Richard,69,80,94,False,False,True,False,81.0
10,Isaac,92,99,93,True,True,True,True,94.666667
11,Alan,92,62,72,True,False,False,False,75.333333


In [81]:
# .sample returns randomly chosen rows of the df -- here 5 of them
df.sample(5)

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
9,Richard,69,80,94,False,False,True,False,81.0
5,John,79,76,93,False,False,True,False,82.666667
1,Jane,88,79,67,False,False,False,False,78.0
7,Marie,93,63,90,True,False,True,False,82.0
8,Albert,92,62,87,True,False,False,False,80.333333


In [82]:
# If you have a series/array of booleans, you can filter your results.
# df.name == "Jane" is True only on rows whose name is exactly "Jane"
df[df.name == "Jane"]

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
1,Jane,88,79,67,False,False,False,False,78.0


In [83]:
# | is an element-wise OR: keep rows where the name is "Jane" or "Marie".
# Each comparison is parenthesized because | binds tighter than ==
df[(df.name == "Jane") | (df.name == "Marie")]

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
1,Jane,88,79,67,False,False,False,False,78.0
7,Marie,93,63,90,True,False,True,False,82.0


In [84]:
# The | or & operators here return a series of booleans,
# one value per row of the df
(df.name == "Jane") | (df.name == "Marie")

0     False
1      True
2     False
3     False
4     False
5     False
6     False
7      True
8     False
9     False
10    False
11    False
Name: name, dtype: bool

## ORs and ANDs in programming
- ANDs limit our possibilities:
    - allergic to peanuts and shellfish and dairy and bell peppers
    - you limit your results
- ORs expand your possibilities
    - I'm good with pizza or pasta or salad or curry or sandwich or.....

In [85]:
# Only show folks who are passing english and math
# (passing here means a grade of 70 or higher in both subjects)
df[(df.math >= 70) & (df.english >= 70)]

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
1,Jane,88,79,67,False,False,False,False,78.0
2,Suzie,94,74,95,True,False,True,False,87.666667
3,Billy,98,96,88,True,True,False,False,94.0
4,Ada,77,92,98,False,True,True,False,89.0
5,John,79,76,93,False,False,True,False,82.666667
10,Isaac,92,99,93,True,True,True,True,94.666667


In [87]:
# With an OR, it only takes one true to make the entire expression true,
# so a student passing either subject is kept
df[(df.math >= 70) | (df.english >= 70)]

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
0,Sally,62,85,80,False,False,False,False,75.666667
1,Jane,88,79,67,False,False,False,False,78.0
2,Suzie,94,74,95,True,False,True,False,87.666667
3,Billy,98,96,88,True,True,False,False,94.0
4,Ada,77,92,98,False,True,True,False,89.0
5,John,79,76,93,False,False,True,False,82.666667
6,Thomas,82,64,81,False,False,False,False,75.666667
7,Marie,93,63,90,True,False,True,False,82.0
8,Albert,92,62,87,True,False,False,False,80.333333
9,Richard,69,80,94,False,False,True,False,81.0


In [89]:
# The code or variable that produces the series of booleans goes in the square braces
# SQL-style pseudocode: SELECT * FROM df WHERE name starts with "S"
# The .str accessor applies the string method to every value in the column
df[df.name.str.startswith("S")]

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
0,Sally,62,85,80,False,False,False,False,75.666667
2,Suzie,94,74,95,True,False,True,False,87.666667


In [94]:
# Sorting: sort_values returns a sorted copy; df itself is not reordered
df.sort_values(by="english").head() # defaults to ascending order

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
8,Albert,92,62,87,True,False,False,False,80.333333
11,Alan,92,62,72,True,False,False,False,75.333333
7,Marie,93,63,90,True,False,True,False,82.0
6,Thomas,82,64,81,False,False,False,False,75.666667
2,Suzie,94,74,95,True,False,True,False,87.666667


In [96]:
# ascending=False sorts from the highest english grade to the lowest
df.sort_values(by="english", ascending=False).head()

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
10,Isaac,92,99,93,True,True,True,True,94.666667
3,Billy,98,96,88,True,True,False,False,94.0
4,Ada,77,92,98,False,True,True,False,89.0
0,Sally,62,85,80,False,False,False,False,75.666667
9,Richard,69,80,94,False,False,True,False,81.0


In [100]:
# Sorting by multiple columns: rows are ordered by english first, ties are
# broken by math, then by reading.
# Reassigning to df keeps the sorted order for the cells that follow.
df = df.sort_values(by=["english", "math", "reading"])
df

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
11,Alan,92,62,72,True,False,False,False,75.333333
8,Albert,92,62,87,True,False,False,False,80.333333
7,Marie,93,63,90,True,False,True,False,82.0
6,Thomas,82,64,81,False,False,False,False,75.666667
2,Suzie,94,74,95,True,False,True,False,87.666667
5,John,79,76,93,False,False,True,False,82.666667
1,Jane,88,79,67,False,False,False,False,78.0
9,Richard,69,80,94,False,False,True,False,81.0
0,Sally,62,85,80,False,False,False,False,75.666667
4,Ada,77,92,98,False,True,True,False,89.0


## Chaining methods on a dataframe
- Chaining isn't new: think of string method chaining
- As long as a method returns a dataframe, you can attach another dataframe method to it

In [102]:
# String methods chain left to right: each call works on the string returned
# by the previous one ("bANANA" -> "Banana" -> "banana" -> "BANANA")
"bANANA".swapcase().lower().swapcase()

'BANANA'

In [106]:
# If a method returns a df, we can attach another df method onto that.
# Each step works on the previous step's result: the second sort_values
# re-sorts by name (names are unique here, so the final order is by name
# alone), head(10) keeps the first 10 rows and head(5) the first 5 of those.
df.sort_values(by="all_honors").sort_values(by="name").head(10).head(5)

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
4,Ada,77,92,98,False,True,True,False,89.0
11,Alan,92,62,72,True,False,False,False,75.333333
8,Albert,92,62,87,True,False,False,False,80.333333
3,Billy,98,96,88,True,True,False,False,94.0
10,Isaac,92,99,93,True,True,True,True,94.666667


In [None]:
# Chained methods execute from left to right
# Nested function calls execute from the inside out (right to left)

In [110]:
# The chain runs left to right: select two columns, sort by english
# descending, keep the top 10 rows, take the english column -- and only then
# is the result compared with 90, giving a boolean Series
df[["name", "english"]].sort_values(by="english", ascending=False).head(10).english >= 90

10     True
3      True
4      True
0     False
9     False
1     False
5     False
2     False
6     False
7     False
Name: english, dtype: bool

In [111]:
# The chained expressions did not modify df: it still has the order from the
# multi-column sort it was reassigned to
df.head()

Unnamed: 0,name,math,english,reading,math_honors,english_honors,reading_honors,all_honors,overall_average
11,Alan,92,62,72,True,False,False,False,75.333333
8,Albert,92,62,87,True,False,False,False,80.333333
7,Marie,93,63,90,True,False,True,False,82.0
6,Thomas,82,64,81,False,False,False,False,75.666667
2,Suzie,94,74,95,True,False,True,False,87.666667


In [116]:
# Renaming columns using a dictionary of {old_name: new_name}:
# every subject column gets a "_grade" suffix.
cols_to_rename = {
    subject: subject + "_grade"
    for subject in ("math", "reading", "english")
}
cols_to_rename

{'math': 'math_grade', 'reading': 'reading_grade', 'english': 'english_grade'}

In [117]:
# df.rename(columns={old_name: new_name, ...})
# The dictionary maps each column to rename onto its new name.
# rename returns a new dataframe, so we reassign df to keep the change.
df = df.rename(columns=cols_to_rename)
df

Unnamed: 0,name,math_grade,english_grade,reading_grade,math_honors,english_honors,reading_honors,all_honors,overall_average
11,Alan,92,62,72,True,False,False,False,75.333333
8,Albert,92,62,87,True,False,False,False,80.333333
7,Marie,93,63,90,True,False,True,False,82.0
6,Thomas,82,64,81,False,False,False,False,75.666667
2,Suzie,94,74,95,True,False,True,False,87.666667
5,John,79,76,93,False,False,True,False,82.666667
1,Jane,88,79,67,False,False,False,False,78.0
9,Richard,69,80,94,False,False,True,False,81.0
0,Sally,62,85,80,False,False,False,False,75.666667
4,Ada,77,92,98,False,True,True,False,89.0


In [121]:
# Passing a dict of {column name: list of values} to pd.DataFrame builds a DataFrame
type(pd.DataFrame({"name": students}))

pandas.core.frame.DataFrame