## Intro to Dataframes

In [18]:
import pandas as pd

In [19]:
# Build a small 3x3 frame with explicit column names (A-C) and row labels (X-Z).
df = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=['A', 'B', 'C'],
    index=['X', 'Y', 'Z'],
)
# A notebook cell only displays its last expression, so a bare
# `df.index.to_list()` placed before this line would be evaluated and thrown
# away. Call `df.index.to_list()` on its own when a plain Python list of the
# row labels is needed; here we display the Index object itself.
df.index

Index(['X', 'Y', 'Z'], dtype='object')

In [20]:
# info() prints a structural summary (index, columns, non-null counts, dtypes, memory).
df.info()
# describe() returns summary statistics for the numeric columns; as the last
# expression in the cell it is rendered as the cell's output table.
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, X to Z
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 96.0+ bytes


Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [21]:
# (number of rows, number of columns)
df.shape

(3, 3)

## Accessing Data with Pandas

In [22]:
# Load the coffee sales data (Day, Coffee Type, Units Sold) from CSV.
# The path is relative to the notebook's working directory.
coffee = pd.read_csv('./warmup-data/coffee.csv')
# Preview the first five rows.
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [23]:
# Load the event results table from a Parquet file.
# NOTE(review): read_parquet needs a Parquet engine (e.g. pyarrow) installed — confirm the environment.
results = pd.read_parquet('./data/results.parquet')
# Preview the first five rows.
results.head()

Unnamed: 0,year,type,discipline,event,as,athlete_id,noc,team,place,tied,medal
0,1912.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,17.0,True,
1,1912.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jean Montariol,,False,
2,1920.0,Summer,Tennis,"Singles, Men (Olympic)",Jean-François Blanchy,1,FRA,,32.0,True,
3,1920.0,Summer,Tennis,"Doubles, Mixed (Olympic)",Jean-François Blanchy,1,FRA,Jeanne Vaussard,8.0,True,
4,1920.0,Summer,Tennis,"Doubles, Men (Olympic)",Jean-François Blanchy,1,FRA,Jacques Brugnon,4.0,False,


In [24]:
# Load the athlete data from an Excel workbook (first sheet by default).
# NOTE(review): reading .xlsx needs an Excel engine (e.g. openpyxl) installed — confirm the environment.
olympics_data = pd.read_excel('./data/olympics-data.xlsx')
# Preview the first five rows.
olympics_data.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


In [25]:
# Draw 10 rows at random; random_state pins the seed so the same rows
# come back every time the cell is re-run.
coffee.sample(10, random_state=1)

Unnamed: 0,Day,Coffee Type,Units Sold
3,Tuesday,Latte,20
7,Thursday,Latte,30
6,Thursday,Espresso,40
2,Tuesday,Espresso,30
10,Saturday,Espresso,45
4,Wednesday,Espresso,35
1,Monday,Latte,15
12,Sunday,Espresso,45
0,Monday,Espresso,25
13,Sunday,Latte,35


In [26]:
# .loc is label-based: fetch the row whose index label is 0, returned as a Series.
coffee.loc[0]

Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object

In [28]:
# Order the rows by units sold, smallest first (ascending is the default).
# Returns a new frame; `coffee` itself is left unchanged.
coffee.sort_values(by="Units Sold")

Unnamed: 0,Day,Coffee Type,Units Sold
1,Monday,Latte,15
3,Tuesday,Latte,20
0,Monday,Espresso,25
5,Wednesday,Latte,25
2,Tuesday,Espresso,30
7,Thursday,Latte,30
4,Wednesday,Espresso,35
9,Friday,Latte,35
13,Sunday,Latte,35
11,Saturday,Latte,35


In [29]:
# iterrows() yields one (index label, row-as-Series) pair per row.
# Print the label and the row on separate lines for each pair.
for index, row in coffee.iterrows():
    print(index, row, sep="\n")

0
Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object
1
Day            Monday
Coffee Type     Latte
Units Sold         15
Name: 1, dtype: object
2
Day             Tuesday
Coffee Type    Espresso
Units Sold           30
Name: 2, dtype: object
3
Day            Tuesday
Coffee Type      Latte
Units Sold          20
Name: 3, dtype: object
4
Day            Wednesday
Coffee Type     Espresso
Units Sold            35
Name: 4, dtype: object
5
Day            Wednesday
Coffee Type        Latte
Units Sold            25
Name: 5, dtype: object
6
Day            Thursday
Coffee Type    Espresso
Units Sold           40
Name: 6, dtype: object
7
Day            Thursday
Coffee Type       Latte
Units Sold           30
Name: 7, dtype: object
8
Day              Friday
Coffee Type    Espresso
Units Sold           45
Name: 8, dtype: object
9
Day            Friday
Coffee Type     Latte
Units Sold         35
Name: 9, dtype: object
10
Day            Saturday
Coffee Type 

## Loading in Dataframe from files

In [31]:
# Load the athlete biographies table from CSV (path relative to the notebook's working directory).
bios = pd.read_csv('./data/bios.csv')

In [None]:
# Export the athlete bios to an Excel workbook.
# A bare `bios.to_excel` only names the method without calling it, so nothing
# is written; the call below performs the export.
# index=False leaves the row index out of the sheet.
# NOTE(review): the output path is a placeholder chosen so no existing data
# file is overwritten — confirm the intended file name.
# Writing .xlsx needs an Excel engine (e.g. openpyxl) installed.
bios.to_excel('./data/bios-export.xlsx', index=False)

## Filtering Data

In [None]:
# Rows where height_cm exceeds 215, keeping only the name and height columns.
bios.loc[
    bios['height_cm'].gt(215),
    ['name', 'height_cm'],
]

In [40]:
# Combine two boolean masks with & (element-wise AND):
# height_cm above 215 and born_country equal to 'USA'.
bios[bios['height_cm'].gt(215) & bios['born_country'].eq('USA')]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,
6722,6755,Shaquille O'Neal,1972-03-06,Newark,New Jersey,USA,United States,216.0,137.0,
6937,6972,David Robinson,1965-08-06,Key West,Florida,USA,United States,216.0,107.0,
123850,126093,Tyson Chandler,1982-10-02,Hanford,California,USA,United States,216.0,107.0,


In [41]:
# Filter by name: keep rows whose name contains "Keith" anywhere in the string.
# na=False treats a missing name as "no match"; without it a NaN in the column
# puts a NaN in the mask and boolean indexing raises instead of filtering.
bios[bios['name'].str.contains("Keith", na=False)]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
1897,1907,Keith Hanlon,1966-09-01,,,,Ireland,,,
3505,3517,Keith Wallace,1961-03-29,Preston,England,GBR,Great Britain,165.0,51.0,1999-12-31
6228,6255,Keith Hartley,1940-10-15,Vancouver,British Columbia,CAN,Canada,200.0,85.0,
8898,8946,Keith Mwila,1966-01-01,,,,Zambia,,,1993-01-09
12053,12118,Keith Hervey,1898-11-03,Fulham,England,GBR,Great Britain,,,1973-02-22
...,...,...,...,...,...,...,...,...,...,...
109900,111105,Keith Cumberpatch,1927-08-25,Christchurch,Canterbury,NZL,New Zealand,,,2013-11-15
115973,117348,Keith Sanderson,1975-02-02,Plymouth,Massachusetts,USA,United States,183.0,95.0,
117676,119195,Duncan Keith,1983-07-16,Winnipeg,Manitoba,CAN,Canada,185.0,88.0,
122121,124176,Keith Ferguson,1979-09-07,Sale,Victoria,AUS,Australia,176.0,78.0,


## Merging & Concatenating Data

## Handling Null Values

## Aggregating Data

#### Groupby function in Pandas

#### Pivot Tables

#### Using datetime with Groupby

## Advanced Functionality

## Advanced Functionality (cont.)
These two libraries didn't actually make it into the final video.


## New Functionality

## What Next???

Check out some of my other tutorials:
- [Cleaning Data w/ Pandas](https://www.youtube.com/live/oad9tVEsfI0?si=qnDOg9BSRFxcP5gZ)
- [Solving 100 Python Pandas Problems](https://youtu.be/i7v2m-ebXB4?si=VSJHnZryqMv8GW54)
- [Real-world Data Analysis Problems w/ Python Pandas](https://youtu.be/eMOA1pPVUc4)

Platforms to Try
- [Stratascratch](https://stratascratch.com/?via=keith)
- [Analyst Builder](https://www.analystbuilder.com/?via=keith)