Introduction to Data Frames

In [7]:
import pandas as pd

# Small 3x3 demo frame: rows are labelled X/Y/Z and columns A/B/C.
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [8, 9, 10]],
    index=["X", "Y", "Z"],
    columns=["A", "B", "C"],
)

In [85]:
# Quick-inspection cheat sheet (uncomment one line at a time to try it):
# df.head(2)        -> first 2 rows
# df.tail(2)        -> last 2 rows
# df.columns        -> column labels (headers)
# df.index          -> row labels
# df.info()         -> per-column dtypes, non-null counts and memory usage
# df["A"].unique()  -> unique values in column A (unique() is a Series method; DataFrame has no .unique)
# df.shape          -> (rows, columns) tuple; it is an attribute, so no parentheses
# df.size           -> total number of cells (rows * columns); also an attribute
df.describe()  # Summary statistics per column: count, mean, std, min, quartiles (50% is the median), max

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.333333,5.333333,6.333333
std,3.511885,3.511885,3.511885
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,6.0,7.0,8.0
max,8.0,9.0,10.0


Loading Frames from Files

In [9]:
# Load the example datasets; each pandas reader corresponds to one file format.
# CSV files
coffee = pd.read_csv("./warmup-data/coffee.csv")
bios = pd.read_csv("./data/bios.csv")
# Parquet file
results = pd.read_parquet("./data/results.parquet")
# Excel workbook
olympics_data = pd.read_excel(
    "./data/olympics-data.xlsx",
    sheet_name="results",  # selects which worksheet of the workbook to load
)
# The matching writers are DataFrame.to_csv / to_excel / to_parquet
# (there is no .to_xlsx; to_excel is the one that writes .xlsx files).

Accessing Data with Pandas

In [82]:
# Only the LAST expression of a cell is rendered as its output; the earlier bare
# expressions below are evaluated and then discarded. Wrap them in display(...)
# if they should be shown as well.
coffee.head(10)
coffee  # A bare frame is rendered only when it is the last line of a cell; display(coffee) works anywhere
coffee.sample(10)  # 10 randomly chosen rows; pass random_state=... for a repeatable draw
# coffee.loc[row_label, column_label]  -> loc is label-based
# coffee.loc[0:3]  -> rows labelled 0 through 3; label slices include BOTH endpoints
# coffee.index = coffee.Day  -> replaces the row labels with the values of the Day column
# NOTE(review): the saved output of this cell shows Day as the index, so it appears to
# have been produced after that assignment was run - confirm before relying on it.
coffee.iloc[:, [0,2]]  # iloc is position-based: all rows, columns at positions 0 and 2

Unnamed: 0_level_0,Day,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Monday,Monday,25
Monday,Monday,15
Tuesday,Tuesday,30
Tuesday,Tuesday,20
Wednesday,Wednesday,35
Wednesday,Wednesday,25
Thursday,Thursday,40
Thursday,Thursday,30
Friday,Friday,45
Friday,Friday,35


In [None]:
# coffee.at[0, "Units Sold"]  -> label-based lookup of a single cell (row label, column label)
coffee.iat[0, 1]  # Position-based lookup of a single cell: row position 0, column position 1
# at / iat are the fast accessors for reading one scalar value; use loc / iloc for slices

'Espresso'

In [None]:
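#coffee.Day returns the Day column; attribute access only works when the column name is a valid Python identifier (no spaces)
#coffee["Units Sold"] returns the Units Sold column; bracket access works for any column name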

In [None]:
# Single-key sort, largest first: coffee.sort_values("Units Sold", ascending=False)
# (ascending=True, the default, gives smallest first.)
# Multi-key sort: order by Units Sold, then break ties by Coffee Type.
coffee.sort_values(by=["Units Sold", "Coffee Type"])


In [None]:
# Row-by-row iteration with iterrows() is shown for completeness only:
# it is not recommended; prefer column-wise (vectorised) operations where possible.
for index, row in coffee.iterrows():
    # One call emits the row label, the row itself, then blank separator lines.
    print(index, row, "\n\n", sep="\n")

Filtering Data

In [None]:
# Boolean-mask filtering: build a True/False value per row and keep the True rows.
# Only the last expression of the cell is rendered as its output.
# Rows where height exceeds 215 cm AND the birth country is USA.
bios.loc[bios["height_cm"].gt(215) & bios["born_country"].eq("USA")]
# Case-insensitive regular-expression match: names containing "Keith" or "Patrick".
# NOTE(review): str.contains yields NaN for missing names; pass na=False if the column can hold nulls.
bios.loc[bios["name"].str.contains("Keith|Patrick", case=False)]
# Membership test combined with a prefix match.
# startswith("Michael") also matches longer first names such as "Michaela".
bios.loc[bios["born_country"].isin(["USA", "FRA", "GBR"]) & bios["name"].str.startswith("Michael")]


Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
2737,2749,Michael Chang,1972-02-22,Hoboken,New Jersey,USA,United States,175.0,72.0,
3239,3251,Michael Tucker,1971-06-25,South Boston,Massachusetts,USA,United States,188.0,84.0,
8589,8636,Michael Carbajal,1967-09-17,Phoenix,Arizona,USA,United States,165.0,48.0,
9706,9761,Michael Mean,1947-02-18,Hertford,England,GBR,Great Britain,180.0,76.0,
11006,11063,Michael Budrock,1929-09-03,Yonkers,New York,USA,United States,,,1998-02-13
...,...,...,...,...,...,...,...,...,...,...
140344,143851,Michaela Walsh,1993-06-05,Belfast,Northern Ireland,GBR,Ireland,,,
143687,147341,Michael Cherry,1995-03-23,New York,New York,USA,United States,,,
143688,147342,Michael Norman,1997-12-03,San Diego,California,USA,United States,185.0,78.0,
143788,147444,Michael Grady,1996-10-22,Pittsburgh,Pennsylvania,USA,United States,196.0,91.0,


In [100]:
# query() is an alternative to boolean masks: the filter condition is written as a string.
# Inside a query string, & has the precedence of `and`, so the comparisons need no parentheses.
bios.query('born_country == "USA" & born_city == "Seattle"')

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
11030,11088,David Halpern,1955-08-18,Seattle,Washington,USA,United States,178.0,79.0,
12800,12870,Todd Trewin,1958-04-20,Seattle,Washington,USA,United States,180.0,75.0,
15476,15583,Scott McKinley,1968-10-15,Seattle,Washington,USA,United States,183.0,75.0,
29079,29293,Joyce Tanac,1950-09-27,Seattle,Washington,USA,United States,156.0,49.0,
31135,31371,Bill Kuhlemeier,1908-01-14,Seattle,Washington,USA,United States,,,2001-07-08
...,...,...,...,...,...,...,...,...,...,...
133392,136331,Hans Struzyna,1989-03-31,Seattle,Washington,USA,United States,188.0,91.0,
135448,138662,Maude Davis Crossland,2003-03-19,Seattle,Washington,USA,Colombia,,,
136993,140229,Jenell Berhorst,2003-12-13,Seattle,Washington,USA,United States,,,
143507,147159,Nevin Harrison,2002-06-02,Seattle,Washington,USA,United States,175.0,73.0,
