## Intro to Dataframes
This section introduces basic pandas DataFrame operations

In [None]:
# Import the pandas library for data manipulation and analysis
# Import numpy for numerical operations (often used with pandas)
import pandas as pd
import numpy as np

# Create a sample DataFrame with 4 rows and 3 columns
# Data is organized in a 2D table format with custom column names (A, B, C) and row labels (x, y, z, zz)
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9],[10,11,12]], columns=["A", "B", "C"], index=["x","y","z",'zz'])

# Display the first 5 rows of the DataFrame (default)
# This helps you quickly see what your data looks like
df.head()

# Display the last 2 rows of the DataFrame
# Useful for checking the end of your dataset
df.tail(2)

# Show all column names in the DataFrame
# Returns an Index object containing the column labels
df.columns

# Get the row index labels and convert them to a regular Python list
# This shows you all the row identifiers
df.index.tolist()

# Display detailed information about the DataFrame
# Shows data types, memory usage, and number of non-null values for each column
df.info()

# Generate descriptive statistics for numerical columns
# Shows count, mean, standard deviation, min, max, and quartiles
df.describe()

# Count the number of unique values in each column
# Helps identify columns with categorical data or duplicates
df.nunique()

# Get unique values from column 'A' only
# Returns an array of distinct values found in this column
df['A'].unique()

# Return the dimensions of the DataFrame as (rows, columns)
# Shows how many rows and columns your data contains
df.shape

# Return the total number of elements in the DataFrame
# This is rows × columns
df.size

# Display the entire DataFrame
# Shows all data at once (use carefully with large datasets)
df

## Loading in Dataframes from Files
This section shows how to load data from various file formats into pandas DataFrames

In [None]:
# Read data from files into pandas DataFrames
# coffee.csv contains coffee sales data
# results.parquet is a Parquet file, so it is loaded with read_parquet instead of read_csv
# bios.csv contains athlete biographical information
coffee = pd.read_csv('./warmup-data/coffee.csv')
results = pd.read_parquet('./data/results.parquet')
bios = pd.read_csv('./data/bios.csv')

# To read an excel spreadsheet
# Read data from an Excel file, specifying which sheet to load
# This loads the 'results' sheet from the olympics-data.xlsx file
olympics_data = pd.read_excel('./data/olympics-data.xlsx', sheet_name="results")

## Accessing Data with Pandas
This section demonstrates different ways to view and access data in pandas DataFrames

In [None]:
# Print the coffee DataFrame to console (basic output)
print(coffee)

# Display the coffee DataFrame with better formatting (Jupyter-friendly)
display(coffee)

# Show first 5 rows of coffee data
coffee.head()

# Show last 10 rows of the coffee DataFrame
# Good for checking recent data entries
coffee.tail(10)

# Get 5 random rows from the coffee DataFrame
# Useful for getting a representative sample of your data
# Note: Use random_state parameter for reproducible results
coffee.sample(5) # Pass in random_state to make deterministic

# Using .loc for label-based indexing
# Syntax: coffee.loc[Rows, Columns]
# Get the row with index label 0 (first row)
coffee.loc[0]

# Get multiple specific rows by their index labels
# Returns rows with index 0, 1, and 5
coffee.loc[[0,1,5]]

# Get a slice of rows (index labels 5 through 9) and specific columns
# .loc slices include BOTH endpoints, so this selects rows 5, 6, 7, 8 and 9, and only the 'Day' and 'Units Sold' columns
coffee.loc[5:9, ["Day", "Units Sold"]]

#### iloc - Integer position-based indexing
# Get all rows (:) and columns at positions 0 and 2
# This returns the first and third columns for every row
coffee.iloc[:, [0,2]]

#### Other Stuff
# Set the DataFrame index to use the 'Day' column values
# This makes it easier to select data by day names
coffee.index = coffee["Day"]

# Now we can select rows by day names instead of numbers
# Get all rows from Monday through Wednesday
coffee.loc["Monday":"Wednesday"]

# Reload the coffee data to reset our changes (revert index back to default)
coffee = pd.read_csv('./warmup-data/coffee.csv')

#### Setting Values
# Change the 'Units Sold' values for rows 1 through 3 to 10
# This updates multiple cells at once with the same value
coffee.loc[1:3, "Units Sold"] = 10

#### Optimized way to get single values (.at & .iat)
# .at gets a single value by label (faster than .loc for single values)
# Gets the 'Units Sold' value from row with index 0
coffee.at[0,"Units Sold"]

# .iat gets a single value by integer position (faster than .iloc)
# Gets the value at row 3, column 1 (0-indexed)
coffee.iat[3,1]

#### Getting Columns
# Two ways to access a column:
# Method 1: Using dot notation (only works when the column name is a valid Python identifier - no spaces - and doesn't clash with an existing DataFrame attribute or method)
coffee.Day

# Method 2: Using bracket notation (works for all column names)
coffee["Day"]

#### Sort Values
# Sort the DataFrame by 'Units Sold' column in descending order (highest first)
coffee.sort_values(["Units Sold"], ascending=False)

# Sort by multiple columns: first by 'Units Sold' (descending), then by 'Coffee Type' (ascending)
# ascending=[0,1] means: 0=False (descending), 1=True (ascending)
coffee.sort_values(["Units Sold", "Coffee Type"], ascending=[0,1])

#### Iterate over dataframe with for loop
# Loop through each row in the DataFrame
# index = row index, row = Series containing all column values for that row
for index, row in coffee.iterrows():
    print(index)  # Print the row index
    print(row)    # Print all column values for this row
    print("Coffee Type of Row:", row["Coffee Type"])  # Print specific column value

## Filtering Data
This section covers how to filter DataFrames based on various conditions

In [None]:
# Show first few rows of the bios DataFrame to understand its structure
bios.head()

# Filter rows where height_cm is greater than 215 cm
# This returns only the athletes who are taller than 215 cm
bios.loc[bios["height_cm"] > 215]

# Filter rows with height > 215 AND select only specific columns
# Returns just the name and height columns for tall athletes
bios.loc[bios["height_cm"] > 215, ["name", "height_cm"]]

#### Short-hand syntax (without .loc)
# Alternative way to filter and select columns in one line
# First filter by condition, then select columns using bracket notation
bios[bios['height_cm'] > 215][["name","height_cm"]]

#### Multiple filter conditions
# Filter with multiple conditions using & (AND operator)
# Find athletes who are both tall (height > 215) AND from USA
bios[(bios['height_cm'] > 215) & (bios['born_country']=='USA')]

#### Filter by string conditions
# Find rows where the 'name' column contains the text "keith" (case-insensitive)
# case=False means it matches both "Keith" and "keith"
bios[bios['name'].str.contains("keith", case=False)]

# Regex syntax
# Find names containing either "keith" OR "patrick" using regex pattern
# The | symbol means "or" in regex
bios[bios['name'].str.contains('keith|patrick', case=False)]

# Other cool regex filters

# Find athletes born in cities that start with a vowel (A, E, I, O, U):
# ^ means start of string, [AEIOUaeiou] means any vowel character
vowel_cities = bios[bios['born_city'].str.contains(r'^[AEIOUaeiou]', na=False)]

# Find athletes with names that contain exactly two vowels:
# This complex regex matches names with exactly 2 vowel characters
# [^AEIOUaeiou] means any non-vowel character
# The pattern ensures exactly 2 vowels with any non-vowels in between
# Note: This is quite complex - just shows the power of regex!
two_vowels = bios[bios['name'].str.contains(r'^[^AEIOUaeiou]*[AEIOUaeiou][^AEIOUaeiou]*[AEIOUaeiou][^AEIOUaeiou]*$', na=False)]

# Find athletes with names that have repeated consecutive characters:
# (.)\1 means capture any character (.) then match the same character again (\1)
# This finds names like "Emmett" (mm, tt); the match is case-sensitive, so the "Aa" in "Aaron" does not count
repeated_letters = bios[bios['name'].str.contains(r'(.)\1', na=False)]

# Find athletes with names ending in 'son' or 'sen':
# son$ means 'son' at the end of string ($)
# sen$ means 'sen' at the end of string
# | means OR, so matches either pattern
son_sen_names = bios[bios['name'].str.contains(r'son$|sen$', case=False, na=False)]

# Find athletes born in a year starting with '19':
# ^19 means the string starts with '19' (for years like 1985, 1992)
born_19xx = bios[bios['born_date'].str.contains(r'^19', na=False)]

# Find athletes with names that do not contain any vowels:
# ^[^AEIOUaeiou]*$ means start (^) and end ($) with zero or more (*) non-vowels
no_vowels = bios[bios['name'].str.contains(r'^[^AEIOUaeiou]*$', na=False)]

# Find athletes whose names contain a hyphen or an apostrophe:
# [-'] matches either a hyphen (-) or apostrophe (')
hyphen_apostrophe = bios[bios['name'].str.contains(r"[-']", na=False)]

# Find athletes with names that start and end with the same letter:
# ^(.).*\1$ captures first character (.), matches anything in between (.*), then matches the same first character (\1)
start_end_same = bios[bios['name'].str.contains(r'^(.).*\1$', na=False, case=False)]

# Find athletes with a born_city that has exactly 7 characters:
# ^.{7}$ means exactly 7 of any characters (.) between start (^) and end ($)
city_seven_chars = bios[bios['born_city'].str.contains(r'^.{7}$', na=False)]

# Find athletes with names containing three or more vowels:
# (?:[AEIOUaeiou].*){3,} means match vowel + any characters, repeated 3 or more times
# (?:...) is a non-capturing group: it groups the pattern for {3,} without creating a match group,
# which avoids the pandas UserWarning that str.contains raises for patterns with capturing groups
three_or_more_vowels = bios[bios['name'].str.contains(r'(?:[AEIOUaeiou].*){3,}', na=False)]

# Don't use regex search (literal substring match)
# With regex=False the | is an ordinary character, so this looks for the literal text 'keith|patrick' inside each name
bios[bios['name'].str.contains('keith|patrick', case=False, regex=False)]

## isin method & startswith
# Find athletes from specific countries (USA, FRA, GBR) whose names start with "Keith"
# isin() checks if values are in a list, startswith() checks beginning of strings
bios[bios['born_country'].isin(["USA", "FRA", "GBR"]) & (bios['name'].str.startswith("Keith"))]

print("Make sure to smash that like button & subscribe tehehehe")

#### Query functions
# Alternative way to filter using query syntax (more readable for complex conditions)
# Find athletes born in USA in the city of Seattle
bios.query('born_country == "USA" and born_city == "Seattle"')

## Adding / Removing Columns
This section shows how to modify DataFrame structure by adding, removing, and renaming columns

In [None]:
# Show first few rows of coffee data to see current structure
coffee.head()

# Add a new column called 'price' with value 4.99 for all rows
coffee['price'] = 4.99

# Add a new column with conditional values using numpy.where()
# If Coffee Type is 'Espresso', price is 3.99, otherwise 5.99
coffee['new_price'] = np.where(coffee['Coffee Type']=='Espresso', 3.99, 5.99) 

# Display the DataFrame to see the new columns
coffee

# Remove the 'price' column permanently (inplace=True modifies the original DataFrame)
coffee.drop(columns=['price'], inplace=True)

# Alternative way to remove columns (creates a new DataFrame instead of modifying original)
# coffee = coffee.drop(columns=['price'])

# Reorder columns to a specific sequence
coffee = coffee[['Day', 'Coffee Type', 'Units Sold', 'new_price']]

# Create a calculated column: revenue = units sold × price
coffee['revenue'] = coffee['Units Sold'] * coffee['new_price']

# Display the updated DataFrame
coffee

# Rename the 'new_price' column to just 'price'
coffee.rename(columns={'new_price': 'price'}, inplace=True)

# Create a copy of the bios DataFrame for modification
bios_new = bios.copy()

# Extract first name from full name by splitting on spaces and taking first element
bios_new['first_name'] = bios_new['name'].str.split(' ').str[0]

# Filter to show only athletes with first name "Keith"
bios_new.query('first_name == "Keith"')

# Convert the born_date column from string to datetime format
bios_new['born_datetime'] = pd.to_datetime(bios_new['born_date'])

# Extract just the year from the datetime column
bios_new['born_year'] = bios_new['born_datetime'].dt.year

# Show name and birth year columns
bios_new[['name','born_year']]

# Save the modified DataFrame to a new CSV file
bios_new.to_csv('./data/bios_new.csv', index=False)

# Add a categorical column based on height using lambda function
# Short: < 165cm, Average: 165-184cm, Tall: ≥ 185cm
# Missing heights stay missing (NaN): every comparison against NaN is False,
# so without the isna() guard they would fall through to 'Tall'
bios['height_category'] = bios['height_cm'].apply(
    lambda x: np.nan if pd.isna(x) else ('Short' if x < 165 else ('Average' if x < 185 else 'Tall'))
)

# Define a custom function to categorize athletes based on height and weight
def categorize_athlete(row):
    """Label an athlete as 'Lightweight', 'Middleweight', or 'Heavyweight'.

    Args:
        row: A mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) with 'height_cm' and
            'weight_kg' entries.

    Returns:
        'Lightweight' when height < 175 and weight < 70;
        'Middleweight' when height < 185 or weight <= 80;
        'Heavyweight' otherwise.
        NaN when either measurement is missing.
    """
    height, weight = row['height_cm'], row['weight_kg']

    # Every comparison against NaN is False, so without this guard a row with
    # missing measurements would silently fall through to 'Heavyweight'
    # (or be labelled 'Middleweight' on the strength of one value alone).
    if pd.isna(height) or pd.isna(weight):
        return np.nan

    if height < 175 and weight < 70:
        return 'Lightweight'
    if height < 185 or weight <= 80:
        return 'Middleweight'
    return 'Heavyweight'

# Apply the custom function to each row (axis=1 means row-wise)
bios['Category'] = bios.apply(categorize_athlete, axis=1)

# Show first few rows to see the new Category column
bios.head()

## Merging & Concatenating Data
This section covers combining DataFrames from different sources

In [None]:
# Read the NOC (National Olympic Committee) regions data
nocs = pd.read_csv('./data/noc_regions.csv')

# Merge bios and nocs DataFrames based on country codes
# left_on='born_country' means use 'born_country' column from bios DataFrame
# right_on='NOC' means use 'NOC' column from nocs DataFrame
# how='left' means keep all rows from bios, add matching data from nocs
bios_new = pd.merge(bios, nocs, left_on='born_country', right_on='NOC', how='left')

# Rename the 'region' column to 'born_country_full' for clarity
bios_new.rename(columns={'region': 'born_country_full'}, inplace=True)

# Create separate DataFrames for USA and Great Britain athletes
usa = bios[bios['born_country']=='USA'].copy()
gbr = bios[bios['born_country']=='GBR'].copy()

# Combine the USA and GBR DataFrames vertically (stack them on top of each other)
new_df = pd.concat([usa,gbr])

# Show the last few rows of the combined DataFrame
new_df.tail()

# Merge results and bios DataFrames on athlete_id column
# This combines performance data with athlete biographical data
combined_df = pd.merge(results, bios, on='athlete_id', how='left')

# Show first few rows of the merged DataFrame
combined_df.head()

## Handling Null Values
This section shows how to deal with missing data in DataFrames

In [None]:
# Set specific cells in 'Units Sold' column to NaN (Not a Number - missing values)
coffee.loc[[2,3], 'Units Sold'] = np.nan

# Fill missing values with the mean (average) of the column
# This replaces NaN values with the calculated average
# fillna() returns a new Series and leaves coffee unchanged; assign the result back
# (coffee['Units Sold'] = ...) if you want the change to stick
coffee['Units Sold'].fillna(coffee['Units Sold'].mean()) 

# Alternative method: interpolate missing values
# This fills gaps by estimating values based on surrounding data points
# coffee['Units Sold'] = coffee['Units Sold'].interpolate()
coffee['Units Sold'].interpolate()

# Remove rows that have missing values in the 'Units Sold' column
# Use inplace=True if you want to update the coffee df permanently
coffee.dropna(subset=['Units Sold'])

# Alternative way: filter to keep only rows where 'Units Sold' is NOT null
coffee[coffee['Units Sold'].notna()]

# Display current state of the DataFrame
coffee

## Aggregating Data
This section covers grouping and summarizing data

In [None]:
# Show first few rows of bios data to understand structure
bios.head()

# Count how many times each city appears in the born_city column
# This shows the most common birth cities
bios['born_city'].value_counts()

# For USA athletes only, count how many are from each region
# Show top 10 most common regions
bios[bios['born_country']=='USA']['born_region'].value_counts().head(10)

# Show bottom 25 least common regions for USA athletes
bios[bios['born_country']=='USA']['born_region'].value_counts().tail(25)

#### Groupby function in Pandas
# Group coffee data by Coffee Type and sum the Units Sold for each type
coffee.groupby(['Coffee Type'])['Units Sold'].sum()

# Group by Coffee Type and calculate average Units Sold for each type
coffee.groupby(['Coffee Type'])['Units Sold'].mean()

# Group by multiple columns and apply different aggregation functions
# Sum Units Sold and calculate average price for each Coffee Type and Day combination
coffee.groupby(['Coffee Type', 'Day']).agg({'Units Sold': 'sum', 'price': 'mean'})

#### Pivot Tables
# Create a pivot table showing revenue by Day (rows) and Coffee Type (columns)
pivot = coffee.pivot(columns='Coffee Type', index='Day', values='revenue')

# Sum all values in each column (total revenue by coffee type)
pivot.sum()

# Sum all values in each row (total revenue by day)
pivot.sum(axis=1)

#### Using datetime with Groupby
# Convert born_date column to datetime format for time-based operations
bios['born_date'] = pd.to_datetime(bios['born_date'])

# Extract month from birth dates
bios['month_born'] = bios['born_date'].dt.month

# Extract year from birth dates
bios['year_born'] = bios['born_date'].dt.year

# Group by birth year and month, count athletes born in each period
# reset_index() converts the result back to a regular DataFrame
# sort_values() arranges results by count in descending order
bios.groupby([bios['year_born'],bios['month_born']])['name'].count().reset_index().sort_values('name', ascending=False)

## Advanced Functionality
This section covers more advanced pandas operations

In [None]:
# shift() - move data up or down, rank() - assign rankings, cumsum() - cumulative sum, rolling() - moving window calculations

# Create a subset containing only Latte sales
latte = coffee[coffee['Coffee Type']=="Latte"].copy()

# Calculate 3-day rolling sum of Units Sold
# This creates a moving total that includes the current day plus previous 2 days
latte['3day'] = latte['Units Sold'].rolling(3).sum()

# Display the results
latte

## Advanced Functionality (cont.)
This section demonstrates additional libraries that work with pandas

In [None]:
Note: these two libraries didn't actually make it into the final video.

# Install and import pyjanitor for data cleaning utilities
!pip install pyjanitor
import janitor

# Clean column names (remove spaces, special characters, make lowercase)
coffee.clean_names()

# Install and import skimpy for quick data summaries
!pip install skimpy
from skimpy import skim

# Generate a comprehensive summary of the results DataFrame
skim(results)

# Show basic DataFrame information
coffee.info()

## New Functionality
This section shows newer pandas features and additional examples

In [None]:
# Read CSV file with default numpy backend
results_numpy = pd.read_csv('./data/results.csv')

# Read CSV file with pyarrow backend (faster performance)
results_arrow = pd.read_csv('./data/results.csv', engine='pyarrow', dtype_backend='pyarrow')

# Show memory usage and data types with numpy backend
results_numpy.info()

# Show memory usage and data types with pyarrow backend
results_arrow.info()

# Filter bios data for athletes from New Hampshire OR San Francisco
filtered_bios = bios[(bios['born_region'] == 'New Hampshire') | (bios['born_city'] == 'San Francisco')]

# Show first few rows of the bios DataFrame
bios.head()

## Additional DataFrame Example
Creating a sample DataFrame with sales data

In [None]:
import pandas as pd

# Sample sales data, written one sale per line:
# (date, item, units sold, price per unit, salesperson)
sales_records = [
    ('2024-05-01', 'Apple', 30, 1.0, 'John'),
    ('2024-05-01', 'Banana', 21, 0.5, 'John'),
    ('2024-05-01', 'Orange', 15, 0.75, 'John'),
    ('2024-05-02', 'Apple', 40, 1.0, 'Alice'),
    ('2024-05-02', 'Banana', 34, 0.5, 'Alice'),
    ('2024-05-03', 'Orange', 20, 0.75, 'John'),
    ('2024-05-03', 'Apple', 45, 1.0, 'Alice'),
    ('2024-05-03', 'Orange', 25, 0.75, 'John'),
]
column_names = ['Date', 'Item', 'Units Sold', 'Price Per Unit', 'Salesperson']

# Turn the row-wise records into a column-oriented dict (column name -> list of values)
data = {name: list(values) for name, values in zip(column_names, zip(*sales_records))}

# Build the DataFrame from the column dict
df = pd.DataFrame(data)

# Show the DataFrame
df

# Summarize the sales: one row per date, one column per item,
# each cell holding the total units sold for that date/item pair
pivot_table = df.pivot_table(values='Units Sold', index='Date', columns='Item', aggfunc='sum')
pivot_table

## Data Visualization Example
Creating a histogram to visualize athlete height distribution

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming your DataFrame is named 'bios' and already loaded
# First, filter out rows where the height_cm data is missing
bios_filtered = bios.dropna(subset=['height_cm'])

# Plotting the histogram
plt.figure(figsize=(10, 6))
plt.hist(bios_filtered['height_cm'], bins=20, color='blue', edgecolor='black')

plt.title('Distribution of Athlete Heights in Olympics')
plt.xlabel('Height in cm')
plt.ylabel('Number of Athletes')
plt.grid(True)

# Use a logarithmic scale for the y-axis so that bins with only a few athletes remain visible next to the much taller ones
plt.yscale('log')

plt.show()

## What Next???
Check out some of my other tutorials:
- [Cleaning Data w/ Pandas](https://www.youtube.com/live/oad9tVEsfI0?si=qnDOg9BSRFxcP5gZ)
- [Solving 100 Python Pandas Problems](https://youtu.be/i7v2m-ebXB4?si=VSJHnZryqMv8GW54)
- [Real-world Data Analysis Problems w/ Python Pandas](https://youtu.be/eMOA1pPVUc4)

Platforms to Try
- [Stratascratch](https://stratascratch.com/?via=keith)
- [Analyst Builder](https://www.analystbuilder.com/?via=keith)