In [1]:
# Kernel / venv verification
# Run this first. No need to edit.
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.util` has been loaded as an attribute.
import importlib.util
import sys

# Path of the interpreter backing this kernel
print('python executable:', sys.executable)
# find_spec returns None when the package cannot be located
print('ipykernel available:', importlib.util.find_spec('ipykernel') is not None)
# pandas is imported after the checks above so they still print if pandas is missing
import pandas as pd
print('pandas version:', pd.__version__)


python executable: c:\Users\Ray\.venv\Scripts\python.exe
ipykernel available: True
pandas version: 2.3.3


# Refresher

In [4]:
# Read every line of "pittsburgh-weather-2024.csv" into a list of strings
with open('pittsburgh-weather-2024.csv', 'r') as file:
    lines = file.readlines()

# Peek at the first 5 lines (the header plus four data rows)
print(lines[:5])
print()

# Field 1 of each comma-separated row is the daily high; lines[0] is the header
daily_highs = [float(record.strip().split(',')[1]) for record in lines[1:]]
print("Daily high temperatures:", daily_highs)
print()

# Average the highs by hand: running total divided by the number of days
total_high = 0
for high in daily_highs:
    total_high = total_high + high

average_high = total_high / len(daily_highs)

print("Average high temperature:", average_high)

['Day,High,Low,Precipitation,Snow\n', 'January 1,38.0,32.0,0.08,0.4\n', 'January 2,35.0,32.0,0.0,0.0\n', 'January 3,41.0,32.0,0.0,0.0\n', 'January 4,36.0,27.0,0.0,0.0\n']

Daily high temperatures: [38.0, 35.0, 41.0, 36.0, 38.0, 35.0, 36.0, 40.0, 50.0, 42.0, 43.0, 46.0, 46.0, 29.0, 22.0, 19.0, 20.0, 29.0, 27.0, 19.0, 25.0, 39.0, 45.0, 56.0, 59.0, 64.0, 47.0, 46.0, 36.0, 40.0, 45.0, 48.0, 47.0, 47.0, 55.0, 48.0, 46.0, 54.0, 63.0, 63.0, 63.0, 48.0, 46.0, 44.0, 39.0, 56.0, 39.0, 29.0, 37.0, 42.0, 52.0, 59.0, 49.0, 58.0, 37.0, 47.0, 65.0, 66.0, 64.0, 39.0, 52.0, 53.0, 66.0, 74.0, 75.0, 62.0, 50.0, 65.0, 56.0, 41.0, 52.0, 68.0, 71.0, 74.0, 61.0, 61.0, 56.0, 36.0, 47.0, 50.0, 41.0, 54.0, 43.0, 48.0, 67.0, 54.0, 61.0, 52.0, 53.0, 59.0, 59.0, 64.0, 63.0, 62.0, 47.0, 43.0, 54.0, 61.0, 74.0, 76.0, 66.0, 69.0, 60.0, 60.0, 82.0, 74.0, 80.0, 82.0, 73.0, 70.0, 59.0, 49.0, 62.0, 68.0, 61.0, 57.0, 70.0, 81.0, 83.0, 83.0, 73.0, 80.0, 85.0, 87.0, 65.0, 78.0, 73.0, 78.0, 81.0, 65.0, 62.0, 60.0, 66.0, 80.0

# 1D and 2D arrays

In [5]:
import numpy as np


### 1D 

In [6]:
arr1 = np.array([1, 2, 3, 4, 5])

In [7]:
print(arr1)

[1 2 3 4 5]


**array.dtype** : This tells you the data type of the elements inside the array.

**type(array)** : This tells you the type of the object itself — i.e., what kind of Python object the array is.



In [8]:
print(arr1.dtype)
print(type(arr1))

int64
<class 'numpy.ndarray'>


### 2D

In [9]:
arr2 = np.array([[1, 2, 3],
                [4, 5, 6]])


In [10]:
print(arr2)

[[1 2 3]
 [4 5 6]]


In [11]:
print(type(arr2))
print(arr2.dtype)

<class 'numpy.ndarray'>
int64


### Upcasting
In NumPy, **Upcasting** refers to converting mixed data types to a single, more general type (like converting integers to strings) to ensure the array is homogeneous.

specific -> general

In [13]:
arr3 = np.array([[1, 2, 3, 4, 5],
                 ['one', 'two', 'three', 'four', 'five']])

In [14]:
print(arr3)

[['1' '2' '3' '4' '5']
 ['one' 'two' 'three' 'four' 'five']]


In [15]:
print(type(arr3))
print(arr3.dtype)

<class 'numpy.ndarray'>
<U21


That means NumPy has inferred the array’s data type as a Unicode string of up to 21 characters (`<U21`) — wide enough to hold any 64-bit integer once it is written out as text.

NumPy sees mixed types (integers and strings), and it automatically upcasts the integers to strings so all elements share a common type.

# Creating Arrays

In [None]:
np.array([1, 2, 3, 4, 5])

In [None]:
np.arange(0,10,2)

In [None]:
np.linspace(0,1,5)

In [None]:
np.zeros((3,2))

In [None]:
np.ones(4)

In [None]:
a = np.random.rand(3,4)

In [None]:
print(a)

In [None]:
# Indexing a 2D array with one integer selects a whole row, which is itself an ndarray
print(arr2[0]) # accessing the first row
print(type(arr2[0]))
# Indexing into that row a second time reaches a single element
print(type(arr2[0][0]))

In [None]:
# Same pattern as above, one row further down
print(arr2[1]) # accessing the second row
print(type(arr2[1]))
# Indexing into that row a second time reaches a single element
print(type(arr2[1][0]))

In [None]:
# A (row, column) pair reaches a single element in one indexing step
print(arr2[0, 0]) # accessing the first element (row 0, column 0)
print(type(arr2[0, 0]))

# Array Operations

### Element-wise math

In [None]:
arr = np.array([1, 2, 3, 4, 5])
arr

In [None]:
arr + 10

In [None]:
arr - 5

In [None]:
arr * 3

In [None]:
arr / 5

In [None]:
arr ** 2

### Broadcasting

In [None]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])
arr

In [None]:
print(10)
arr + 10

In [None]:
print(np.array(10))
arr + np.array(10)

In [None]:
print(np.array([[10, 10, 10],[10, 10, 10],[10, 10, 10]]))
arr + np.array([[10, 10, 10],[10, 10, 10],[10, 10, 10]])

### Aggregation

In [None]:
arr = np.array([1, 2, 3, 4, 5])
arr

In [None]:
np.mean(arr)

In [None]:
np.sum(arr)

In [None]:
np.min(arr)

In [None]:
np.max(arr)

In [None]:
np.std(arr)

Also works on 2D Arrays

In [None]:
arr2 = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])
arr2

In [None]:
np.mean(arr2)

In [None]:
np.sum(arr2)

In [None]:
np.min(arr2)

In [None]:
np.max(arr2)

In [None]:
np.std(arr2)

# Array Indexing and Slicing

In [None]:
arr

In [None]:
arr[0]

In [None]:
arr[1:3]

In [None]:
arr2

In [None]:
arr2[0]

In [None]:
arr2[0,2]

In [None]:
arr2[1,1:]

### Boolean Indexing

In [None]:
arr[arr >= 4]

In [None]:
arr2[arr2 % 2 == 0]

### Shape and Reshape

In [None]:
print(arr)
print(arr.shape)
print(arr.size)
print(arr.ndim)

In [None]:
print(arr2)
print(arr2.shape)
print(arr2.size)
print(arr2.ndim)


### Reshape

In [None]:
# reshape must keep the total number of elements. arr2 was redefined in the
# Aggregation section as a 3x3 array (9 elements), so a (3, 2) target
# (6 elements) raises ValueError. Catch it so the message is shown without
# halting a Restart & Run All.
try:
    print(arr2.reshape(3, 2))
except ValueError as err:
    print('ValueError:', err)

In [None]:
arr2.reshape(9,1)

In [None]:
arr2.flatten()

In [None]:
array = np.array([[1, 2, 3],[4,5,6]])

print(array)

In [None]:
array.reshape(3,2)

# Numpy Exercise

In [None]:
print(daily_highs)

In [None]:
#Using np.array to find average daily high
daily_highs_array = np.array(daily_highs)
average_high = np.mean(daily_highs_array)

print("Average high temperature:", average_high)

# Pandas

In [None]:
import pandas as pd

## Creating Data

In [None]:
daily_highs

In [None]:
# Wrap the list of daily highs in a named pandas Series
pgh_weather = pd.Series(data=daily_highs, name='Pittsburgh Daily Highs')

In [None]:
pgh_weather

In [None]:
pd.set_option('display.max_rows', None)  # Show all rows in the Series

In [None]:
pgh_weather

In [None]:
pd.set_option('display.max_rows', 20) 

In [None]:
df = pd.DataFrame({
    'Day': [f'Day {i+1}' for i in range(len(daily_highs))],
    'High': daily_highs
})

In [None]:
df

In [None]:
df = pd.read_csv('pittsburgh-weather-2024.csv')

In [None]:
df

## Data Exploration

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df = pd.read_csv('pittsburgh-weather-2024.csv')

In [None]:
df.columns

# Selecting and Filtering Data

In [None]:
df['Day']

In [None]:
df[['Day', 'Snow']]

In [None]:
df[df['Day'] == 'December 13']

In [None]:
df[df['High'] >= 89]

In [None]:
df[df['Low'] == 13.0]

In [None]:
df.iloc[3]

In [None]:
df.loc[3]

In [None]:
# .loc looks rows up by index *label*. The frame still has the default integer
# index here, so no row is labelled 'December 13' and pandas raises KeyError.
# Catch it so the error is shown without halting a Restart & Run All.
try:
    print(df.loc['December 13'])
except KeyError as err:
    print('KeyError:', err)

In [None]:
df.set_index('Day', inplace = True)
df.head()


In [None]:
df.loc['January 4']

In [None]:
# Doesn't work after setting index: the labels are now the 'Day' strings, so
# the integer label 3 does not exist and .loc raises KeyError.
# Catch it so the error is shown without halting a Restart & Run All.
try:
    print(df.loc[3])
except KeyError as err:
    print('KeyError:', err)

In [None]:
df.iloc[3] # Still works with iloc

In [None]:
df.head()

## Pandas Exercise 

Load all_olympic_medalists.csv

Do the following tasks/ Answer the following questions:
1. Show the first 5 rows and the last 5 rows.
2. How many rows are in the dataset?
3. Are there any missing values in any of the columns? 
4. How many total medals are recorded in the dataset?
5. How many medals were awarded in the first Olympic year (1896)? How would you do this if you didn't know the first year?
6. How many medals has the US won?
7. What years were medals awarded for Rugby?
8. What was the first year women were included?

In [None]:
df = pd.read_csv('all_olympic_medalists.csv')

In [None]:
#show the first 5 rows
df.head()

In [None]:
#show the last 5 rows
df.tail()

In [None]:
# Are there any missing values in any of the columns?
df.isnull().sum()

In [None]:
df.info()

In [None]:
# Show the missing values in the 'medal' column
df[df['medal'].isnull()]

In [None]:
# How many total medals are recorded in the dataset?
df['medal'].count()

In [None]:
#don't use len() -- includes NaN values
len(df['medal'])

In [None]:
# How many medals were awarded in the first Olympic year (1896)?
df[df['year'] == df['year'].min()]['medal'].count()

In [None]:
# 6. How many medals has the US won?
df[df['country'] == 'United States']['medal'].count()

In [None]:
# What years did anyone medal for "Rugby"?
df[df['sport'] == 'Rugby']['year'].unique()

In [None]:
# What was the first year women were included?
df[df['event_gender'] == 'Women\'s']['year'].min()

## Pandas Cleaning Tasks

In [None]:
df= pd.read_csv('all_olympic_medalists.csv')

In [None]:
df.head()

In [None]:
#show missing values in the 'medal' column
df[df['medal'].isnull()]

In [None]:
df.fillna({'country': 'Unknown'}, inplace=True) 

In [None]:
df[df['medal'].isnull()]

In [None]:
# remove rows with missing values in the 'medal' column
df.dropna(subset=['medal'], inplace=True)

In [None]:
df[df['medal'].isnull()]

In [None]:
df.columns

In [None]:
df.rename(columns={'event_gender':'gender'}, inplace=True)

In [None]:
df.columns

In [None]:
df['year'] = df['year'].astype(str)

In [None]:
df['year'] = df['year'].astype(int)

In [None]:
df['event'] = df['gender']+ ' ' + df['event_name']

In [None]:
df.head()

In [None]:
# Helper: turn a four-digit year (XXXX) into a pandas datetime
def convert_year_to_datetime(year):
    """Parse `year` using the '%Y' format and return the result of `pd.to_datetime`."""
    parsed = pd.to_datetime(year, format='%Y')
    return parsed

df['year'] = df['year'].apply(convert_year_to_datetime)
df.head()

In [None]:
#change 'year' column back to just the year
df['year'] = df['year'].dt.year

In [None]:
df.head()

In [None]:
us_df = df.groupby('country').get_group('United States').groupby('year').size().sort_values(ascending=False)

In [None]:
pd.set_option('display.max_rows', None)  # Show all rows in the Series

In [None]:
us_df

In [None]:
pd.set_option('display.max_rows', 10) 

# Matplotlib      

In [None]:
import matplotlib.pyplot as plt

In [None]:
#plot the number of medals won by the US over the years
us_df = df.groupby('country').get_group('United States').groupby('year').size()
us_df.plot(kind='bar', figsize=(12, 6), color='green', title='Number of Medals Won by the US')


In [None]:
# Plot the number of gold medals won by the US at each Olympics.
# One boolean mask replaces the chained groupby(...).get_group(...) calls:
# it selects the same rows in a single pass.
is_us_gold = (df['country'] == 'United States') & (df['medal'] == 'Gold')
us_golds = df[is_us_gold].groupby('year').size()
us_golds.plot(kind='bar', figsize=(12, 6), color='gold', title='Number of Gold Medals Awarded to the US')

In [None]:
us_golds.head()

In [None]:
#plot the number of medals won by US in the Olympics split by medal type
df.head()

In [None]:
# Filter for all US medalists (no gender filter is applied here)
us = df[df['country_code'] == 'USA']

# Bottom-to-top stacking order for the bars
medal_order = ['Bronze', 'Silver', 'Gold']

# Count medals per (year, medal type) and spread medal types into columns.
# reindex (rather than medal_counts[[...]]) keeps the requested column order
# and fills a zero column if some medal type is absent, instead of raising KeyError.
medal_counts = (
    us.groupby(['year', 'medal'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=medal_order, fill_value=0)
)

# Plot the stacked bar chart, one colour per medal type
medal_counts.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 6),
    color={'Gold': "#F7D722", 'Silver': '#C0C0C0', 'Bronze': '#CD7F32'}
)

plt.title('US Olympic Medals Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Medal')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Pandas Data Cleaning Exercise

In all questions referring to gross, use the adjusted gross (in 2022 dollars)

- Drop the column 'Ref.'
- Rename 'Adjusted gross (in 2022 dollars)' to 'Adjusted gross'.
- List the artists and the number of times they appear in the rankings.
- What Taylor Swift tours made the rankings?
- Clean the actual gross and adjusted gross columns and convert them to float.
- What is the total adjusted gross from all 20 concerts?
- Calculate average gross/show for each rank using the adjusted gross. 
- How much has Taylor Swift earned from the concerts on this list?

In [None]:
df = pd.read_csv('top-20-womens-tours.csv')

In [None]:
df.columns

In [None]:
df.head()

In [None]:
#- Drop the column 'Ref.'
df.drop(columns=['Ref.'], inplace=True)

In [None]:
#- Rename 'Adjusted gross (in 2022 dollars)' to 'Adjusted gross'.
df.columns

In [None]:
df.rename(columns={'Adjusted gross (in 2022 dollars)': 'Adjusted gross'}, inplace=True)
df.columns

In [None]:
#- List the artists and the number of times they appear in the rankings.
df['Artist'].value_counts()

In [None]:
df.groupby('Artist')['Tour title'].count()

In [None]:
#- What Taylor Swift tours made the rankings?
taylor = df.groupby('Artist').get_group('Taylor Swift')
taylor['Tour title']

In [None]:
df[df['Artist'] == 'Taylor Swift']['Tour title']

In [None]:
# - Clean the actual gross and adjusted gross columns and convert them to float.
def clean_currency(value):
    if isinstance(value, str):
        value = value.replace('$', '').replace(',', '')
        return float(value)
    
df['Actual gross'] = df['Actual gross'].apply(clean_currency)
df['Adjusted gross'] = df['Adjusted gross'].apply(clean_currency)

df.head()

In [None]:
#- What is the total adjusted gross from all 20 concerts?
df['Adjusted gross'].sum()

In [None]:
#- Calculate average gross/show for each rank using the adjusted gross. 
df['Average gross'] = round(df['Adjusted gross'] / df['Shows'], 2)
df

In [None]:
#- How much has Taylor Swift earned from the concerts on this list?
df[df['Artist'] == 'Taylor Swift']['Adjusted gross'].sum()

In [None]:
df.groupby('Artist')['Adjusted gross'].sum().loc['Taylor Swift']

### Merging and Concatenation (pd.merge vs pd.concat) 

When combining data:
- Use **pd.concat** to stack DataFrames (add rows or columns). Good for appending new records or combining same-schema data.
- Use **pd.merge** (SQL-style joins) when you need to join on keys (one-to-many, many-to-one, etc.).

This section shows examples and quick exercises to practice.
# Merge examples
import pandas as pd

# Two small frames that share a `person_id` key
df_people = pd.DataFrame(
    {'person_id': [1, 2, 3], 'name': ['Alice', 'Bob', 'Charlie']}
)
df_scores = pd.DataFrame(
    {'person_id': [1, 2, 2, 4], 'score': [95, 80, 82, 77]}
)

print("people\n", df_people)
print("\nscores\n", df_scores)

# Inner join: keep only person_ids present in both frames
inner = df_people.merge(df_scores, on='person_id', how='inner')
print("\ninner merge:\n", inner)

# Left join: keep every person; score is NaN where none was recorded
left = df_people.merge(df_scores, on='person_id', how='left')
print("\nleft merge:\n", left)

# Both frames have a non-key column named `val`; `suffixes` tells them apart
df_a = pd.DataFrame({'id': [1, 2], 'val': [10, 20]})
df_b = pd.DataFrame({'id': [1, 2], 'val': [100, 200]})
joined = df_a.merge(df_b, on='id', suffixes=('_a', '_b'))
print("\nsuffixes example:\n", joined)

# indicator=True adds a `_merge` column recording which frame each row came from
outer_with_origin = df_people.merge(df_scores, on='person_id', how='outer', indicator=True)
print('\nouter merge with indicator:\n', outer_with_origin)
# Concatenation examples

import pandas as pd

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
same_schema = [df1, df2]

# Vertical stack: df2's rows go under df1's and the index is renumbered
v = pd.concat(same_schema, ignore_index=True)
print("vertical concat:\n", v)

# Frames with different columns: the result has the union of columns,
# with NaN wherever a frame lacked that column
df3 = pd.DataFrame({'A': [9], 'C': [10]})
v2 = pd.concat([df1, df3], ignore_index=True, sort=False)
print("\nconcat with mismatched columns:\n", v2)

# axis=1 places the frames side by side, aligned on the row index
h = pd.concat(same_schema, axis=1)
print("\nhorizontal concat (axis=1):\n", h)

# `keys` labels each input frame, producing a two-level (hierarchical) row index
grouped = pd.concat(same_schema, keys=['g1', 'g2'])
print("\nconcat with keys:\n", grouped)
### Exercises (try before peeking at solutions) 

1. Use `df_olympics` and `df_continents` (both are created in the "1. Merging Data" cell further down — run that cell first): left-merge them and compute medal counts per continent.
2. Given two DataFrames with the same columns but different rows, concatenate them and reset the index.
3. Find a case that produces a many-to-many join (duplicate keys on both frames) and observe the result — what happens?

(Hints: use `indicator=True` and `validate='one_to_many'` to check behavior.)
# Solutions (run to check)
import pandas as pd
# 1. Left-merge olympics onto continents, then count medals per continent.
#    Only attempted when both frames already exist in this kernel.
required_frames = ('df_olympics', 'df_continents')
if all(frame_name in globals() for frame_name in required_frames):
    merged = df_olympics.merge(df_continents, on='country_code', how='left')
    medals_by_continent = merged.groupby('continent')['medal'].count()
    print("Medals by continent:\n", medals_by_continent.sort_values(ascending=False).head())
else:
    print("df_olympics or df_continents not found in this kernel - run earlier cells first.")

# 2. Stack two same-schema frames and renumber the index from 0
a = pd.DataFrame({'x': [1, 2]})
b = pd.DataFrame({'x': [3, 4]})
stacked = pd.concat([a, b], ignore_index=True)
print("\nconcat and reset:\n", stacked)

# 3. Duplicate keys on both sides: every left row pairs with every right row
left = pd.DataFrame({'k': [1, 1], 'v_left': [10, 20]})
right = pd.DataFrame({'k': [1, 1], 'v_right': [100, 200]})
many_to_many = left.merge(right, on='k')
print("\nmany-to-many join (cartesian product):\n", many_to_many)

In [None]:
# 1. Merging Data
# First, ensure we have the olympic data loaded
df_olympics = pd.read_csv('all_olympic_medalists.csv')

# Create a small DataFrame to map codes to continents (just a sample)
continent_data = {
    'country_code': ['USA', 'CHN', 'GBR', 'AUS', 'CAN', 'FRA', 'GER', 'JPN'],
    'continent': ['North America', 'Asia', 'Europe', 'Oceania', 'North America', 'Europe', 'Europe', 'Asia']
}
df_continents = pd.DataFrame(continent_data)

# Merge the two DataFrames using 'country_code' as the key
# how='left' keeps all rows from the olympics data, even if we didn't map their continent
df_merged = pd.merge(df_olympics, df_continents, on='country_code', how='left')

# Check the result to see the new 'continent' column
df_merged[df_merged['continent'].notnull()].head()

In [None]:
# 2. Pivot Tables
# We want to see Countries as Rows, Years as Columns, and the Count of Medals as values
pivot_medals = df_olympics.pivot_table(
    index='country', 
    columns='year', 
    values='medal', 
    aggfunc='count', 
    fill_value=0
)

# Show a subset (first 5 countries, last 5 Olympic years)
pivot_medals.iloc[:5, -5:]

In [None]:
# 3. Time Series Handling
df_weather = pd.read_csv('pittsburgh-weather-2024.csv')

# The 'Day' column is just "Month Day" (e.g., "January 1").
# We need to add the year and convert to datetime.
df_weather['Date'] = pd.to_datetime(df_weather['Day'] + ', 2024', format='%B %d, %Y')

# Set the new Date column as the index (resample needs a datetime index)
df_weather.set_index('Date', inplace=True)

# 4. Resampling
# 'ME' stands for month end: one bin per calendar month, labelled with its last day.
# The older alias 'M' is deprecated since pandas 2.2 and emits a FutureWarning
# on the pandas 2.3.3 reported by the first cell. 'ME' requires pandas >= 2.2.
monthly_temps = df_weather['High'].resample('ME').mean()

monthly_temps

In [None]:
# 5. Visualization
# Plotting the resampled time series data
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
monthly_temps.plot(marker='o', color='orange', linestyle='-')

plt.title('Average Monthly High Temperature in Pittsburgh (2024)')
plt.ylabel('Temperature (°F)')
plt.xlabel('Month')
plt.grid(True)
plt.show()