In [1]:
# Filename: series_create.py
    
import pandas as pd

# Creating a Series without specifying an index
s1 = pd.Series([10, 20, 30, 40])
print(s1)

# Creating a Series with a custom index
s2 = pd.Series([10, 20, 30, 40], 
               index=['first', 'second', 'third', 'fourth'])
print(s2)

# Creating a Series from a dictionary
data = {'first': 10, 'second': 20, 'third': 30, 'fourth': 40}
s3 = pd.Series(data)
print(s3)

0    10
1    20
2    30
3    40
dtype: int64
first     10
second    20
third     30
fourth    40
dtype: int64
first     10
second    20
third     30
fourth    40
dtype: int64


In [3]:
# Filename: series_access_elements.py

import pandas as pd

# Create a series with specific index
s = pd.Series([10, 20, 30, 40], index=['first', 'second', 'third', 'fourth'])

# Access the element by label (element associated with index 'second')
e1 = s.loc['second']
print(e1)  # Print 20

# Access the element in position 2 (3rd element)
e2 = s.iloc[2]
print(e2)  # Print 30

20
30


In [4]:
# Filename: series_access_values_index.py

import pandas as pd

# Create a series with specific index
s = pd.Series([10, 20, 30, 40], index=['first', 'second', 'third', 'fourth'])
print(s.values)  # Return a NumPy array
print(s.index)   # Return an Index object

[10 20 30 40]
Index(['first', 'second', 'third', 'fourth'], dtype='object')


In [5]:
# Filename: series_assign_values.py        
import pandas as pd

# Create a series with specific index
s = pd.Series([10, 20, 30], index=['first', 'second', 'third'])
print(s.loc['first'])  # Access with explicit index
print(s.iloc[0])       # Access with implicit index
s.loc['second'] = 50   # Assign a new value
print(s)

10
10
first     10
second    50
third     30
dtype: int64


In [6]:
# Filename: series_slicing.py        
import pandas as pd

s = pd.Series([10, 20, 30, 40, 50, 60], 
       index=['first', 'second', 'third', 'fourth', 'fifth', 'sixth'])
# Slicing with explicit index (both included)              
print(s.loc['third':'fifth'])
# Slicing with implicit index 
# (start included and stop excluded)
print(s.iloc[2:5])

third     30
fourth    40
fifth     50
dtype: int64
third     30
fourth    40
fifth     50
dtype: int64


In [8]:
# Filename: series_masking.py        
import pandas as pd

s = pd.Series([10, 15, 25, 5, 30], 
              index=['a', 'b', 'c', 'd', 'e'])
mask = (s > 10) & (s < 30)  # Create a mask for values between 10 and 30
print(mask)
print(s[mask])  # Apply the mask to the series
s[mask] = 0  # Modify elements of s where mask is True
print(s)

a    False
b     True
c     True
d    False
e    False
dtype: bool
b    15
c    25
dtype: int64
a    10
b     0
c     0
d     5
e    30
dtype: int64


In [1]:
# Filename: series_fancy_indexing.py        
import pandas as pd

s = pd.Series([10, 20, 30, 40, 50], 
              index=['first', 'second', 'third', 'fourth', 'fifth'])
print(s.loc[['first', 'third']])  # Access indices 'first' and 'third'
print(s.iloc[0:2])             # Access positions 0 and 2

first    10
third    30
dtype: int64
first     10
second    20
dtype: int64


In [2]:
# DataFrame

# Powerful 2-dimensional data structure in Pandas, similar to spreadsheet or SQL table.
# Consists of rows and columns, each column can be of different data type.
# Each column have unique name, which allows for easy access and manipulation of data.

# Filename: dataframe_create1.py   
import pandas as pd

sales = pd.Series([100, 150, 200], index=['Product A', 'Product B', 'Product C'])
cost = pd.Series([80, 90, 120], index=['Product A', 'Product B', 'Product C'])
units_sold = pd.Series([20, 30, 15], index=['Product A', 'Product B', 'Product C'])

df = pd.DataFrame({'Sales': sales, 'Cost': cost, 'Units Sold': units_sold})
print(df)

           Sales  Cost  Units Sold
Product A    100    80          20
Product B    150    90          30
Product C    200   120          15


In [3]:
# Filename: dataframe_create2.py
import pandas as pd

sales = pd.Series([100, 150, 200, 250], 
           index=['Product A', 'Product B', 'Product C', 'Product D'])
cost = pd.Series([80, 90, 120], index=['Product A', 'Product B', 'Product C'])
units_sold = pd.Series([20, 30], index=['Product A', 'Product B'])
df = pd.DataFrame({'Sales': sales, 'Cost': cost, 'Units Sold': units_sold})
print(df)

           Sales   Cost  Units Sold
Product A    100   80.0        20.0
Product B    150   90.0        30.0
Product C    200  120.0         NaN
Product D    250    NaN         NaN


In [4]:
# Filename: dataframe_create3.py
import pandas as pd

data1 = [{'name': 'Emma', 'age': 28, 'city': 'Beijing'},
         {'name': 'Liam', 'age': 32, 'city': 'Shanghai'},
         {'name': 'Noah', 'age': 27, 'city': 'Guangzhou'}]
df1 = pd.DataFrame(data1)
print(df1)

data2 = [{'name': 'Sophie', 'age': 29, 'city': 'Shenzhen'},
         {'name': 'Oliver', 'age': 31, 'city': 'Chengdu'},
         {'name': 'Ava', 'age': 26, 'city': 'Hangzhou'}]
df2 = pd.DataFrame(data2, index=['row1', 'row2', 'row3'])
print(df2)

   name  age       city
0  Emma   28    Beijing
1  Liam   32   Shanghai
2  Noah   27  Guangzhou
        name  age      city
row1  Sophie   29  Shenzhen
row2  Oliver   31   Chengdu
row3     Ava   26  Hangzhou


In [5]:
# Filename: dataframe_create4.py
import pandas as pd

data_dict = {
    "Name": ["Emma", "Liam", "Noah"],
    "Age": [28, 32, 27],
    "City": ["Beijing", "Shanghai", "Guangzhou"]
}

df = pd.DataFrame(data_dict)
print(df)

   Name  Age       City
0  Emma   28    Beijing
1  Liam   32   Shanghai
2  Noah   27  Guangzhou


In [6]:
# Filename: dataframe_create5.py        
import numpy as np
import pandas as pd

# Create a 2D NumPy array with data related to cities in China
arr = np.array([[28, 32, 27],  # Ages
                 [1, 2, 3],    # IDs or similar identifiers
                 [1, 0, 1]])   # Some binary data (e.g., availability of a service)
                 
# Create a DataFrame from the array
df = pd.DataFrame(arr, columns=['Beijing', 'Shanghai', 'Guangzhou'], 
                  index=['Age', 'ID', 'Service Available'])
print(df)

                   Beijing  Shanghai  Guangzhou
Age                     28        32         27
ID                       1         2          3
Service Available        1         0          1


In [7]:
# Filename: dataframe_access1.py
import pandas as pd

data = {
    'Name': ['Emma', 'Liam', 'Noah'],
    'Age': [28, 32, 27],
    'City': ['Beijing', 'Shanghai', 'Guangzhou']
}

df = pd.DataFrame(data)
print(df.columns)  # Index object with column names
print(df.index)    # Index object with row indices

Index(['Name', 'Age', 'City'], dtype='object')
RangeIndex(start=0, stop=3, step=1)


In [9]:
# Filename: dataframe_access2.py    
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [30, 25, 35],
    'Salary': [70000, 80000, 120000]
}

df = pd.DataFrame(data)

arr = df.values  # Convert DataFrame to NumPy array
print(arr)
print(df)

[['Alice' 30 70000]
 ['Bob' 25 80000]
 ['Charlie' 35 120000]]
      Name  Age  Salary
0    Alice   30   70000
1      Bob   25   80000
2  Charlie   35  120000


In [10]:
# Filename: dataframe_access3.py    
import pandas as pd

temperature = pd.Series([22.5, 23.0, 19.5], index=['Monday', 'Tuesday', 'Wednesday'])
humidity = pd.Series([55, 60, 50], index=['Monday', 'Tuesday', 'Wednesday'])
wind_speed = pd.Series([5.5, 3.0, 7.2], index=['Monday', 'Tuesday', 'Wednesday'])
df = pd.DataFrame({'Temperature': temperature, 'Humidity': humidity, 
                   'Wind Speed': wind_speed})

print(df["Humidity"]) 

Monday       55
Tuesday      60
Wednesday    50
Name: Humidity, dtype: int64


In [12]:
# Filename: dataframe_access4.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'],
                  index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000],
                        index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4], 
                  index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 
                   'Population': population, 
                   'Area (sq km)': area})
print(df.loc['A'])  # Access by label
print(df.iloc[1])   # Access by position 

City             Beijing
Population      21542000
Area (sq km)    16410.54
Name: A, dtype: object
City            Shanghai
Population      24183300
Area (sq km)      6340.5
Name: B, dtype: object


In [13]:
# Filename: dataframe_access5.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'], index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000], index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4], index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 'Population': population, 'Area (sq km)': area})

# Access columns from 'Population' to 'Area (sq km)' and rows from 'B' to 'C'                   
print(df.loc['B':'C', 'Population':'Area (sq km)'])

   Population  Area (sq km)
B    24183300        6340.5
C    14904000        7434.4


In [14]:
# Filename: dataframe_access6.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'], index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000], index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4], index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 'Population': population, 'Area (sq km)': area})

# Create a mask for cities with population greater than 
# 10 million and area less than 7000 sq km
mask = (df['Population'] > 10000000) & \
       (df['Area (sq km)'] < 7000)

# Print the filtered DataFrame based on the mask
print(df.loc[mask, 'Population':])

   Population  Area (sq km)
B    24183300        6340.5


In [1]:
# Filename: dataframe_access7.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'],
                 index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000],
                        index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4], 
                  index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 'Population': population, 
                   'Area (sq km)': area})

# Print selected rows and columns
print(df.loc[['A', 'C'], ['City', 'Population']])

# Update selected rows and columns
df.loc[['A', 'C'], ['City', 'Population']] = ['Updated City', 0]
print(df.loc[['A', 'C'], ['City', 'Population']])

        City  Population
A    Beijing    21542000
C  Guangzhou    14904000
           City  Population
A  Updated City           0
C  Updated City           0


In [4]:
# Filename: dataframe_add_column.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'], index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000], index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4], index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 'Population': population, 'Area (sq km)': area})

# Adding a new column to indicate if the city is coastal
df['Is Coastal'] = pd.Series([False, True, False], index=['A', 'B', 'C'])
# The above is equivalent to df['Is Coastal'] = [False, True, False]                            
print(df) 

        City  Population  Area (sq km)  Is Coastal
A    Beijing    21542000      16410.54       False
B   Shanghai    24183300       6340.50        True
C  Guangzhou    14904000       7434.40       False


In [2]:
# Filename: dataframe_drop_column.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'],
                 index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000],
                        index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4], 
                  index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 'Population': population, 
                   'Area (sq km)': area})
print(df)  # Display the original DataFrame
# Drop the 'Population' and 'Area (sq km)' columns
df = df.drop(columns=['Population', 'Area (sq km)'])
print(df)  # Display the DataFrame after dropping columns

        City  Population  Area (sq km)
A    Beijing    21542000      16410.54
B   Shanghai    24183300       6340.50
C  Guangzhou    14904000       7434.40
        City
A    Beijing
B   Shanghai
C  Guangzhou


In [3]:
# Filename: dataframe_rename_column.py
import pandas as pd

city = pd.Series(['Beijing', 'Shanghai', 'Guangzhou'],
                  index=['A', 'B', 'C'])
population = pd.Series([21542000, 24183300, 14904000],
                        index=['A', 'B', 'C'])
area = pd.Series([16410.54, 6340.5, 7434.4],
                  index=['A', 'B', 'C'])
df = pd.DataFrame({'City': city, 
                   'Population': population, 
                   'Area (sq km)': area})

# Rename columns for brevity
df = df.rename(columns={'Population': 'Pop', 
                        'Area (sq km)': 'Area'})
print(df)

        City       Pop      Area
A    Beijing  21542000  16410.54
B   Shanghai  24183300   6340.50
C  Guangzhou  14904000   7434.40


In [7]:
# Filename: series_op.py
import pandas as pd

# Creating two Series with city populations
city_a = pd.Series([21540000, 10000000, 8000000], 
            index=['Shanghai', 'Beijing', 'Chongqing'])
city_b = pd.Series([1000000, 2000000, 3000000], 
            index=['Beijing', 'Chongqing', 'Guangzhou'])
result = city_a + city_b
print(result)


"""
Output:
Beijing      11000000.0
Chongqing    10000000.0
Guangzhou           NaN # Since the index elements which do not match are set to NaN (i.e. not a number)
Shanghai            NaN
dtype: float64
"""

Beijing      11000000.0
Chongqing    10000000.0
Guangzhou           NaN
Shanghai            NaN
dtype: float64


''

In [6]:
# Filename: dataframes_op.py
import pandas as pd

# Creating two DataFrames with city populations and areas
df1 = pd.DataFrame([[21540000, 2400.0], [10000000, 1687.0], 
                    [8000000, 315.0]],
                   columns=['Population', 'Area (sq mi)'],
                   index=['Shanghai', 'Beijing', 'Chongqing'])
print(df1)        
          
df2 = pd.DataFrame([[1000000, 743.0], [2000000, 400.0]],
                   columns=['Population', 'Area (sq mi)'],
                   index=['Beijing', 'Guangzhou'])
print(df2)

print(df1 + df2) # Adding the two DataFrames

           Population  Area (sq mi)
Shanghai     21540000        2400.0
Beijing      10000000        1687.0
Chongqing     8000000         315.0
           Population  Area (sq mi)
Beijing       1000000         743.0
Guangzhou     2000000         400.0
           Population  Area (sq mi)
Beijing    11000000.0        2430.0
Chongqing         NaN           NaN
Guangzhou         NaN           NaN
Shanghai          NaN           NaN


In [8]:
# Aggregations

# Filename: series_aggregations.py
import pandas as pd

# Creating a Series with city populations
populations = pd.Series([21540000, 10000000, 8000000],
                 index=['Shanghai', 'Beijing', 'Chongqing'])

# Calculating the mean population
mean_population = populations.mean()
print(f'Mean Population: {mean_population}')

Mean Population: 13180000.0


In [10]:
# Filename: dataframes_aggregations.py
import pandas as pd

# Creating a DataFrame with city population and area data
data = {
    'City': ['Shanghai', 'Beijing', 'Chongqing'],
    'Population': [21540000, 10000000, 8000000],
    'Area (sq mi)': [2400.0, 1687.0, 315.0]
}
df = pd.DataFrame(data)

# Calculating aggregate functions for the DataFrame
mean_values = df[['Population', 'Area (sq mi)']].mean()
print(mean_values)

Population      1.318000e+07
Area (sq mi)    1.467333e+03
dtype: float64


In [None]:
import pandas as pd

# Creating a Series with city populations
populations = pd.Series([21540000, 10000000, 8000000],
                 index=['Shanghai', 'Beijing', 'Chongqing'])

# Calculating the mean population
print(population.mean()) # Mean
print(population.std()) # Standard deviation
print(population.min()) # Minimum
print(population.max()) # Maximum
print(population.sum()) # Sum


20209766.666666668
4780950.361939908
14904000
24183300
60629300


In [19]:
# Missing values in Pandas are represented with sentinel values.
# They can be None / np.nan (floating point number)

# Check if a Series or a DataFrame contains missing values with the "isnull()" method.

# Filename: checking_missing.py
import numpy as np
import pandas as pd

# Creating a Series with null values for city populations
s = pd.Series([21540000, None, 8000000, np.nan], 
              index=['Shanghai', 'Beijing', 
                     'Chongqing', 'Guangzhou'])
# Checking for null values
null_mask = s.isnull()
print(null_mask)

print(s[null_mask])

Shanghai     False
Beijing       True
Chongqing    False
Guangzhou     True
dtype: bool
Beijing     NaN
Guangzhou   NaN
dtype: float64


In [20]:
# Filename: remove_missing.py
import numpy as np
import pandas as pd

# Creating a Series with missing values for city populations
s = pd.Series([21540000, None, 8000000, np.nan], 
              index=['Shanghai', 'Beijing', 
                     'Chongqing', 'Guangzhou'])
print("Original Series:")
print(s)
print("\nSeries after dropping missing values:")
print(s.dropna())
print()

# Creating a DataFrame with missing values for city statistics
df = pd.DataFrame({'Population': [21540000, 10000000, None], 
                   'Area (sq mi)': [2400.0, np.nan, 315.0]},
                  index=['Shanghai', 'Beijing', 'Chongqing'])
print("Original DataFrame:")
print(df)                  
print("\nDataFrame after dropping missing values:")
print(df.dropna())   

Original Series:
Shanghai     21540000.0
Beijing             NaN
Chongqing     8000000.0
Guangzhou           NaN
dtype: float64

Series after dropping missing values:
Shanghai     21540000.0
Chongqing     8000000.0
dtype: float64

Original DataFrame:
           Population  Area (sq mi)
Shanghai   21540000.0        2400.0
Beijing    10000000.0           NaN
Chongqing         NaN         315.0

DataFrame after dropping missing values:
          Population  Area (sq mi)
Shanghai  21540000.0        2400.0


In [None]:
# Filename: fill_missing_values.py
import numpy as np
import pandas as pd

# Create a Series with missing values
s = pd.Series([10, 20, None, 30, np.nan])
mean_value = s.mean()
print(mean_value) # (10 + 20 + 30) / 2 = 20 -> Output: 20
print(s.fillna(mean_value)) # Fill missing values with the mean of the column (i.e. Change None & np.nan to 20.0)
print()

# Create a DataFrame with missing values
df = pd.DataFrame({'Sales': [100, 200, 300], 
                   'Profit': [20, np.nan, 50]},
                  index=['Q1', 'Q2', 'Q3'])
mean_profit = df['Profit'].mean() # Change np.nan to 35 (i.e. the mean of their profit (20 + 50) / 2 = 35.0)
print(df.fillna(mean_profit)) 

20.0
0    10.0
1    20.0
2    20.0
3    30.0
4    20.0
dtype: float64

    Sales  Profit
Q1    100    20.0
Q2    200    35.0
Q3    300    50.0


In [22]:
# Filename: dataframe_grouping_data1.py
import pandas as pd

# Create a DataFrame with sales data
df = pd.DataFrame({'Region': ['North', 'South', 'North', 'South'],
                   'Sales': [250, 150, 300, 200],
                   'Profit': [50, 30, 70, 40]})
groupedDf = df.groupby('Region') # 2 groups: 'North' and 'South'                   

for key, groupDf in groupedDf:
    print(key)
    print(groupDf)

North
  Region  Sales  Profit
0  North    250      50
2  North    300      70
South
  Region  Sales  Profit
1  South    150      30
3  South    200      40


In [23]:
# Filename: dataframe_grouping_data2.py
import pandas as pd

# Create a DataFrame with employee data
df = pd.DataFrame({'Department': ['HR', 'IT', 'HR', 'IT'],
                   'Salary': [50000, 60000, 55000, 70000],
                   'Experience': [2, 5, 3, 7]})

# 2 groups: 'HR' and 'IT'                   
groupedDf = df.groupby('Department')

# Mean salary and experience, separately for each group
result = groupedDf.mean().reset_index()
print(result)

  Department   Salary  Experience
0         HR  52500.0         2.5
1         IT  65000.0         6.0


In [None]:
# File: employees.csv

"""
Name,Department,Salary,Experience
Alice,HR,60000,N/A
Bob,IT,75000,5
Charlie,Finance,80000,7
Diana,IT,90000,Missing
Eve,HR,N/A,3
"""

In [26]:
# Filename: csv_loading_data.py
import pandas as pd

# Load data from a CSV file with specific NA values
df = pd.read_csv('./programs/data/employees.csv', 
                 sep=',', 
                 skiprows=0,
                 na_values=['N/A', 'Missing'])
print(df)

      Name Department   Salary  Experience
0    Alice         HR  60000.0         NaN
1      Bob         IT  75000.0         5.0
2  Charlie    Finance  80000.0         7.0
3    Diana         IT  90000.0         NaN
4      Eve         HR      NaN         3.0


In [28]:
# Filename: csv_saving_data.py
import pandas as pd
import os

# Sample DataFrame with employee data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'],
    'Salary': [60000, 75000, 80000, 90000, None],
    'Experience': [None, 5, 7, None, 3]
}
df = pd.DataFrame(data)

# Define the path to save the CSV file
path = os.path.join('./programs/data', 'employees2.csv')
df.to_csv(path, sep=',', index=False)

Here is a summary of the entire Pandas note, including key concepts and code examples:

---

## 1. Series Creation and Access



In [29]:
import pandas as pd

# Create Series from list
s1 = pd.Series([10, 20, 30, 40])
print(s1)

# Create Series with custom index
s2 = pd.Series([10, 20, 30, 40], index=['first', 'second', 'third', 'fourth'])
print(s2)

# Create Series from dictionary
data = {'first': 10, 'second': 20, 'third': 30, 'fourth': 40}
s3 = pd.Series(data)
print(s3)

0    10
1    20
2    30
3    40
dtype: int64
first     10
second    20
third     30
fourth    40
dtype: int64
first     10
second    20
third     30
fourth    40
dtype: int64




Access elements by label or position:



In [None]:
e1 = s2.loc['second']
e2 = s2.iloc[2]
print(e1, e2)



---

## 2. Series Slicing, Masking, and Fancy Indexing



In [None]:
# Slicing
print(s2.loc['second':'fourth'])
print(s2.iloc[1:3])

# Masking (Boolean indexing)
mask = (s2 > 10) & (s2 < 40)
print(s2[mask])

# Fancy indexing
print(s2.loc[['first', 'third']])



---

## 3. DataFrame Creation and Access



In [None]:
# Create DataFrame from dictionary of Series
df = pd.DataFrame({'Sales': [100, 150, 200], 'Cost': [80, 90, 120]})
print(df)

# Access columns and rows
print(df['Sales'])
print(df.iloc[0])
print(df.loc[0])



---

## 4. DataFrame Slicing and Filtering



In [None]:
# Row and column slicing
print(df.iloc[0:2, 0:1])

# Boolean filtering
mask = df['Sales'] > 120
print(df[mask])



---

## 5. DataFrame Column Operations



In [None]:
# Add new column
df['Profit'] = df['Sales'] - df['Cost']
print(df)

# Drop columns
df = df.drop(columns=['Cost'])
print(df)

# Rename columns
df = df.rename(columns={'Sales': 'Total Sales'})
print(df)



---

## 6. Series and DataFrame Arithmetic



In [None]:
city_a = pd.Series([21540000, 10000000, 8000000], index=['Shanghai', 'Beijing', 'Chongqing'])
city_b = pd.Series([1000000, 2000000, 3000000], index=['Beijing', 'Chongqing', 'Guangzhou'])
result = city_a + city_b
print(result)
# Output:
# Beijing      11000000.0
# Chongqing    10000000.0
# Guangzhou           NaN
# Shanghai            NaN



---

## 7. Aggregation and Statistics



In [None]:
populations = pd.Series([21540000, 10000000, 8000000], index=['Shanghai', 'Beijing', 'Chongqing'])
print(populations.mean())  # Mean
print(populations.std())   # Standard deviation
print(populations.min())   # Minimum
print(populations.max())   # Maximum
print(populations.sum())   # Sum



---

## 8. Handling Missing Values



In [None]:
import numpy as np

s = pd.Series([21540000, None, 8000000, np.nan], index=['Shanghai', 'Beijing', 'Chongqing', 'Guangzhou'])
print(s.isnull())
print(s.dropna())
print(s.fillna(s.mean()))



---

## 9. Grouping and Aggregation



In [None]:
df = pd.DataFrame({'Department': ['HR', 'IT', 'HR', 'IT'],
                   'Salary': [50000, 60000, 55000, 70000],
                   'Experience': [2, 5, 3, 7]})
grouped = df.groupby('Department')
print(grouped.mean())



---

## 10. CSV File Reading and Writing



In [None]:
# Read CSV
df = pd.read_csv('./programs/data/employees.csv', na_values=['N/A', 'Missing'])
print(df)

# Save CSV
df.to_csv('./programs/data/employees2.csv', index=False)



Example CSV content:





---

## 11. Other DataFrame Operations

- Add columns (e.g., `df['Is Coastal'] = [False, True, False]`)
- Drop columns (`df.drop(columns=['Population', 'Area (sq km)'])`)
- Rename columns (`df.rename(columns={'Population': 'Pop'})`)
- Access by label and position (`df.loc['A']`, `df.iloc[1]`)
- Slicing rows and columns (`df.loc['B':'C', 'Population':'Area (sq km)']`)
- Boolean mask filtering (`df[(df['Population'] > 10000000) & (df['Area (sq km)'] < 7000)]`)
- Groupby and aggregation (`df.groupby('Region').mean()`)

---

**Summary:**  
This notebook covers the basics of pandas Series and DataFrame creation, access, slicing, arithmetic, aggregation, missing value handling, grouping, and CSV file operations, with practical code examples for each topic.

In [None]:
# The above is generated by Github Copilot, just for reference only