In [3]:
import pandas as pd
import numpy as np

# Loading and saving data.
# Builds a small synthetic customer table, writes it to CSV, reads it back,
# re-exports it as CSV / Excel / JSON, and finally shows a partial read
# (selected columns, limited rows). All files are written to the current
# working directory.

print("="*60)
print(" LOADING AND SAVING DATA")
print("="*60)

# CREATE SAMPLE DATA FILE
# In real work, you'd have actual CSV files from your business

N_CUSTOMERS = 50                  # number of synthetic rows to generate
DATA_FILE = 'customer_data.csv'   # the file later cells load from

# Create sample customer data
# Seeding the global NumPy RNG makes the generated values repeatable.
# The np.random calls below consume the stream in order, so keep the
# column order unchanged if the printed values must stay the same.
np.random.seed(42)
sample_data = pd.DataFrame({
    'CustomerID': range(1, N_CUSTOMERS + 1),
    'Name': [f'Customer{i}' for i in range(1, N_CUSTOMERS + 1)],
    'Age': np.random.randint(18, 70, N_CUSTOMERS),
    'MonthlySpend': np.random.uniform(20, 500, N_CUSTOMERS).round(2),
    'AccountAge': np.random.randint(1, 60, N_CUSTOMERS),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], N_CUSTOMERS),
    'Churned': np.random.choice([0, 1], N_CUSTOMERS, p=[0.8, 0.2])
})

# Save to CSV (index=False: do not write the row index as a column)
sample_data.to_csv(DATA_FILE, index=False)
print(f"✓ Created '{DATA_FILE}'")

# LOADING DATA
df = pd.read_csv(DATA_FILE)
print(f"\n📂 Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")
print(df.head())

# SAVING DATA (different formats)
df.to_csv('output.csv', index=False)  # CSV
try:
    df.to_excel('output.xlsx', index=False)  # Excel (requires openpyxl)
except ImportError as exc:
    # openpyxl is an optional dependency; without it pandas raises
    # ImportError. Report it and carry on so the rest of the cell still runs.
    print(f"\nSkipped 'output.xlsx' (Excel engine not available): {exc}")
df.to_json('output.json')  # JSON
print("\n✓ Saved in multiple formats")

# Reading with options
df_custom = pd.read_csv(DATA_FILE,
                        usecols=['CustomerID', 'Age', 'MonthlySpend'],  # Only these columns
                        nrows=10)  # Only first 10 rows
# Report the actual shape instead of hard-coding it in the message.
n_rows, n_cols = df_custom.shape
print(f"\n📊 Custom load ({n_cols} columns, {n_rows} rows):")
print(df_custom)

 LOADING AND SAVING DATA
✓ Created 'customer_data.csv'

📂 Loaded data: 50 rows, 7 columns
   CustomerID       Name  Age  MonthlySpend  AccountAge Region  Churned
0           1  Customer1   56        483.50          23   West        0
1           2  Customer2   69        408.03          24  South        0
2           3  Customer3   46        166.21          37  South        0
3           4  Customer4   32         66.88          35  North        0
4           5  Customer5   60        348.43          44  South        0

✓ Saved in multiple formats

📊 Custom load (3 columns, 10 rows):
   CustomerID  Age  MonthlySpend
0           1   56        483.50
1           2   69        408.03
2           3   46        166.21
3           4   32         66.88
4           5   60        348.43
5           6   25        231.27
6           7   38         78.58
7           8   56        257.68
8           9   36         36.51
9          10   40        456.47


In [1]:
import pandas as pd
import numpy as np

# Data selection and filtering.
# Walks through column selection, positional row selection, boolean
# filtering, and combined row/column selection with .loc on the customer
# table. Every selection below produces a new object; `df` is not modified.

# Load our sample data
# (reads 'customer_data.csv' from the working directory; the file must
# already exist there)
df = pd.read_csv('customer_data.csv')

print("="*60)
print(" DATA SELECTION AND FILTERING")
print("="*60)

# 1. SELECT COLUMNS
print("\n1️⃣ Selecting Columns:")

# Single column (returns Series)
ages = df['Age']
# Put the Series on its own lines so the first row is not glued to the label.
print(f"Ages (Series):\n{ages.head()}")

# Multiple columns (returns DataFrame)
subset = df[['CustomerID', 'Name', 'MonthlySpend']]
print(f"\nSubset:\n{subset.head()}")

# 2. SELECT ROWS (by position)
print("\n2️⃣ Selecting Rows:")

# First 5 rows (iloc slices are end-exclusive: positions 0..4)
first_five = df.iloc[0:5]
print(f"First 5 rows:\n{first_five}")

# Specific rows by position
specific = df.iloc[[0, 5, 10]]  # Rows 0, 5, and 10
print(f"\nSpecific rows (0, 5, 10):\n{specific}")

# 3. FILTERING (Boolean indexing)
print("\n3️⃣ Filtering Data:")

# Customers over 40
older_customers = df[df['Age'] > 40]
print(f"Customers over 40: {len(older_customers)} customers")
print(older_customers.head())

# High-value customers (spend > $300)
high_value = df[df['MonthlySpend'] > 300]
print(f"\nHigh-value customers: {len(high_value)}")
print(high_value[['Name', 'MonthlySpend', 'Region']].head())

# Multiple conditions: Young AND high-value
# Each comparison needs its own parentheses because & binds tighter than < and >.
young_high_value = df[(df['Age'] < 30) & (df['MonthlySpend'] > 300)]
print(f"\nYoung (<30) high-value customers: {len(young_high_value)}")

# OR condition: North OR South region
north_south = df[(df['Region'] == 'North') | (df['Region'] == 'South')]
print(f"\nNorth or South customers: {len(north_south)}")

# Using .isin() for multiple values (compact alternative to chained | tests)
east_west = df[df['Region'].isin(['East', 'West'])]
print(f"East or West customers: {len(east_west)}")

# 4. SELECTING ROWS AND COLUMNS TOGETHER
print("\n4️⃣ Rows + Columns:")

# Specific rows and columns: .loc[row_mask, column_labels]
result = df.loc[df['Age'] > 50, ['Name', 'Age', 'MonthlySpend']]
print(f"Customers over 50 (selected columns):\n{result.head()}")

 DATA SELECTION AND FILTERING

1️⃣ Selecting Columns:
Ages (Series): 0    56
1    69
2    46
3    32
4    60
Name: Age, dtype: int64

Subset:
   CustomerID       Name  MonthlySpend
0           1  Customer1        483.50
1           2  Customer2        408.03
2           3  Customer3        166.21
3           4  Customer4         66.88
4           5  Customer5        348.43

2️⃣ Selecting Rows:
First 5 rows:
   CustomerID       Name  Age  MonthlySpend  AccountAge Region  Churned
0           1  Customer1   56        483.50          23   West        0
1           2  Customer2   69        408.03          24  South        0
2           3  Customer3   46        166.21          37  South        0
3           4  Customer4   32         66.88          35  North        0
4           5  Customer5   60        348.43          44  South        0

Specific rows (0, 5, 10):
    CustomerID        Name  Age  MonthlySpend  AccountAge Region  Churned
0            1   Customer1   56        483.50   