Python for Data Analysis

In [1]:
# A common use of variable unpacking is iterating over sequences of tuples or lists:
seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
for a, b, c in seq:
    print(f'a={a}, b={b}, c={c}')


a=1, b=2, c=3
a=4, b=5, c=6
a=7, b=8, c=9


In [None]:

# Elements can be removed by value using remove(), which removes the first occurrence of the value
b_list = []  # Initialize empty list
b_list.append('foo')
b_list.extend(['red', 'baz', 'dwarf', 'foo'])
print(b_list)  # ['foo', 'red', 'baz', 'dwarf', 'foo']

b_list.remove('foo')  # Removes first occurrence of 'foo'
print(b_list)  # ['red', 'baz', 'dwarf', 'foo']

# List membership check with 'in' and 'not in' keywords
print('dwarf' in b_list)  # True
print('dwarf' not in b_list)  # False

# List membership is slower than checking with dicts and sets, as lists use linear search while dicts/sets use hash tables (constant time).


['foo', 'red', 'baz', 'dwarf', 'foo']
['red', 'baz', 'dwarf', 'foo']
True
False


In [3]:
# Concatenating lists with + creates a new list
print([4, None, 'foo'] + [7, 8, (2, 3)])  # [4, None, 'foo', 7, 8, (2, 3)]

# Use extend() to append multiple elements to an existing list
x = [4, None, 'foo']
x.extend([7, 8, (2, 3)])
print(x)  # [4, None, 'foo', 7, 8, (2, 3)]

# Example: list_of_lists containing multiple lists
list_of_lists = [[1, 2], [3, 4], [5, 6]]

# extend() is more efficient than concatenation with + for large lists
everything = []
for chunk in list_of_lists:
    everything.extend(chunk)

print(everything)  # [1, 2, 3, 4, 5, 6]

# Avoid this if performance is a concern
everything = []
for chunk in list_of_lists:
    everything = everything + chunk

print(everything)  # [1, 2, 3, 4, 5, 6]

# To summarize:

# insert() is used for adding elements to the beginning of a list (or at any specific position).

# extend() is used for adding elements to the end of a list

[4, None, 'foo', 7, 8, (2, 3)]
[4, None, 'foo', 7, 8, (2, 3)]
[1, 2, 3, 4, 5, 6]
[1, 2, 3, 4, 5, 6]


In [4]:
# Sorting a list in place using sort():
a = [7, 2, 5, 1, 3]
a.sort()  # Sorts the list in place
print(a)  # Output: [1, 2, 3, 5, 7]

# Sorting with a custom key (e.g., sorting by string length):
b = ['saw', 'small', 'He', 'foxes', 'six']
b.sort(key=len)  # Sorts by length of each string
print(b)  # Output: ['He', 'saw', 'six', 'small', 'foxes']

# Note: sort() modifies the list in place, and we can use a custom key for sorting.
# The sorted() function (coming soon) creates a sorted copy of a sequence without modifying the original list.


[1, 2, 3, 5, 7]
['He', 'saw', 'six', 'small', 'foxes']


In [5]:
# Binary search and maintaining a sorted list using the bisect module
import bisect

# Example list
c = [1, 2, 2, 2, 3, 4, 7]

# bisect.bisect finds the position to insert an element to keep the list sorted
print(bisect.bisect(c, 2))  # Output: 4 (position to insert 2)
print(bisect.bisect(c, 5))  # Output: 6 (position to insert 5)

# bisect.insort inserts the element at the correct position to maintain sorted order
bisect.insort(c, 6)
print(c)  # Output: [1, 2, 2, 2, 3, 4, 6, 7]

# Important Note: The bisect functions do not check if the list is already sorted.
# Using them on an unsorted list will work, but may give incorrect results.

4
6
[1, 2, 2, 2, 3, 4, 6, 7]


In [6]:
# Slicing in Python: Selecting sections of sequences like lists

# Basic slicing (start:stop)
seq = [7, 2, 3, 7, 5, 6, 0, 1]
print(seq[1:5])  # Output: [2, 3, 7, 5]

# Slicing can also be assigned to
seq[3:4] = [6, 3]
print(seq)  # Output: [7, 2, 3, 6, 3, 5, 6, 0, 1]

# Slicing without start or stop
print(seq[:5])  # Output: [7, 2, 3, 6, 3]
print(seq[3:])  # Output: [6, 3, 5, 6, 0, 1]

# Negative indices slice from the end
print(seq[-4:])  # Output: [5, 6, 0, 1]
print(seq[-6:-2])  # Output: [6, 3, 5, 6]

# Step in slicing, e.g., taking every other element
print(seq[::2])  # Output: [7, 3, 3, 6, 1]

# Reverse a sequence using step -1
print(seq[::-1])  # Output: [1, 0, 6, 5, 3, 6, 3, 2, 7]


[2, 3, 7, 5]
[7, 2, 3, 6, 3, 5, 6, 0, 1]
[7, 2, 3, 6, 3]
[6, 3, 5, 6, 0, 1]
[5, 6, 0, 1]
[6, 3, 5, 6]
[7, 3, 3, 6, 1]
[1, 0, 6, 5, 3, 6, 3, 2, 7]


In [7]:
# Using enumerate to track index while iterating

# Without enumerate, you'd manually track the index
i = 0
for value in collection:
    # do something with value
    i += 1

# With enumerate, it returns index-value pairs
for i, value in enumerate(collection):
    # do something with value

# Example: Mapping list values to their indices using enumerate
some_list = ['foo', 'bar', 'baz']
mapping = {}

for i, v in enumerate(some_list):
    mapping[v] = i

print(mapping)  # Output: {'foo': 0, 'bar': 1, 'baz': 2}


IndentationError: expected an indented block after 'for' statement on line 10 (547651295.py, line 14)

In [None]:
# The sorted function returns a new sorted list from the elements of any sequence.

# Example: Sorting a list of numbers
sorted_list = sorted([7, 1, 2, 6, 0, 3, 2])
print(sorted_list)  # Output: [0, 1, 2, 2, 3, 6, 7]

# Example: Sorting a string (returns list of characters sorted)
sorted_string = sorted('horse race')
print(sorted_string)  # Output: [' ', 'a', 'c', 'e', 'e', 'h', 'o', 'r', 'r', 's']

# sorted function has the same arguments as the sort method on lists, such as:
# key= (for sorting by a custom criteria) and reverse= (for descending order).


In [None]:
# The zip function pairs up the elements of multiple sequences (lists, tuples, etc.) to create tuples.
# It creates an iterator of tuples.

# Example: zipping two lists
seq1 = ['foo', 'bar', 'baz']
seq2 = ['one', 'two', 'three']
zipped = zip(seq1, seq2)
print(list(zipped))  # Output: [('foo', 'one'), ('bar', 'two'), ('baz', 'three')]

# zip can handle an arbitrary number of sequences, and the number of tuples produced
# is determined by the shortest sequence:
seq3 = [False, True]
print(list(zip(seq1, seq2, seq3)))  # Output: [('foo', 'one', False), ('bar', 'two', True)]

# A common use of zip is to iterate over multiple sequences simultaneously:
for i, (a, b) in enumerate(zip(seq1, seq2)):
    print(f'{i}: {a}, {b}')
# Output:
# 0: foo, one
# 1: bar, two
# 2: baz, three

# "Unzipping" a sequence of tuples:
pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens'), ('Schilling', 'Curt')]
first_names, last_names = zip(*pitchers)
print(first_names)  # Output: ('Nolan', 'Roger', 'Schilling')
print(last_names)   # Output: ('Ryan', 'Clemens', 'Curt')


In [None]:
# The reversed function iterates over the elements of a sequence in reverse order.
# It returns a generator, which doesn't create the reversed sequence until materialized.

# Example: using reversed on a range
print(list(reversed(range(10))))  # Output: [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

# reversed can also be used in a for loop:
for num in reversed(range(5)):
    print(num)
# Output:
# 4
# 3
# 2
# 1
# 0

In [None]:
# Creating a dictionary
empty_dict = {}
d1 = {'a': 'some value', 'b': [1, 2, 3, 4]}

# Accessing elements using keys
print(d1['a'])  # Output: 'some value'
print(d1['b'])  # Output: [1, 2, 3, 4]

# Adding new key-value pairs
d1[7] = 'an integer'
print(d1)  # Output: {'a': 'some value', 'b': [1, 2, 3, 4], 7: 'an integer'}

# Checking if a key exists in the dictionary
print('b' in d1)  # Output: True

# Deleting elements using `del` or `pop`
del d1[7]  # Removes the key-value pair for key 7
print(d1)  # Output: {'a': 'some value', 'b': [1, 2, 3, 4]}

# Using pop to delete and get the value
ret = d1.pop('b')  # Removes and returns the value associated with key 'b'
print(ret)  # Output: [1, 2, 3, 4]
print(d1)  # Output: {'a': 'some value'}

# Getting keys and values as lists
print(list(d1.keys()))  # Output: ['a']
print(list(d1.values()))  # Output: ['some value']

# Merging dictionaries using `update`
d1.update({'b': 'foo', 'c': 12})
print(d1)  # Output: {'a': 'some value', 'b': 'foo', 'c': 12}

key_list = ['a', 'b', 'c', 'd'] # loop with zip
value_list = [1, 2, 3, 4]

mapping = {}
for key, value in zip(key_list, value_list):
    mapping[key] = value

print(mapping)  # Output: {'a': 1, 'b': 2, 'c': 3, 'd': 4}

mapping = dict(zip(range(5), reversed(range(5)))) # dict with zip
print(mapping)  # Output: {0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

In [None]:
from collections import defaultdict

# Example list of words
words = ['apple', 'bat', 'bar', 'atom', 'book']

# Using a standard dictionary with get() and setdefault()
by_letter = {}

# Loop through words and categorize them by the first letter
for word in words:
    letter = word[0]
    # Using get() to check for the letter and provide a default value (empty list)
    by_letter[letter] = by_letter.get(letter, []) + [word]

print("Using get():")
print(by_letter)  # Output: {'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

# Another approach with setdefault()
by_letter = {}
for word in words:
    letter = word[0]
    # Using setdefault() to initialize the list if not already present
    by_letter.setdefault(letter, []).append(word)

print("\nUsing setdefault():")
print(by_letter)  # Output: {'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

# Using defaultdict from collections
by_letter = defaultdict(list)
for word in words:
    by_letter[word[0]].append(word)

print("\nUsing defaultdict():")
print(dict(by_letter))  # Output: {'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

# Example using pop() with a default value
by_letter = {'a': ['apple'], 'b': ['bat']}
letter = 'c'
# pop() will return the default value if the key does not exist
result = by_letter.pop(letter, 'No words found')
print(f"\nUsing pop() with default value for '{letter}': {result}")


In [None]:
# 1. Creating and Modifying Dictionaries
d = {'a': 'some value', 'b': [1, 2, 3, 4]}
print(d)

# Modify or add new elements
d['new_key'] = 'new_value'
print(d)

# Deleting elements
del d['a']
print(d)

# Using pop (removes and returns the value)
value = d.pop('b')
print(f"Popped value: {value}")
print(d)

# 2. Checking for Key Existence
print('new_key' in d)  # Returns True if 'new_key' is in the dictionary

# 3. Accessing Keys and Values
print(list(d.keys()))   # List of all keys
print(list(d.values())) # List of all values

# 4. Default Values with get()
default_value = d.get('nonexistent_key', 'default_value')
print(f"Default value for nonexistent_key: {default_value}")

# 5. Creating Dictionaries from Sequences
keys = ['a', 'b', 'c']
values = [1, 2, 3]
mapping = dict(zip(keys, values))
print(mapping)

# 6. Using setdefault() to Set Default Values
by_letter = {}
words = ['apple', 'bat', 'bar', 'atom', 'book']
for word in words:
    letter = word[0]
    by_letter.setdefault(letter, []).append(word)
print(by_letter)

# 7. defaultdict for automatic default values
from collections import defaultdict
by_letter_default = defaultdict(list)
for word in words:
    by_letter_default[word[0]].append(word)
print(by_letter_default)

# 8. Valid Dict Key Types (Hashability)
try:
    # Using an unhashable list as a key
    invalid_dict = {}
    invalid_dict[[1, 2, 3]] = 'value'  # This will raise an error
except TypeError as e:
    print(f"Error: {e}")

# 9. Checking if an object is hashable
print(hash('string'))  # Valid hashable object
print(hash((1, 2, (2, 3))))  # Valid hashable tuple
try:
    print(hash((1, 2, [2, 3])))  # This will fail because lists are mutable
except TypeError as e:
    print(f"Error: {e}")


In [None]:
# Creating sets
set_example = set([2, 2, 2, 1, 3, 3])  # {1, 2, 3}
print("Set created using set function:", set_example)

set_literal = {2, 2, 2, 1, 3, 3}  # {1, 2, 3}
print("Set created using set literal:", set_literal)

# Set Operations
a = {1, 2, 3, 4, 5}
b = {3, 4, 5, 6, 7, 8}

# Union (a ∪ b)
union_result = a.union(b)
print("Union (a.union(b)):", union_result)

union_result_operator = a | b
print("Union (a | b):", union_result_operator)

# Intersection (a ∩ b)
intersection_result = a.intersection(b)
print("Intersection (a.intersection(b)):", intersection_result)

intersection_result_operator = a & b
print("Intersection (a & b):", intersection_result_operator)

# Difference (a - b)
difference_result = a.difference(b)
print("Difference (a.difference(b)):", difference_result)

difference_result_operator = a - b
print("Difference (a - b):", difference_result_operator)

# Symmetric Difference (a Δ b)
symmetric_difference_result = a.symmetric_difference(b)
print("Symmetric Difference (a.symmetric_difference(b)):", symmetric_difference_result)

symmetric_difference_result_operator = a ^ b
print("Symmetric Difference (a ^ b):", symmetric_difference_result_operator)

# Subset and Superset
is_subset = {1, 2, 3}.issubset(a)
print("Is {1, 2, 3} a subset of a?", is_subset)

is_superset = a.issuperset({1, 2, 3})
print("Is a a superset of {1, 2, 3}?", is_superset)

# Adding and Removing Elements
a.add(6)  # Adding an element to set a
print("After adding 6:", a)

a.remove(6)  # Removing an element from set a
print("After removing 6:", a)

# Set Copy and Update
c = a.copy()  # Copying a set
c |= b        # Union of c and b
print("Union of c and b (c |= b):", c)

d = a.copy()  # Copying a set
d &= b        # Intersection of d and b
print("Intersection of d and b (d &= b):", d)

# Working with immutable elements (hashable types)
# Tuples can be used as set elements, while lists cannot because they are mutable
my_data = [1, 2, 3, 4]
my_set = {tuple(my_data)}  # Convert list to tuple for immutability
print("Set containing tuple:", my_set)

# Checking equality of sets
are_sets_equal = {1, 2, 3} == {3, 2, 1}
print("Are {1, 2, 3} and {3, 2, 1} equal?", are_sets_equal)

# Pop an element (removes arbitrary element from the set)
popped_element = a.pop()
print(f"Popped element: {popped_element}, Remaining set: {a}")

# List Comprehension Example
strings = ['a', 'as', 'bat', 'car', 'dove', 'python']

# Convert strings to uppercase if their length is greater than 2
upper_strings = [x.upper() for x in strings if len(x) > 2]
print("List Comprehension (Uppercase strings with length > 2):", upper_strings)

# Set Comprehension Example
# Create a set of unique string lengths
unique_lengths = {len(x) for x in strings}
print("Set Comprehension (Unique string lengths):", unique_lengths)

# Alternative approach using map function (equivalent to set comprehension)
unique_lengths_map = set(map(len, strings))
print("Set using map function:", unique_lengths_map)

# Dict Comprehension Example
# Create a dictionary that maps each string to its index in the original list
loc_mapping = {val: index for index, val in enumerate(strings)}
print("Dict Comprehension (String to index mapping):", loc_mapping)



In [None]:
# Nested List Comprehensions in Python

# Suppose we have a list of lists containing some English and Spanish names:
all_data = [['John', 'Emily', 'Michael', 'Mary', 'Steven'],
            ['Maria', 'Juan', 'Javier', 'Natalia', 'Pilar']]

# You might have gotten these names from a couple of files and decided to organize them by language.
# Now, suppose we wanted to get a single list containing all names with two or more 'e's in them.

# We could do this with a simple for loop:
names_of_interest = []
for names in all_data:
    enough_es = [name for name in names if name.count('e') >= 2]
    names_of_interest.extend(enough_es)

# Alternatively, we can wrap this whole operation up in a single nested list comprehension:
result = [name for names in all_data for name in names if name.count('e') >= 2]
print(result)  # Output: ['Steven']

# At first, nested list comprehensions are a bit hard to wrap your head around.
# The 'for' parts of the list comprehension are arranged according to the order of nesting,
# and any filter condition is put at the end as before.

# Here is another example where we "flatten" a list of tuples of integers into a simple list of integers:
some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]

# Using nested list comprehension to flatten the list of tuples:
flattened = [x for tup in some_tuples for x in tup]
print(flattened)  # Output: [1, 2, 3, 4, 5, 6, 7, 8, 9]

# Keep in mind that the order of the 'for' expressions would be the same if you wrote a nested for loop:
flattened_loop = []
for tup in some_tuples:
    for x in tup:
        flattened_loop.append(x)
print(flattened_loop)  # Output: [1, 2, 3, 4, 5, 6, 7, 8, 9]

# You can have arbitrarily many levels of nesting, though if you have more than two or three levels of nesting,
# you should probably start to question whether this makes sense from a code readability standpoint.

# It’s important to distinguish the syntax just shown from a list comprehension inside a list comprehension,
# which is also perfectly valid:
nested_result = [[x for x in tup] for tup in some_tuples]
print(nested_result)  # Output: [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# This produces a list of lists, rather than a flattened list of all of the inner elements.

# Key Takeaways:
# 1. Nested list comprehensions are concise but can become hard to read with too many levels of nesting.
# 2. The order of 'for' clauses follows the same order as nested 'for' loops.
# 3. Filter conditions are placed at the end of the comprehension.
# 4. Use nested list comprehensions for simple transformations and flattening, but prefer traditional loops for complex logic.

In [None]:
# 3.2 Functions in Python

# Functions are the primary and most important method of code organization and reuse in Python.
# As a rule of thumb, if you anticipate needing to repeat the same or very similar code more than once,
# it may be worth writing a reusable function. Functions also improve readability by giving a name to a group of statements.

# Functions are declared with the `def` keyword and returned from with the `return` keyword:
def my_function(x, y, z=1.5):
    """
    A simple function that performs a calculation based on the values of x, y, and z.
    - x, y: positional arguments
    - z: keyword argument with a default value of 1.5
    """
    if z > 1:
        return z * (x + y)  # Return the product of z and (x + y) if z > 1
    else:
        return z / (x + y)  # Return the division of z by (x + y) if z <= 1

# There is no issue with having multiple return statements.
# If Python reaches the end of a function without encountering a return statement, `None` is returned automatically.

# Example calls to the function:
result1 = my_function(5, 6, z=0.7)  # z is explicitly passed as 0.7
result2 = my_function(3.14, 7, 3.5)  # z is passed as 3.5 (positional)
result3 = my_function(10, 20)        # z uses the default value of 1.5

print(result1)  # Output: 0.7 / (5 + 6) = 0.0636...
print(result2)  # Output: 3.5 * (3.14 + 7) = 35.49
print(result3)  # Output: 1.5 * (10 + 20) = 45.0

# Each function can have positional arguments and keyword arguments.
# Keyword arguments are most commonly used to specify default values or optional arguments.
# In the function above:
# - x and y are positional arguments.
# - z is a keyword argument with a default value of 1.5.

# The main restriction on function arguments is that keyword arguments must follow positional arguments (if any).
# However, you can specify keyword arguments in any order:
result4 = my_function(x=5, y=6, z=7)  # Using keywords for all arguments
result5 = my_function(y=6, x=5, z=7)  # Order of x and y doesn't matter when using keywords

print(result4)  # Output: 7 * (5 + 6) = 77
print(result5)  # Output: 7 * (5 + 6) = 77

# It is also possible to use keywords for passing positional arguments:
result6 = my_function(x=5, y=6)  # z uses the default value of 1.5
print(result6)  # Output: 1.5 * (5 + 6) = 16.5

# Using keywords for positional arguments can sometimes improve readability,
# especially when dealing with functions that have many arguments.

# Key Takeaways:
# 1. Use functions to organize and reuse code.
# 2. Positional arguments must come before keyword arguments.
# 3. Keyword arguments can be specified in any order and are often used for optional parameters.
# 4. Functions can have multiple return statements, and `None` is returned if no return statement is reached.
# 5. Using keywords for arguments can improve readability, especially in functions with many parameters.

CHAPTER 4 - NumPy Basics: Arrays and Vectorized Computation

In [8]:
# NumPy Fundamentals: High-Performance Array Computing
import numpy as np
import timeit

# 1. Creating Arrays ----------------------------------------------------------
# Basic array creation
arr = np.array([1, 2, 3, 4, 5])          # 1D array
matrix = np.array([[1, 2], [3, 4]])       # 2D array
zeros = np.zeros((3, 3))                  # 3x3 zero matrix
ranged = np.arange(0, 10, 0.5)            # Like range() but with decimals

# 2. Vectorized Operations ----------------------------------------------------
# Fast element-wise operations without loops
arr = np.arange(1, 6)
squares = arr ** 2                         # [1, 4, 9, 16, 25]
sqrt_matrix = np.sqrt(matrix)              # Element-wise square root

# 3. Performance Benchmark vs Python Lists ------------------------------------
size = 1_000_000
np_arr = np.arange(size)
py_list = list(range(size))

# Vectorized operation
%timeit -n 100 np_arr * 2                 # Typical result: ~2ms per loop

# List comprehension equivalent
%timeit -n 100 [x*2 for x in py_list]     # Typical result: ~50ms per loop

# 4. Key Features -------------------------------------------------------------
# Memory efficiency
print("NumPy array size:", np_arr.nbytes / 1e6, "MB")      # ~8MB (int64)
print("Python list size: ", sys.getsizeof(py_list)/1e6, "MB") # ~9MB + element overhead

# Broadcasting example
a = np.array([[1], [10], [100]])
b = np.array([1, 2, 3])
print(a + b)  # [[2, 3, 4], [11,12,13], [101,102,103]]

# 5. Core Advantages ----------------------------------------------------------
"""
1. Contiguous Memory: Data stored in single memory block for CPU-friendly access
2. Vectorization: Operations applied to entire arrays (C-optimized backend)
3. Broadcasting: Smart handling of different shaped arrays
4. UFuncs: Fast mathematical operations (np.sin, np.exp, etc)
5. Memory Efficiency: 4-10x less memory than Python lists for numbers
6. Ecosystem Foundation: Used by Pandas, SciPy, Scikit-learn, etc
"""

# 6. Common Operations -------------------------------------------------------
# Aggregations
print("Mean:", np.mean(np_arr))
print("Max:", np.max(np_arr))

# Filtering
filtered = np_arr[np_arr > 500_000]

# Reshaping
matrix_3d = np_arr.reshape((100, 100, 100))

# 7. Real-world Use Case ------------------------------------------------------
# Image processing example (3D array: height × width × RGB channels)
fake_image = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)
grayscale = fake_image.mean(axis=2)  # Convert to grayscale in one operation

3.79 ms ± 70.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
69 ms ± 728 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
NumPy array size: 8.0 MB


NameError: name 'sys' is not defined

In [None]:
# NumPy ndarray: Multidimensional Array Fundamentals
import numpy as np

# 1. Creating ndarrays --------------------------------------------------------
# From Python list (notice automatic type conversion)
data = np.array([[1, 2, 3], [4, 5.5, 6]])  # ints get converted to floats
print("2D array:\n", data)
print("Shape:", data.shape)  # (2, 3)
print("Data type:", data.dtype)  # float64

# Specialized array creation
zeros = np.zeros((2, 4))        # 2x4 array of 0s
ones = np.ones((3, 2))          # 3x2 array of 1s 
rand_array = np.random.randn(2, 3)  # 2x3 normal distribution

# 2. Key Characteristics ------------------------------------------------------
"""
- Homogeneous: All elements same type (try mixing types to see automatic upcasting)
- Fixed size: Size can't change without making new array
- Contiguous memory: Enables vectorized operations
- Optimized operations: Implemented in C for speed
"""

# 3. Vectorized Operations ----------------------------------------------------
# Element-wise operations (no loops!)
print("\nOriginal array:\n", rand_array)
print("\nMultiply by 10:\n", rand_array * 10)
print("\nAdd arrays:\n", rand_array + rand_array)
print("\nExponential:\n", np.exp(rand_array))

# 4. Performance Demonstration ------------------------------------------------
large_arr = np.arange(1_000_000)
large_list = list(range(1_000_000))

# Vectorized operation
%timeit -n 100 large_arr * 2  # ~2ms on typical hardware

# Python loop equivalent
%timeit -n 100 [x*2 for x in large_list]  # ~50ms - 25x slower!

# 5. Important Attributes -----------------------------------------------------
arr = np.array([[1, 2], [3, 4], [5, 6]])
print("\nArray attributes:")
print("Dimensions:", arr.ndim)     # 2
print("Shape:", arr.shape)        # (3, 2)
print("Data type:", arr.dtype)    # int64
print("Total elements:", arr.size) # 6
print("Memory usage:", arr.nbytes, "bytes")  # 48 bytes (6 elements * 8 bytes each)

# 6. Type Management ----------------------------------------------------------
# Explicit type specification
int_array = np.array([1.5, 2.7, 3.9], dtype=np.int32)  # Truncates to integers
print("\nType conversion:", int_array)  # [1 2 3]

# 7. Why Homogeneous Matters --------------------------------------------------
mixed_array = np.array([1, 2.5, '3'])  # All elements become strings
print("\nType coercion:", mixed_array.dtype)  # <U32 (Unicode string type)

# 8. Best Practices -----------------------------------------------------------
"""
- Always use np.array() instead of Python lists for numerical data
- Pre-allocate arrays when possible (np.zeros/np.empty)
- Use vectorized operations instead of loops
- Be mindful of dtype choices (int32 vs float64 etc)
- Avoid mixing data types in arrays
"""

In [None]:
# NumPy Array Creation Methods
import numpy as np

# ---------------------------
# 1. Basic Array Creation
# ---------------------------

# From Python lists
list_data = [6, 7.5, 8, 0, 1]
arr1d = np.array(list_data)
print("1D Array from list:\n", arr1d)
# Output: [6.  7.5 8.  0.  1. ]

# From nested lists (creates 2D array)
matrix_data = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2d = np.array(matrix_data)
print("\n2D Array from nested lists:\n", arr2d)
"""
Output:
[[1 2 3 4]
 [5 6 7 8]]
"""

# ---------------------------
# 2. Specialized Constructors
# ---------------------------

# Zeros array (float64 by default)
zeros_1d = np.zeros(5)
zeros_2d = np.zeros((3, 2))
print("\nZeros arrays:")
print("1D:", zeros_1d)
print("2D:\n", zeros_2d)

# Ones array with specific dtype
ones_int = np.ones((2, 3), dtype=np.int32)
print("\nOnes array with int32:\n", ones_int)

# Empty array (contains memory garbage!)
empty_arr = np.empty((2, 2))  # Uninitialized
print("\nEmpty array (values may vary):\n", empty_arr)

# Arange (array version of range)
range_arr = np.arange(10, 25, 3)  # Start, stop, step
print("\nArange array:", range_arr)  # [10 13 16 19 22]

# ---------------------------
# 3. Advanced Creation Methods
# ---------------------------

# Full array with fill value
full_arr = np.full((2, 3), 7)  # 2x3 filled with 7s
print("\nFull array:\n", full_arr)

# Identity matrices
eye_matrix = np.eye(3)       # 3x3 identity
identity_matrix = np.identity(4)  # 4x4 identity
print("\nIdentity matrices:")
print("3x3:\n", eye_matrix)
print("4x4:\n", identity_matrix)

# ---------------------------
# 4. Array Copying Methods
# ---------------------------

# Create template array
template = np.array([[1, 2], [3, 4]])

# Ones/zeros like template
ones_like = np.ones_like(template)
zeros_like = np.zeros_like(template)
print("\nArray cloning methods:")
print("Ones like template:\n", ones_like)
print("Zeros like template:\n", zeros_like)

# Asarray vs Array (memory comparison)
original = [1, 2, 3]
arr_copy = np.array(original)    # Creates copy
arr_view = np.asarray(original)  # May create view
print("\nMemory comparison:")
print("Array copy id:", id(arr_copy))
print("Asarray view id:", id(arr_view))

# ---------------------------
# 5. Data Type Management
# ---------------------------

# Type inference and control
mixed_types = np.array([1, 2.5, 3])  # Upcasts to float64
forced_type = np.array([1, 2, 3], dtype=np.float32)
print("\nData type examples:")
print("Inferred dtype:", mixed_types.dtype)   # float64
print("Forced dtype:", forced_type.dtype)    # float32

# ---------------------------
# 6. Specialized Arrays
# ---------------------------

# Linearly spaced arrays
linspace_arr = np.linspace(0, 100, 5)  # 5 values 0-100
print("\nLinspace array:", linspace_arr)  # [0. 25. 50. 75. 100.]

# Random arrays
random_arr = np.random.rand(2, 3)  # Uniform distribution
print("\nRandom array:\n", random_arr)

# ---------------------------
# Key Takeaways & Best Practices
# ---------------------------
"""
**Array Creation Guide:**
1. Use np.array() for converting existing data
2. Prefer np.zeros()/np.ones() for initialized arrays
3. Use np.empty() for uninitialized arrays (caution!)
4. np.arange() for numerical sequences
5. np.full() for constant-filled arrays
6. *_like functions for cloning shapes/dtypes

**Best Practices:**
- Always specify dtype when precision matters
- Use asarray() to avoid unnecessary copies
- Prefer linspace over arange for floating point ranges
- Initialize with zeros/ones unless performance critical
- Check .flags.owndata to verify array ownership

**Common dtypes:**
- np.int32: 32-bit integer
- np.float64: Double precision float (default)
- np.bool_: Boolean values
- np.complex128: Complex numbers
"""

# ---------------------------
# 7. Verification Methods
# ---------------------------
arr = np.array([[1, 2], [3, 4]])
print("\nArray verification:")
print("Dimensions:", arr.ndim)        # 2
print("Shape:", arr.shape)           # (2, 2)
print("Data type:", arr.dtype)       # int64
print("Total elements:", arr.size)    # 4
print("Memory usage:", arr.nbytes, "bytes")  # 32 bytes (4 elements * 8 bytes)

In [None]:
# NumPy Data Types (dtypes) Comprehensive Guide
import numpy as np

# =============================================================================
# 1. Basic dtype Operations
# =============================================================================
# Default dtype inference
arr_default_int = np.array([1, 2, 3])           # int64
arr_default_float = np.array([1.0, 2.5, 3.7])    # float64
arr_bool = np.array([True, False, True])         # bool

# Explicit dtype declaration
arr_int16 = np.array([1, 2, 3], dtype=np.int16)
arr_float32 = np.array([1, 2, 3], dtype=np.float32)
arr_uint8 = np.array([255, 0, 127], dtype=np.uint8)

print("\nBasic dtypes:")
print(f"Default int: {arr_default_int.dtype}")      # int64
print(f"Explicit float32: {arr_float32.dtype}")     # float32
print(f"Uint8 values: {arr_uint8}")                # [255   0 127]

# =============================================================================
# 2. Type Conversion & Casting
# =============================================================================
# Safe conversion
original_ints = np.array([1, 2, 3, 4], dtype=np.int32)
converted_floats = original_ints.astype(np.float64)

# Lossy conversion (truncation)
decimals = np.array([3.7, -1.2, 2.5])
truncated_ints = decimals.astype(np.int32)

# String conversion
numeric_strings = np.array(['1.25', '-9.6', '42'])
converted_floats = numeric_strings.astype(np.float64)

print("\nType Conversion Results:")
print(f"Int to float: {converted_floats}")        # [1.0, 2.0, 3.0, 4.0]
print(f"Float truncation: {truncated_ints}")      # [ 3 -1  2]
print(f"String conversion: {converted_floats}")   # [ 1.25 -9.6  42.  ]

# =============================================================================
# 3. dtype Best Practices & Pitfalls
# =============================================================================
# Memory optimization example
large_ints = np.ones(1000000, dtype=np.int64)  # 8MB
small_ints = large_ints.astype(np.int8)        # 1MB (but potential overflow!)

print(f"\nMemory savings: {large_ints.nbytes/1e6}MB -> {small_ints.nbytes/1e6}MB")

# Overflow demonstration
overflow_arr = np.array([250, 251, 252], dtype=np.uint8)
overflow_arr += 100  # Values wrap around (250+100=94)
print(f"Overflow example: {overflow_arr}")  # [94 95 96]

# String truncation example
names = np.array(['Alice', 'Bob', 'Charlie'], dtype='S3')
print(f"String truncation: {names}")  # [b'Ali' b'Bob' b'Cha']

# =============================================================================
# 4. Advanced dtype Usage
# =============================================================================
# Structured dtype for complex data
person_dtype = np.dtype([
    ('name', 'U32'),  # Unicode string (32 chars)
    ('age', 'i4'),    # 4-byte integer
    ('height', 'f4'), # 4-byte float
    ('active', '?')   # Boolean
])

people = np.array([
    ('Alice', 30, 1.65, True),
    ('Bob', 25, 1.80, False)
], dtype=person_dtype)

print("\nStructured dtype example:")
print(people[0])  # ('Alice', 30, 1.65, True)

# Memory-mapped arrays for large datasets
try:
    # Create/Write
    mmap_arr = np.memmap('temp.dat', dtype=np.float32, mode='w+', shape=(5,))
    mmap_arr[:] = np.random.randn(5)
    print("\nMemory-mapped array:", mmap_arr)
    
    # Read back
    mmap_read = np.memmap('temp.dat', dtype=np.float32, mode='r', shape=(5,))
    print("Read from disk:", mmap_read)
finally:
    del mmap_arr  # Flush to disk
    del mmap_read

# =============================================================================
# 5. Type Safety & Error Handling
# =============================================================================
# Safe conversion function
def safe_convert(arr, new_type):
    if np.issubdtype(arr.dtype, np.number) and np.issubdtype(new_type, np.number):
        return arr.astype(new_type)
    raise ValueError("Non-numeric conversion")

# Mixed data handling
mixed_data = np.array([1, "two", 3.0], dtype=np.object_)
print("\nMixed data handling:")
try:
    safe_convert(mixed_data, np.float64)
except ValueError as e:
    print(f"Conversion error: {e}")

# =============================================================================
# 6. Key Takeaways & When to Use
# =============================================================================
"""
**NumPy dtype Cheat Sheet:**

1. **Common dtypes:**
   - Integer: int8/16/32/64, uint8/16/32/64
   - Float: float16/32/64/128
   - Complex: complex64/128/256
   - Others: bool_, object_, string_, unicode_

2. **Best Practices:**
   - Use smallest dtype that fits data range
   - Prefer float32 for ML/DL, float64 for precision
   - Use astype() carefully (creates copies)
   - Handle strings with object dtype or dedicated string dtypes

3. **Pitfalls:**
   - Overflow in integer types
   - Precision loss in float conversions
   - String truncation in fixed-width dtypes
   - Accidental data copies with astype()

4. **Advanced Features:**
   - Structured dtypes for complex records
   - Memory mapping for large datasets
   - Custom dtype creation
"""

# Cleanup temporary file
import os
if os.path.exists('temp.dat'):
    os.remove('temp.dat')

In [None]:
# NumPy Boolean Arrays, Sorting, and Quantiles
import numpy as np

# =============================================
# 1. Boolean Array Methods
# =============================================
# Create random data and boolean mask
data = np.random.randn(100)
positive_mask = data > 0

print("Boolean array operations:")
print(f"Number of positive values: {positive_mask.sum()}")
print(f"Any positive values? {positive_mask.any()}")
print(f"All positive values? {positive_mask.all()}\n")

# Edge cases
all_true = np.array([True, True, True])
all_false = np.array([False, False, False])
print("All True array - any:", all_true.any(), "all:", all_true.all())
print("All False array - any:", all_false.any(), "all:", all_false.all())

# Non-boolean coercion
numbers = np.array([0, 1, -2, 3])
print("\nNon-boolean array evaluation:")
print("Any non-zero?", numbers.any())
print("All non-zero?", numbers.all())

# =============================================
# 2. Sorting Methods
# =============================================
# 1D sorting
arr = np.random.randn(6)
print("\nOriginal 1D array:", np.round(arr, 4))
arr.sort()
print("Sorted in-place:", np.round(arr, 4))

# 2D sorting
matrix = np.random.randn(5, 3)
print("\nOriginal 2D array:")
print(np.round(matrix, 4))

matrix.sort(1)  # Sort each row
print("\nSorted along rows (axis=1):")
print(np.round(matrix, 4))

matrix.sort(0)  # Sort each column
print("\nSorted along columns (axis=0):")
print(np.round(matrix, 4))

# Non-destructive sort
unsorted = np.random.randn(5)
sorted_copy = np.sort(unsorted)
print("\nOriginal array remains unchanged:", np.round(unsorted, 4))

# =============================================
# 3. Quantile Calculation
# =============================================
# Generate and sort large dataset
large_data = np.random.randn(1000)
large_data.sort()

# Calculate quantiles
percentiles = [5, 25, 50, 75, 95]
quantiles = [large_data[int(p/100 * len(large_data))] for p in percentiles]

print("\nQuantiles:")
for p, q in zip(percentiles, quantiles):
    print(f"{p}th percentile: {q:.4f}")

# =============================================
# 4. Performance Considerations
# =============================================
# Timing different sort methods
large_array = np.random.rand(1_000_000)

# In-place sort timing
%timeit large_array.sort()          # ~10ms

# Copy sort timing
%timeit np.sort(large_array)        # ~15ms 

# =============================================
# Key Takeaways
# =============================================
"""
NUMPY BOOLEAN & SORTING ESSENTIALS:

Boolean Arrays:
- True = 1, False = 0 in arithmetic operations
- sum() counts True values
- any() = ∃ True, all() = ∀ True
- Non-zero values evaluate as True

Sorting Methods:
1. arr.sort():
   - In-place modification
   - Returns None
   - Axis parameter for multidimensional arrays
   
2. np.sort(arr):
   - Returns sorted copy
   - Original array preserved
   - More memory intensive

Quantile Calculation:
1. Sort data
2. Access value at index = len(data) * percentile/100
3. More accurate methods exist (linear interpolation), 
   but sorted approach is simple

Best Practices:
- Use in-place sorting for memory efficiency
- Prefer boolean arrays over lists for masking
- Use np.quantile() for precise percentile calculations
- Reserve axis=0 for columns, axis=1 for rows

Common Pitfalls:
- Forgetting sort is in-place
- Mixing axis directions (0=columns, 1=rows)
- Assuming all() returns True for empty arrays (returns True!)
"""

# =============================================
# 5. Advanced: Structured Array Sorting
# =============================================
# Sort structured data by specific field
dtype = [('name', 'S10'), ('age', int), ('score', float)]
people = np.array([('Alice', 25, 88.5), ('Bob', 32, 94.0), ('Charlie', 28, 76.5)], dtype=dtype)
people.sort(order='age')
print("\nStructured array sorted by age:")
print(people)


In [3]:
import numpy as np

"""
NUMPY SET OPERATIONS DEMO
=========================
This executable script demonstrates various NumPy set operations."""

# Sample data arrays
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
values = np.array([6, 0, 0, 3, 2, 5, 6])
x = np.array([1, 2, 3, 4])
y = np.array([3, 4, 5, 6])

def main():
    # Demo 1: Basic unique values
    print("=== UNIQUE VALUES ===")
    print("Names:", np.unique(names))
    print("Integers:", np.unique(ints))
    print("Pure Python equivalent:", sorted(set(names)))
    
    # Demo 2: Membership testing
    print("\n=== MEMBERSHIP TESTING ===")
    mask = np.in1d(values, [2, 3, 6])
    print("Original array:", values)
    print("Membership mask:", mask)
    print("Filtered values:", values[mask])
    
    # Demo 3: Set operations
    print("\n=== SET OPERATIONS ===")
    print("Array x:", x)
    print("Array y:", y)
    print("Intersection:", np.intersect1d(x, y))
    print("Union:", np.union1d(x, y))
    print("Elements in x not in y:", np.setdiff1d(x, y))
    print("Symmetric difference:", np.setxor1d(x, y))
    
    # Demo 4: Performance comparison
    print("\n=== PERFORMANCE TIP ===")
    large_array = np.random.randint(0, 1000, 10000)
    print("Calculating unique values for 10,000 elements...")
    _ = np.unique(large_array)
    print("NumPy unique() completed instantly!")

"""
KEY TAKEAWAYS 💡
================
🌟 np.unique() faster than sorted(set()) for large data
🌟 Set functions work with 1D arrays only
🌟 in1d() creates boolean masks for filtering
🌟 Results always sorted in NumPy
🌟 Great for data cleaning/preprocessing"""

if __name__ == "__main__":
    main()

=== UNIQUE VALUES ===
Names: ['Bob' 'Joe' 'Will']
Integers: [1 2 3 4]
Pure Python equivalent: [np.str_('Bob'), np.str_('Joe'), np.str_('Will')]

=== MEMBERSHIP TESTING ===
Original array: [6 0 0 3 2 5 6]
Membership mask: [ True False False  True  True False  True]
Filtered values: [6 3 2 6]

=== SET OPERATIONS ===
Array x: [1 2 3 4]
Array y: [3 4 5 6]
Intersection: [3 4]
Union: [1 2 3 4 5 6]
Elements in x not in y: [1 2]
Symmetric difference: [1 2 5 6]

=== PERFORMANCE TIP ===
Calculating unique values for 10,000 elements...
NumPy unique() completed instantly!


  mask = np.in1d(values, [2, 3, 6])


In [4]:
"""
NUMPY FILE I/O & LINEAR ALGEBRA GUIDE
=====================================
This script demonstrates NumPy's file operations and linear algebra capabilities."""

import numpy as np
from numpy.linalg import inv, qr, det, eig

# ---------------------------
# 4.4 File Input/Output Demo
# ---------------------------

def file_io_demo():
    # Single array save/load
    arr = np.arange(10)
    np.save('single_array.npy', arr)
    loaded_arr = np.load('single_array.npy')
    
    # Multiple arrays archive
    np.savez('multi_arrays.npz', arr1=arr, arr2=arr*2)
    archive = np.load('multi_arrays.npz')
    
    # Compressed archive
    np.savez_compressed('compressed_arrays.npz', big_array=np.random.randn(1000, 1000))
    
    return {
        'loaded_single': loaded_arr,
        'archive_contents': list(archive.keys()),
        'compressed_size': len(archive['arr1'])
    }

# ---------------------------
# 4.5 Linear Algebra Demo
# ---------------------------

def linear_algebra_demo():
    # Matrix creation
    A = np.array([[1, 2], [3, 4]])
    B = np.array([[5, 6], [7, 8]])
    
    # Matrix multiplication
    dot_product = A.dot(B)
    at_operator = A @ B
    
    # Matrix operations
    matrix_inv = inv(A)
    q, r = qr(A)
    eigenvalues, eigenvectors = eig(A)
    
    return {
        'dot_product': dot_product,
        'matrix_inverse': matrix_inv,
        'qr_r': r,
        'eigenvalues': eigenvalues
    }

# ---------------------------
# Key Takeaways & Help Table
# ---------------------------

LIN_ALG_FUNCTIONS = """
NUMPY.LINALG FUNCTION CHEATSHEET
--------------------------------
| Function | Description                      | Example Use Case           |
|----------|----------------------------------|----------------------------|
| inv()    | Matrix inverse                   | Solving linear equations   |
| qr()     | QR decomposition                | Matrix factorization       |
| det()    | Determinant                     | Matrix invertibility check |
| eig()    | Eigenvalues/vectors             | Spectral analysis          |
| svd()    | Singular Value Decomposition    | Dimensionality reduction   |
| solve()  | Solve linear system Ax = b      | Optimization problems      |"""

KEY_TAKEAWAYS = """
KEY TAKEAWAYS 🔑
================
📁 File I/O:
- Use .npy for single arrays, .npz for archives
- savez_compressed reduces file size dramatically
- 10-100x faster than text formats for large data

🧮 Linear Algebra:
- @ operator preferred for matrix multiplication
- Built on optimized BLAS/LAPACK implementations
- det() helps check matrix invertibility
- SVD/Eigendecomposition crucial for ML algorithms

💡 Pro Tips:
- Use np.memmap for memory-mapped large arrays
- pinv() handles non-square matrices
- lstsq() for regression problems"""

# ---------------------------
# Main Execution
# ---------------------------

if __name__ == "__main__":
    print("🔷 File I/O Results 🔷")
    file_results = file_io_demo()
    print(f"Loaded array: {file_results['loaded_single']}")
    print(f"Archive contents: {file_results['archive_contents']}")
    
    print("\n🔷 Linear Algebra Results 🔷")
    lin_alg_results = linear_algebra_demo()
    print("Matrix product:\n", lin_alg_results['dot_product'])
    print("\nQR R matrix:\n", lin_alg_results['qr_r'])
    print("\nEigenvalues:", lin_alg_results['eigenvalues'])
    
    print(LIN_ALG_FUNCTIONS)
    print(KEY_TAKEAWAYS)

🔷 File I/O Results 🔷
Loaded array: [0 1 2 3 4 5 6 7 8 9]
Archive contents: ['arr1', 'arr2']

🔷 Linear Algebra Results 🔷
Matrix product:
 [[19 22]
 [43 50]]

QR R matrix:
 [[-3.16227766 -4.42718872]
 [ 0.         -0.63245553]]

Eigenvalues: [-0.37228132  5.37228132]

NUMPY.LINALG FUNCTION CHEATSHEET
--------------------------------
| Function | Description                      | Example Use Case           |
|----------|----------------------------------|----------------------------|
| inv()    | Matrix inverse                   | Solving linear equations   |
| qr()     | QR decomposition                | Matrix factorization       |
| det()    | Determinant                     | Matrix invertibility check |
| eig()    | Eigenvalues/vectors             | Spectral analysis          |
| svd()    | Singular Value Decomposition    | Dimensionality reduction   |
| solve()  | Solve linear system Ax = b      | Optimization problems      |

KEY TAKEAWAYS 🔑
📁 File I/O:
- Use .npy for single array

In [None]:
"""
NUMPY RANDOM & RANDOM WALKS DEMO
==================================
Demonstrates pseudorandom number generation and random walk simulations."""

import numpy as np
import matplotlib.pyplot as plt
from timeit import timeit

# ---------------------------
# 4.6 Pseudorandom Number Generation
# ---------------------------

def random_demo():
    # Seed for reproducibility
    np.random.seed(123)
    
    # Generate random samples
    samples = np.random.normal(size=(4, 4))
    rng = np.random.RandomState(456)
    custom_samples = rng.randn(5)
    
    # Performance comparison
    py_time = timeit('[np.random.normal() for _ in range(1000)]', number=1000)
    np_time = timeit('np.random.normal(size=1000)', globals=globals(), number=1000)
    
    return {
        'seeded_samples': samples,
        'custom_rng_samples': custom_samples,
        'py_vs_np_times': (py_time, np_time)
    }

# ---------------------------
# 4.7 Random Walk Simulation
# ---------------------------

def random_walk_demo():
    # Single walk
    nsteps = 1000
    steps = np.where(np.random.randint(0, 2, nsteps), 1, -1)
    walk = steps.cumsum()
    
    # Multiple walks
    nwalks = 5000
    draws = np.random.randint(0, 2, (nwalks, nsteps))
    walks = np.where(draws, 1, -1).cumsum(axis=1)
    
    # Analysis
    hits30 = (np.abs(walks) >= 30).any(axis=1)
    crossing_times = (np.abs(walks[hits30]) >= 30).argmax(axis=1)
    
    return {
        'walk_plot_data': walk[:100],
        'max_walk': walks.max(),
        'min_walk': walks.min(),
        'crossing_time_avg': crossing_times.mean()
    }

# ---------------------------
# Visualization & Tables
# ---------------------------

RANDOM_FUNCTIONS = """
NUMPY.RANDOM FUNCTION CHEATSHEET
---------------------------------
| Function      | Description                      | Example Use               |
|---------------|----------------------------------|---------------------------|
| seed()        | Set global random seed          | np.random.seed(123)       |
| RandomState() | Create isolated RNG             | rng = np.random.RandomState()|
| rand()        | Uniform distribution [0,1)      | np.random.rand(2,3)       |
| randn()       | Standard normal distribution    | np.random.randn(100)      |
| randint()     | Random integers                 | np.random.randint(0,10,5) |
| normal()      | Normal distribution             | np.random.normal(mean, std, size)|
| binomial()    | Binomial distribution           | np.random.binomial(n,p,size) |"""

KEY_TAKEAWAYS = """
KEY INSIGHTS 🔍
===============
🎲 Random Generation:
- NumPy is 10-100x faster than pure Python for large samples
- Use RandomState for reproducible, isolated streams
- Seed management is crucial for reproducible results

🚶 Random Walks:
- Vectorized operations enable efficient simulations
- cumsum() is powerful for path calculations
- argmax() finds first occurrence efficiently
- Any()/All() help analyze multidimensional results

📊 Statistical Analysis:
- 68-95-99.7 rule applies to normal distributions
- Crossing times help understand walk behavior
- Multiple walks enable probabilistic forecasting"""

# ---------------------------
# Main Execution
# ---------------------------

if __name__ == "__main__":
    # Generate data
    random_data = random_demo()
    walk_data = random_walk_demo()
    
    # Print results
    print("🔶 Random Number Generation Results 🔶")
    print("Seeded normal samples:\n", random_data['seeded_samples'])
    print(f"\nPython vs NumPy times: {random_data['py_vs_np_times'][0]:.2f}s vs {random_data['py_vs_np_times'][1]:.2f}s")
    
    print("\n🔶 Random Walk Analysis 🔶")
    print(f"Max walk position: {walk_data['max_walk']}")
    print(f"Min walk position: {walk_data['min_walk']}")
    print(f"Average crossing time: {walk_data['crossing_time_avg']:.1f} steps")
    
    # Plot first 100 steps
    plt.figure(figsize=(10, 4))
    plt.plot(walk_data['walk_plot_data'])
    plt.title("First 100 Steps of Random Walk")
    plt.xlabel("Step")
    plt.ylabel("Position")
    plt.show()
    
    print(RANDOM_FUNCTIONS)
    print(KEY_TAKEAWAYS)

CHAPTER 5 - PANDAS


In [None]:
"""
PANDAS SERIES ESSENTIALS
=========================
A comprehensive guide to pandas Series with practical examples."""

import pandas as pd
import numpy as np

# Create basic Series with default index
basic_series = pd.Series([4, 7, -5, 3])
print("🔷 Basic Series:\n", basic_series)

# Series with custom index
indexed_series = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print("\n🔷 Custom Index Series:\n", indexed_series)

# Accessing elements
print("\n🔷 Element Access:")
print("Value at 'a':", indexed_series['a'])
indexed_series['d'] = 6  # Modify value
print("Modified Series:\n", indexed_series[['c', 'a', 'd']])

# Series from dictionary
state_data = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
states_series = pd.Series(state_data)
print("\n🔷 From Dictionary:\n", states_series)

# Handling missing data
states_list = ['California', 'Ohio', 'Oregon', 'Texas']
missing_series = pd.Series(state_data, index=states_list)
print("\n🔷 Series with Missing Data:\n", missing_series)
print("\nNull Check:\n", missing_series.isnull())

# Vector operations
print("\n🔷 Vector Operations:")
print("Filtered Series:\n", indexed_series[indexed_series > 0])
print("Scalar Multiplication:\n", indexed_series * 2)
print("Exponential Transformation:\n", np.exp(indexed_series))

# Index alignment
combined_series = states_series + missing_series
print("\n🔷 Aligned Addition:\n", combined_series)

# Naming conventions
missing_series.name = "State Population"
missing_series.index.name = "US States"
print("\n🔷 Named Series:\n", missing_series)

# Index modification
basic_series.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print("\n🔷 Reindexed Series:\n", basic_series)

# Cheatsheet Table
SERIES_FUNCTIONS = """
📋 SERIES OPERATIONS CHEATSHEET
--------------------------------
| Method            | Description                 | Example                   |
|-------------------|-----------------------------|---------------------------|
| pd.Series()       | Create new Series           | pd.Series(data, index)    |
| .values           | Get NumPy array             | series.values             |
| .index            | Get index object            | series.index              |
| .isnull()         | Detect missing values       | series.isnull()           |
| .notnull()        | Detect non-missing values   | series.notnull()          |
| .name             | Series name attribute       | series.name = "Revenue"   |
| .index.name       | Index name attribute        | series.index.name = "City"|
| [label]           | Label-based indexing        | series['London']          |
| .loc[]            | Explicit label-based access | series.loc['Paris']       |"""

KEY_TAKEAWAYS = """
🌟 KEY INSIGHTS 🌟
------------------
1. Index Flexibility: Series can have any hashable type as index (strings, dates, etc)
2. Data Alignment: Automatic index matching in operations (like SQL JOIN)
3. Missing Data: NaN represents missing values - use isnull()/notnull() to detect
4. Vectorization: Operate on entire series without loops (NumPy-style)
5. Hybrid Nature: Combines dictionary-like access with array operations
6. Size Immutability: Can't change size, but can modify values and index
7. Name Metadata: Add contextual information with name attributes

💡 Pro Tips:
- Convert between Series and dict with to_dict()/from_dict()
- Use .copy() when creating modified copies to preserve original data
- Prefer .loc[] for explicit label-based indexing
- Handle missing data early with .dropna() or .fillna()"""

if __name__ == "__main__":
    print(SERIES_FUNCTIONS)
    print(KEY_TAKEAWAYS)

In [1]:
"""
PANDAS DATAFRAME ESSENTIALS
============================
A comprehensive guide to pandas DataFrame with practical examples."""

import pandas as pd
import numpy as np

# ---------------------------
# DataFrame Creation
# ---------------------------

def create_dataframes():
    # From dictionary of lists
    data = {
        'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
    }
    df1 = pd.DataFrame(data)
    
    # With specified column order and index
    df2 = pd.DataFrame(data, 
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
    
    # From nested dictionary
    pop_data = {
        'Nevada': {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}
    }
    df3 = pd.DataFrame(pop_data)
    
    return df1, df2, df3

# ---------------------------
# Column Operations
# ---------------------------

def column_operations(df):
    # Add new column
    df['debt'] = np.arange(6.)
    
    # Modify with Series (alignment)
    val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
    df['debt'] = val
    
    # Create boolean column
    df['eastern'] = df.state == 'Ohio'
    
    # Delete column
    del df['eastern']
    
    return df

# ---------------------------
# DataFrame Properties
# ---------------------------

def dataframe_properties(df):
    # Access columns
    year_col = df.year        # Attribute-style
    state_col = df['state']   # Dict-style
    
    # Row access
    row = df.loc['three']
    
    # Metadata
    df.index.name = 'entry'
    df.columns.name = 'info'
    
    # Values array
    values = df.values
    
    return {
        'year_col': year_col.head(),
        'row_data': row,
        'values_sample': values[:2]
    }

# ---------------------------
# Cheatsheet & Takeaways
# ---------------------------

DATAFRAME_CHEATSHEET = """
📋 DATAFRAME CREATION CHEATSHEET
---------------------------------
| Source                 | Example                          |
|------------------------|----------------------------------|
| Dict of lists          | pd.DataFrame({'col': [1,2,3]})   |
| List of dicts          | pd.DataFrame([{'a':1}, {'a':2}])|
| Nested dictionary      | pd.DataFrame({'A': {1: 'a'}})   |
| 2D NumPy array         | pd.DataFrame(np.array([[1,2]])) |
| CSV file               | pd.read_csv('data.csv')          |
| Excel file             | pd.read_excel('data.xlsx')       |

🔧 KEY OPERATIONS:
- Add column: df['new'] = values
- Delete column: del df['col']
- Access column: df.col or df['col']
- Access row: df.loc[index]
- Transpose: df.T
- Head/Tail: df.head(3), df.tail(2)
"""

KEY_TAKEAWAYS = """
🌟 ESSENTIAL INSIGHTS 🌟
1. Heterogeneous Data: Columns can have different data types
2. Flexible Indexing: Both row and column labels support complex operations
3. Alignment: Operations automatically align data by index/columns
4. Missing Data: NaN represents missing values (handle with dropna/fillna)
5. Column Types: Access via attribute (valid names) or dict-style (any names)
6. Performance: Underlying NumPy arrays enable vectorized operations

💡 PRO TIPS:
- Use .copy() when creating DataFrame copies to avoid view vs copy issues
- Prefer .loc[] for explicit label-based indexing
- Set index/column names for better visualizations and merges
- Use .assign() for chainable column creation
"""

# ---------------------------
# Main Execution
# ---------------------------

if __name__ == "__main__":
    # Create and demonstrate DataFrames
    df1, df2, df3 = create_dataframes()
    print("🔷 Basic DataFrame:")
    print(df1.head())
    
    print("\n🔷 DataFrame with Custom Index:")
    print(df2)
    
    # Column operations demo
    modified_df = column_operations(df2.copy())
    print("\n🔷 Modified DataFrame:")
    print(modified_df)
    
    # Properties demonstration
    props = dataframe_properties(df2.copy())
    print("\n🔷 DataFrame Properties:")
    print("Year column sample:\n", props['year_col'])
    print("\nRow 'three' data:\n", props['row_data'])
    print("\nValues array sample:\n", props['values_sample'])
    
    # Show transposed nested dict DataFrame
    print("\n🔷 Transposed Nested Dict DataFrame:")
    print(df3.T)
    
    # Display cheatsheet and takeaways
    print(DATAFRAME_CHEATSHEET)
    print(KEY_TAKEAWAYS)

🔷 Basic DataFrame:
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9

🔷 DataFrame with Custom Index:
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN

🔷 Modified DataFrame:
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN

🔷 DataFrame Properties:
Year column sample:
 entry
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

Row 'three' data:
 info
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

Values array sample:
 [[2000 'Ohio' 1.5 nan]
 [2001 'Ohio' 1.7 nan]]

🔷 Transposed Nested Dict DataFrame:
        200

In [2]:
import pandas as pd
import numpy as np

# --- Sorting Examples ---

# Sorting Series by index
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
print("Sorted Series by index:\n", obj.sort_index())

# Sorting DataFrame by index (rows)
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
print("\nDataFrame sorted by row index:\n", frame.sort_index())

# Sorting DataFrame columns
print("\nDataFrame sorted by column index:\n", frame.sort_index(axis=1))

# Sorting Series by values (handles NaN)
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
print("\nSeries sorted by values (NaN at end):\n", obj.sort_values())

# Sorting DataFrame by column values
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print("\nDataFrame sorted by column 'b':\n", frame.sort_values(by='b'))
print("\nDataFrame sorted by multiple columns:\n", frame.sort_values(by=['a', 'b']))

# --- Ranking Examples ---

# Default ranking (average for ties)
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print("\nDefault ranking (average ties):\n", obj.rank())

# Ranking by first occurrence
print("\nRanking by first occurrence:\n", obj.rank(method='first'))

# Descending ranking with max method
print("\nDescending ranking (max method):\n", obj.rank(ascending=False, method='max'))

# Ranking across DataFrame columns
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 
                     'a': [0, 1, 0, 1],
                     'c': [-2, 5, 8, -2.5]})
print("\nDataFrame ranked across columns:\n", frame.rank(axis='columns'))

"""
Key Takeaways:
• sort_index() sorts by index labels (ascending by default)
• sort_values() sorts by data values, handles NaN by sending to end
• DataFrame sorting: use `by` parameter for column-based sorting
• rank() handles ties with methods: 'average' (default), 'min', 'max', 'first', 'dense'
• Ranking can be applied to both Series and DataFrame axes
• ascending=False reverses sort/rank order
• axis=1 parameter operates on columns instead of rows
"""

Sorted Series by index:
 a    1
b    2
c    3
d    0
dtype: int64

DataFrame sorted by row index:
        d  a  b  c
one    4  5  6  7
three  0  1  2  3

DataFrame sorted by column index:
        a  b  c  d
three  1  2  3  0
one    5  6  7  4

Series sorted by values (NaN at end):
 4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

DataFrame sorted by column 'b':
    b  a
2 -3  0
3  2  1
0  4  0
1  7  1

DataFrame sorted by multiple columns:
    b  a
2 -3  0
0  4  0
3  2  1
1  7  1

Default ranking (average ties):
 0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranking by first occurrence:
 0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Descending ranking (max method):
 0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

DataFrame ranked across columns:
      b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0


"\nKey Takeaways:\n• sort_index() sorts by index labels (ascending by default)\n• sort_values() sorts by data values, handles NaN by sending to end\n• DataFrame sorting: use `by` parameter for column-based sorting\n• rank() handles ties with methods: 'average' (default), 'min', 'max', 'first', 'dense'\n• Ranking can be applied to both Series and DataFrame axes\n• ascending=False reverses sort/rank order\n• axis=1 parameter operates on columns instead of rows\n"

In [None]:
import pandas as pd
import numpy as np

# --- Series with Duplicate Indices ---
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print("Series with duplicate indices:\n", obj)

# Check index uniqueness
print("\nIs index unique?", obj.index.is_unique)  # False

# Data selection behavior
print("\nSelecting 'a' (multiple entries):\n", obj['a'])  # Returns Series
print("\nSelecting 'c' (single entry):\n", obj['c'])      # Returns scalar

# --- DataFrame with Duplicate Row Indices ---
df = pd.DataFrame(np.random.randn(4, 3), 
                  index=['a', 'a', 'b', 'b'],
                  columns=['X', 'Y', 'Z'])
print("\nDataFrame with duplicate row indices:\n", df)

# Indexing with duplicate labels
print("\nSelecting 'b' rows:\n", df.loc['b'])  # Returns DataFrame

"""
Key Takeaways:
• Pandas allows duplicate labels in indices (is_unique = False)
• Indexing behavior changes with duplicates:
   - Multiple entries return Series/DataFrame
   - Single entries return scalar value
• This affects:
   - Data selection (loc/iloc)
   - Aggregation operations
   - Merge/join operations
• Use index.duplicated() to identify duplicates
• Consider using reset_index() for unique indices when needed
"""

In [3]:
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import datetime

# --- Example Setup (using mock data if Yahoo API unavailable) ---
try:
    # Attempt to fetch real stock data (may fail due to API changes)
    all_data = {
        ticker: web.get_data_yahoo(ticker, start=datetime.datetime(2020, 1, 1))
        for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']
    }
    price = pd.DataFrame({ticker: data['Adj Close'] 
                         for ticker, data in all_data.items()})
except:
    # Fallback mock data if API fails
    dates = pd.date_range('20200101', periods=5)
    price = pd.DataFrame({
        'AAPL': [100, 101, 102, 103, 104],
        'IBM': [150, 152, 151, 153, 155],
        'MSFT': [200, 202, 201, 203, 205],
        'GOOG': [1000, 1005, 1003, 1008, 1010]
    }, index=dates)

# Calculate percent changes
returns = price.pct_change()

# --- Correlation and Covariance Calculations ---
# Correlation between two Series
msft_ibm_corr = returns['MSFT'].corr(returns['IBM'])
print(f"MSFT-IBM Correlation: {msft_ibm_corr:.4f}")

# Covariance between two Series
msft_ibm_cov = returns['MSFT'].cov(returns['IBM'])
print(f"MSFT-IBM Covariance: {msft_ibm_cov:.6f}")

# Full correlation matrix
corr_matrix = returns.corr()
print("\nCorrelation Matrix:\n", corr_matrix)

# Full covariance matrix
cov_matrix = returns.cov()
print("\nCovariance Matrix:\n", cov_matrix)

# Pairwise correlation with a Series
corr_with_ibm = returns.corrwith(returns['IBM'])
print("\nCorrelation with IBM:\n", corr_with_ibm)

# Pairwise correlation with another DataFrame (example with shifted returns)
shifted = returns.shift(1)
corr_with_shifted = returns.corrwith(shifted)
print("\nCorrelation with Shifted Returns:\n", corr_with_shifted)

"""
Key Takeaways:
• corr() computes Pearson correlation coefficient between Series/DataFrame columns
• cov() computes covariance between data points
• DataFrame.corr()/cov() return full matrices showing pairwise relationships
• corrwith() enables:
   - Column-wise correlation with a Series
   - Row-wise correlation with axis='rows'
   - Cross-correlation between two DataFrames
• Automatic handling of:
   - Missing values (aligned by index)
   - Non-overlapping indices (excluded from calculations)
• pct_change() is commonly used to calculate returns in financial time series
• Correlation values range [-1, 1], measuring linear relationships
• Covariance values depend on data scale (use correlation for standardized comparison)
"""

MSFT-IBM Correlation: 1.0000
MSFT-IBM Covariance: 0.000074

Correlation Matrix:
           AAPL       IBM      MSFT      GOOG
AAPL  1.000000 -0.243195 -0.245647  0.083972
IBM  -0.243195  1.000000  0.999996  0.907900
MSFT -0.245647  0.999996  1.000000  0.906824
GOOG  0.083972  0.907900  0.906824  1.000000

Covariance Matrix:
               AAPL           IBM          MSFT          GOOG
AAPL  1.571054e-08 -3.017291e-07 -2.291559e-07  3.482984e-08
IBM  -3.017291e-07  9.797924e-05  7.366969e-05  2.973894e-05
MSFT -2.291559e-07  7.366969e-05  5.539197e-05  2.233400e-05
GOOG  3.482984e-08  2.973894e-05  2.233400e-05  1.095067e-05

Correlation with IBM:
 AAPL   -0.243195
IBM     1.000000
MSFT    0.999996
GOOG    0.907900
dtype: float64

Correlation with Shifted Returns:
 AAPL    1.000000
IBM    -0.509879
MSFT   -0.507432
GOOG   -0.824131
dtype: float64


"\nKey Takeaways:\n• corr() computes Pearson correlation coefficient between Series/DataFrame columns\n• cov() computes covariance between data points\n• DataFrame.corr()/cov() return full matrices showing pairwise relationships\n• corrwith() enables:\n   - Column-wise correlation with a Series\n   - Row-wise correlation with axis='rows'\n   - Cross-correlation between two DataFrames\n• Automatic handling of:\n   - Missing values (aligned by index)\n   - Non-overlapping indices (excluded from calculations)\n• pct_change() is commonly used to calculate returns in financial time series\n• Correlation values range [-1, 1], measuring linear relationships\n• Covariance values depend on data scale (use correlation for standardized comparison)\n"

In [None]:
import pandas as pd

# --- Unique Values and Value Counts ---
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

# Extract unique values (order preserved)
uniques = obj.unique()
print("Unique values:\n", uniques)

# Frequency counts (sorted descending by default)
print("\nValue counts:\n", obj.value_counts())

# Top-level value_counts with sort=False
print("\nUnsorted value counts:\n", pd.value_counts(obj.values, sort=False))

# --- Set Membership Checks ---
# Filter using isin()
mask = obj.isin(['b', 'c'])
print("\nFiltered Series:\n", obj[mask])

# --- Index Alignment with get_indexer ---
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
indices = pd.Index(unique_vals).get_indexer(to_match)
print("\nIndex alignment:\n", indices)

# --- DataFrame Value Counts ---
data = pd.DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4]
})

# Apply value_counts across columns
result = data.apply(pd.value_counts).fillna(0)
print("\nDataFrame value counts:\n", result)

"""
Key Takeaways:
• unique() preserves insertion order of first occurrence
• value_counts() returns sorted frequency counts (descending)
• isin() enables vectorized membership filtering
• get_indexer() maps values to positions in a unique list
• DataFrame.apply(value_counts) creates cross-column frequency tables
• These methods handle:
   - Categorical data analysis
   - Data alignment operations
   - Frequency-based feature engineering
• Missing values are excluded by default in counts
• Use sort=False to preserve original value order
"""

CHAPTER 6: Data Loading, Storage, and File Formats

In [None]:
import pandas as pd

# --- Basic CSV Reading ---
# Read CSV with header
df = pd.read_csv('examples/ex1.csv')
print("Basic CSV read:\n", df)

# Equivalent using read_table with explicit separator
df = pd.read_table('examples/ex1.csv', sep=',')
print("\nUsing read_table:\n", df)

# --- Handling No Headers ---
# Read CSV without header, auto-generate column names
df_no_header = pd.read_csv('examples/ex2.csv', header=None)
print("\nNo header CSV:\n", df_no_header)

# Specify custom column names
names = ['a', 'b', 'c', 'd', 'message']
df_custom_names = pd.read_csv('examples/ex2.csv', names=names)
print("\nCustom column names:\n", df_custom_names)

# Set index column during import
df_indexed = pd.read_csv('examples/ex2.csv', names=names, index_col='message')
print("\nMessage as index:\n", df_indexed)

# --- Hierarchical Indexing ---
# Create multi-level index from multiple columns
df_mindex = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2'])
print("\nHierarchical index:\n", df_mindex)

# --- Custom Delimiters ---
# Read whitespace-separated file using regex
df_whitespace = pd.read_table('examples/ex3.txt', sep='\s+')
print("\nWhitespace-delimited data:\n", df_whitespace)

# --- Skipping Rows ---
# Skip specific rows during import
df_skipped = pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])
print("\nAfter skipping rows:\n", df_skipped)

# --- Handling Missing Data ---
# Default NA value handling
df_missing = pd.read_csv('examples/ex5.csv')
print("\nDefault missing values:\n", df_missing.isnull())

# Custom NA sentinels
df_custom_na = pd.read_csv('examples/ex5.csv', na_values=['NULL', 'foo'])
print("\nCustom NA values:\n", df_custom_na)

# Column-specific NA values
sentinels = {'message': ['NA'], 'something': ['two']}
df_col_na = pd.read_csv('examples/ex5.csv', na_values=sentinels)
print("\nColumn-specific NA handling:\n", df_col_na)

"""
Key Takeaways:
• Primary text loading functions: read_csv() and read_table()
• Core parameters:
   - sep/delimiter for field separation
   - header for column name handling
   - index_col for index specification
   - skiprows to omit specific lines
   - na_values for missing data customization
• Automatic type inference for columns
• Support for hierarchical indexing via multi-column keys
• Flexible missing value handling with per-column sentinels
• Ability to parse whitespace and regex-delimited formats
• Built-in support for:
   - Comment handling
   - Date parsing
   - Thousands separators
• Use verbose=True for parsing diagnostics
• Consider chunksize for large files
"""

In [None]:
import pandas as pd
import numpy as np
import sys

# --- Reading Large Files in Chunks ---
# Set display options for readability
pd.options.display.max_rows = 10

# Read first 5 rows of large file
small_chunk = pd.read_csv('examples/ex6.csv', nrows=5)
print("First 5 rows:\n", small_chunk)

# Process file in 1,000-row chunks
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
key_counts = pd.Series(dtype=float)

for chunk in chunker:
    key_counts = key_counts.add(chunk['key'].value_counts(), fill_value=0)

key_counts = key_counts.sort_values(ascending=False)
print("\nTop 10 key frequencies:\n", key_counts[:10])

# --- Writing Data to Text Files ---
# Sample data from earlier example
data = pd.read_csv('examples/ex5.csv')

# Basic CSV export
data.to_csv('examples/out.csv')  # Writes to file
print("\nDefault CSV output:")
data.to_csv(sys.stdout)  # Print to console

# Custom delimiter and missing value handling
print("\nPipe-delimited with NULLs:")
data.to_csv(sys.stdout, sep='|', na_rep='NULL')

# Exclude index and headers
print("\nNo index or headers:")
data.to_csv(sys.stdout, index=False, header=False)

# Select specific columns
print("\nSelected columns output:")
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

# --- Time Series Export ---
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')
print("\nTime series CSV:")
with open('examples/tseries.csv') as f:
    print(f.read())

In [None]:
import csv
from io import StringIO

# --- Manual CSV Processing Example ---
# Simulated CSV content (replace with real file path for actual use)
csv_content = '''"a","b","c"
"1","2","3"
"1","2","3"'''

# Using StringIO to simulate file handling
with StringIO(csv_content) as f:
    reader = csv.reader(f)
    lines = list(reader)

header, values = lines[0], lines[1:]
data_dict = {h: tuple(v) for h, v in zip(header, zip(*values))}

print("Data Dictionary:\n", data_dict)

# --- Custom CSV Dialect ---
class MyDialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    skipinitialspace = True

# Writing CSV with custom dialect
output = StringIO()
writer = csv.writer(output, dialect=MyDialect)
writer.writerow(('one', 'two', 'three'))
writer.writerows([
    ('1', '2', '3'),
    ('4', '5', '6'),
    ('7', '8', '9')
])

print("\nCSV Output with Custom Dialect:\n", output.getvalue())

# --- Handling Different Delimiters ---
# Read semicolon-separated values
ssv_content = """col1;col2;col3
value1;value2;value3"""

with StringIO(ssv_content) as f:
    reader = csv.reader(f, delimiter=';')
    print("\nSemicolon-delimited data:\n", list(reader))

In [None]:
import pandas as pd

# --- Create Sample DataFrame ---
data = {
    'a': [1, 5, 9],
    'b': [2, 6, 10],
    'c': [3, 7, 11],
    'd': [4, 8, 12],
    'message': ['hello', 'world', 'foo']
}
frame = pd.DataFrame(data)

# --- Pickle Serialization ---
# Save to pickle format
frame.to_pickle('frame_pickle')

# Load from pickle
loaded_frame = pd.read_pickle('frame_pickle')

# Verify data integrity
print("Original DataFrame:")
print(frame)
print("\nLoaded DataFrame:")
print(loaded_frame)
print("DataFrames equal:", frame.equals(loaded_frame))

# --- HDF5 Example (requires pytables) ---
try:
    frame.to_hdf('frame.h5', 'data', format='table')
    hdf_frame = pd.read_hdf('frame.h5', 'data')
    print("\nHDF5 load successful")
except ImportError:
    print("\nHDF5 support requires pytables installation")

# --- Feather Format (requires feather-format) ---
try:
    frame.to_feather('frame.feather')
    feather_frame = pd.read_feather('frame.feather')
    print("Feather load successful")
except ImportError:
    print("Feather support requires feather-format installation")

"""
Key Takeaways:
• Pickle format:
  - Python-specific binary serialization
  - Fast and convenient for temporary storage
  - Not recommended for long-term storage (version compatibility risks)
  
• HDF5:
  - Hierarchical Data Format for large datasets
  - Supports on-disk storage and partial reads
  - Requires pytables package

• Feather:
  - Cross-language (Python/R) columnar format
  - Uses Apache Arrow for efficient storage
  - Requires feather-format package

• Other formats:
  - MessagePack (binary JSON-like)
  - bcolz (compressed columnar storage)
  - Parquet (via pyarrow)

• Best practices:
  - Use pickle for short-term caching
  - Prefer HDF5/Feather/Parquet for production workflows
  - Always verify data after deserialization
  - Consider compression options for large datasets
"""

In [None]:
import pandas as pd

# --- Sample Data Preparation ---
data = {
    'a': [1, 5, 9],
    'b': [2, 6, 10],
    'c': [3, 7, 11],
    'd': [4, 8, 12],
    'message': ['hello', 'world', 'foo']
}
frame = pd.DataFrame(data)

# --- Writing to Excel ---
# Write to .xlsx file (requires openpyxl)
try:
    frame.to_excel('examples/ex2.xlsx', sheet_name='Sheet1', index=False)
    print("Excel file written successfully")
except Exception as e:
    print(f"Error writing Excel: {e}")

# --- Reading from Excel ---
# Method 1: Using ExcelFile for multiple sheets
try:
    xlsx = pd.ExcelFile('examples/ex1.xlsx')
    df1 = xlsx.parse('Sheet1')
    print("\nData from ExcelFile method:\n", df1.head())
except FileNotFoundError:
    print("File not found - ensure xlrd is installed for .xls files")

# Method 2: Direct read_excel (for single sheet)
try:
    df2 = pd.read_excel('examples/ex1.xlsx', sheet_name='Sheet1')
    print("\nData from read_excel:\n", df2.head())
except Exception as e:
    print(f"Error reading Excel: {e}")

# --- Handling Multiple Sheets ---
# Read all sheets
with pd.ExcelFile('examples/ex1.xlsx') as xls:
    sheets = {sh: xls.parse(sh) for sh in xls.sheet_names}
    print("\nSheet names:", xls.sheet_names)

# --- Writing Multiple Sheets ---
with pd.ExcelWriter('examples/multi_sheet.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Data1', index=False)
    frame.describe().to_excel(writer, sheet_name='Summary')

"""
Key Takeaways:
1. Excel I/O Requirements:
   - xlrd: For reading .xls files
   - openpyxl: For writing .xlsx files
   - Install via: pip install xlrd openpyxl

2. Reading Options:
   - ExcelFile.parse() for multiple sheets
   - read_excel() for single sheets
   - Specify sheet names/indices
   - Handle headers and skiprows

3. Writing Options:
   - ExcelWriter for multiple sheets
   - Direct to_excel() for simple writes
   - Control index inclusion/exclusion

4. Best Practices:
   - Use context managers (with) for writers
   - Verify file extensions match engine
   - Handle exceptions for missing files/dependencies
   - Prefer .xlsx format for new files

5. Performance Notes:
   - ExcelFile faster for multiple sheet reads
   - Set index=False to avoid extra index column
   - Use converters for data type control
"""

In [None]:
import pandas as pd
import requests

# --- GitHub API Example ---
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

try:
    # Make GET request to GitHub API
    response = requests.get(url)
    response.raise_for_status()  # Raise HTTP errors
    
    # Parse JSON response
    data = response.json()
    
    # Create DataFrame from selected fields
    issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
    print("First 5 GitHub Issues:\n", issues.head())
    
    # Show labels from first issue
    print("\nLabels from first issue:\n", issues['labels'].iloc[0])

except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")

# --- Handling Pagination Example ---
def get_all_issues():
    """Fetch all issues (handling pagination)"""
    all_issues = []
    page = 1
    while True:
        params = {'page': page, 'per_page': 100}
        resp = requests.get(url, params=params)
        current_data = resp.json()
        if not current_data:
            break
        all_issues.extend(current_data)
        page += 1
    return pd.DataFrame(all_issues)

# Uncomment to test pagination (may hit rate limits)
# full_issues = get_all_issues()
# print(f"Total issues: {len(full_issues)}")

"""
Key Takeaways:
1. API Interaction:
   - Use requests.get() to access web APIs
   - response.json() parses JSON to Python objects
   - Handle HTTP errors with response.raise_for_status()

2. DataFrame Construction:
   - Select specific columns using the columns parameter
   - Nested data (like labels) remains as JSON objects in cells

3. Practical Considerations:
   - GitHub API rate limits (authenticate for higher limits)
   - Pagination handling required for large datasets
   - Data cleaning often needed for nested JSON structures

4. Advanced Patterns:
   - Parameterize requests (e.g., page numbers)
   - Use while loops for pagination
   - Combine data from multiple requests

5. Error Handling:
   - Catch RequestException for network issues
   - Validate response status codes
   - Handle potential JSON parsing errors
"""

In [None]:
import sqlite3
import pandas as pd
import sqlalchemy

# --- Create SQLite Database ---
# Establish connection
con = sqlite3.connect('mydata.sqlite')

# Create table
create_table_query = """
CREATE TABLE IF NOT EXISTS test (
    a VARCHAR(20),
    b VARCHAR(20),
    c REAL,
    d INTEGER
);
"""
con.execute(create_table_query)
con.commit()

# Insert data
data = [
    ('Atlanta', 'Georgia', 1.25, 6),
    ('Tallahassee', 'Florida', 2.6, 3),
    ('Sacramento', 'California', 1.7, 5)
]
insert_query = "INSERT INTO test VALUES (?, ?, ?, ?)"
con.executemany(insert_query, data)
con.commit()

# --- Manual Data Retrieval ---
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
columns = [col[0] for col in cursor.description]

manual_df = pd.DataFrame(rows, columns=columns)
print("Manual DataFrame:\n", manual_df)

# --- Using SQLAlchemy ---
engine = sqlalchemy.create_engine('sqlite:///mydata.sqlite')
sql_df = pd.read_sql("SELECT * FROM test", engine)
print("\nSQLAlchemy DataFrame:\n", sql_df)

# --- Cleanup ---
con.close()

"""
Key Takeaways:
1. Database Interaction Workflow:
   - Create connection → execute DDL → insert data → query data
   - Use context managers (with) for automatic connection handling

2. Data Retrieval Methods:
   - Manual: cursor.fetchall() + DataFrame construction
   - Preferred: pd.read_sql() with SQLAlchemy

3. SQLAlchemy Advantages:
   - Database-agnostic connection strings
   - Simplifies query execution and result parsing
   - Works with multiple DBMS (PostgreSQL, MySQL, etc.)

4. Best Practices:
   - Use parameterized queries (security/performance)
   - Explicitly define column names for robustness
   - Close connections after operations
   - Use ORMs (SQLAlchemy) for complex operations

5. pandas Integration:
   - read_sql() automatically handles:
     - Data type conversion
     - Column naming
     - Connection management
"""

CHAPTER 7: Data Cleaning and Preparation

In [None]:
import pandas as pd
import numpy as np

# --- Missing Data Representation ---
# Create Series with NaN (numeric missing value)
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print("Original Series with NaN:\n", string_data)

# Check for missing values
print("\nisnull() detection:\n", string_data.isnull())

# Replace with None (object missing value)
string_data[0] = None
print("\nAfter replacing with None:\n", string_data)
print("isnull() now shows:\n", string_data.isnull())

# --- Handling Missing Data ---
# Fill missing values
filled = string_data.fillna("missing")
print("\nAfter fillna('missing'):\n", filled)

# Drop missing values
cleaned = string_data.dropna()
print("\nAfter dropna():\n", cleaned)

# --- DataFrame Example ---
df = pd.DataFrame({
    'A': [1, 2, np.nan],
    'B': ['X', np.nan, 'Z'],
    'C': [np.nan, np.nan, np.nan]
})

print("\nOriginal DataFrame:\n", df)
print("DataFrame isnull():\n", df.isnull())

# Fill missing values with column mean
df['A'].fillna(df['A'].mean(), inplace=True)
print("\nAfter filling column A with mean:\n", df)

# Drop columns with all NaN
df_clean = df.dropna(axis=1, how='all')
print("\nAfter dropping all-NaN columns:\n", df_clean)

"""
Key Takeaways:
1. Missing Data Representation:
   - NaN (np.nan) for numeric missing values
   - None treated as NA in object types
   - Both detected by isnull()/notnull()

2. Core Methods:
   - isnull(): Identify missing values
   - notnull(): Inverse of isnull()
   - dropna(): Filter missing data
      - how: 'any' (default) or 'all'
      - thresh: Minimum non-NA values
   - fillna(): Replace missing values
      - Value, method (ffill/bfill), or interpolation

3. Important Notes:
   - Aggregations (mean, sum) automatically exclude NaN
   - Object columns preserve None/np.nan distinction
   - Always verify after fill/drop operations
   - Use inplace=True for direct modification

4. Best Practices:
   - Analyze missing data patterns
   - Choose appropriate filling strategy (mean, median, etc.)
   - Consider domain knowledge for imputation
   - Document handling decisions for reproducibility
"""

In [None]:
import pandas as pd
import numpy as np

# --- Filtering Missing Data ---
# Create sample Series with NaN
data_series = pd.Series([1, np.nan, 3.5, np.nan, 7])
print("Original Series:\n", data_series)

# Drop NaN values from Series
print("\nAfter dropna():\n", data_series.dropna())
print("\nUsing boolean indexing:\n", data_series[data_series.notnull()])

# Create sample DataFrame with NaN
data_df = pd.DataFrame([
    [1.0, 6.5, 3.0],
    [1.0, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.0]
])

print("\nOriginal DataFrame:\n", data_df)

# Drop rows with any NaN
print("\nDrop rows with any NaN:\n", data_df.dropna())

# Drop rows where ALL values are NaN
print("\nDrop rows where all NaN:\n", data_df.dropna(how='all'))

# Add column of NaN and drop columns where all NaN
data_df[3] = np.nan
print("\nDataFrame with new NaN column:\n", data_df)
print("\nDrop columns where all NaN:\n", data_df.dropna(axis=1, how='all'))

# Keep rows with at least 2 non-null values
df_random = pd.DataFrame(np.random.randn(7, 3))
df_random.iloc[:4, 1] = np.nan
df_random.iloc[:2, 2] = np.nan
print("\nRandom DataFrame with NaN:\n", df_random)
print("\nDrop rows with <2 non-null values:\n", df_random.dropna(thresh=2))

# --- Filling Missing Data ---
# Fill NaN with constant
print("\nFill NaN with 0:\n", df_random.fillna(0))

# Fill different values per column
print("\nFill with column-specific values:\n", df_random.fillna({1: 0.5, 2: -1}))

# Forward fill with limit
df_ffill = pd.DataFrame({
    0: [1, np.nan, np.nan, 4, np.nan],
    1: [np.nan, 2, np.nan, 5, 6]
})
print("\nOriginal for ffill:\n", df_ffill)
print("\nForward fill (limit=1):\n", df_ffill.fillna(method='ffill', limit=1))

# Fill with mean value
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
print("\nSeries filled with mean:\n", data.fillna(data.mean()))

"""
Key Takeaways:
1. Filtering Options:
   - dropna() removes missing values
   - how='any' (default) vs how='all'
   - thresh parameter sets minimum non-null counts
   - axis parameter controls row/column filtering

2. Filling Strategies:
   - Scalar values or column-specific mappings
   - method='ffill'/'bfill' for propagation
   - limit parameter controls fill continuity
   - Statistical fills (mean/median) for numeric data

3. Performance Considerations:
   - inplace=True modifies original data
   - Interpolation methods (linear, time) for time series
   - Prefer fillna() over manual loops for efficiency

4. Best Practices:
   - Analyze missing data patterns first
   - Document imputation strategies
   - Validate results after operations
   - Consider domain knowledge for appropriate fills
"""

In [None]:
import pandas as pd
import numpy as np

# --- Removing Duplicates ---
data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4]
})

print("Original DataFrame with duplicates:")
print(data)

# Check duplicates
print("\nDuplicate indicators:\n", data.duplicated())

# Drop duplicates
print("\nAfter dropping duplicates:\n", data.drop_duplicates())

# Drop duplicates based on subset
data['v1'] = range(7)
print("\nDataFrame with new column:\n", data)
print("\nDrop duplicates by 'k1':\n", data.drop_duplicates(['k1']))

# Keep last occurrence
print("\nKeep last duplicates:\n", data.drop_duplicates(['k1', 'k2'], keep='last'))

# --- Mapping Transformations ---
meat_data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 
             'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
})

meat_to_animal = {
    'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
    'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'
}

# Normalize and map
meat_data['animal'] = meat_data['food'].str.lower().map(meat_to_animal)
print("\nMeat data with animal mapping:\n", meat_data)

# Alternative using lambda
meat_data['animal_lambda'] = meat_data['food'].map(lambda x: meat_to_animal[x.lower()])
print("\nUsing lambda mapping:\n", meat_data[['food', 'animal_lambda']])

# --- Replacing Values ---
data_series = pd.Series([1., -999., 2., -999., -1000., 3.])
print("\nOriginal Series:\n", data_series)

# Replace single value
print("\nReplace -999 with NaN:\n", data_series.replace(-999, np.nan))

# Replace multiple values
print("\nReplace multiple values:\n", data_series.replace([-999, -1000], np.nan))
print("\nDifferent replacements:\n", data_series.replace({-999: np.nan, -1000: 0}))

# --- Renaming Axis Indexes ---
df = pd.DataFrame(np.arange(12).reshape((3,4)),
                  index=['Ohio', 'Colorado', 'New York'],
                  columns=['one', 'two', 'three', 'four'])

# Transform index
print("\nOriginal DataFrame:\n", df)
df.index = df.index.map(lambda x: x[:4].upper())
print("\nModified index:\n", df)

# Rename columns/indices
renamed_df = df.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})
print("\nRenamed DataFrame:\n", renamed_df)

# In-place rename
df.rename(index={'COLO': 'TEXAS'}, inplace=True)
print("\nIn-place rename:\n", df)

"""
Key Takeaways:
1. Duplicate Handling:
   - duplicated() identifies row duplicates
   - drop_duplicates() removes duplicates
   - subset parameter targets specific columns
   - keep='last' retains last occurrence

2. Data Mapping:
   - map() applies element-wise transformations
   - Use str.lower() for case normalization
   - Dictionary mappings handle value replacements
   - Lambda functions provide inline transformation logic

3. Value Replacement:
   - replace() handles multiple value substitutions
   - List input for multiple -> single replacement
   - Dictionary input for targeted replacements
   - Differs from str.replace() (string pattern substitution)

4. Renaming Operations:
   - map() modifies index/column labels
   - rename() creates transformed versions
   - Works with functions or dictionaries
   - inplace=True modifies original object

5. Best Practices:
   - Normalize data before mapping
   - Verify replacements with dry runs
   - Use descriptive names when renaming
   - Prefer vectorized operations over loops
"""

In [None]:
# Python and Pandas String Manipulation Guide
import numpy as np
import pandas as pd

# =============================================
# 1. Python String Methods
# =============================================
sample_str = '  a,b, guido  '
print("Original string:", repr(sample_str))

# Basic splitting and stripping
split_result = sample_str.split(',')
stripped_pieces = [x.strip() for x in split_result]
joined_str = '::'.join(stripped_pieces)

print("\n1. Splitting and joining:")
print("Split result:", split_result)
print("Stripped pieces:", stripped_pieces)
print("Joined string:", joined_str)

# Search and replace methods
print("\n2. Search and replace:")
print("Contains 'guido':", 'guido' in sample_str)
print("Find comma position:", sample_str.find(','))
print("Count commas:", sample_str.count(','))
print("Replace commas:", sample_str.replace(',', '|'))

# Case conversion and validation
print("\n3. Case methods:")
print("Uppercase:", sample_str.upper())
print("Starts with 'a':", sample_str.strip().startswith('a'))
print("Ends with 'do':", sample_str.strip().endswith('do'))

# =============================================
# 2. Pandas String Operations
# =============================================
data = pd.Series(['  alice ', 'bob  ', 'carol', None, '  dave'])
print("\nPandas Series before cleaning:\n", data)

# Pandas string methods with na handling
print("\n4. Pandas string operations:")
print("Stripped whitespace:\n", data.str.strip())
print("Uppercase:\n", data.str.upper())
print("Contains 'o':\n", data.str.contains('o', na=False))
print("First character:\n", data.str[0])
print("Lengths:\n", data.str.len())

# Split into DataFrame
split_df = data.str.split('a', expand=True)
print("\nSplit on 'a':\n", split_df)

# =============================================
# 3. Handling Missing Data
# =============================================
print("\n5. Handling missing values:")
print("Default NaN handling:\n", data.str.upper())
print("With fillna:\n", data.str.upper().fillna('MISSING'))

# Safe operations with na_action
print("\nSafe contains check:\n", 
      data.str.contains('o', na_action='ignore'))

# =============================================
# 4. Regular Expressions
# =============================================
emails = pd.Series([
    'john.doe@example.com',
    'invalid_email',
    'sarah.smith@company.org',
    np.nan
])

print("\n6. Regular expression examples:")
# Extract domains where valid
domains = emails.str.extract(r'@([\w.]+)', expand=False)
print("Extracted domains:\n", domains)

# Validate email format
valid_emails = emails.str.match(r'^[\w.]+@[\w]+\.[\w]{2,3}$')
print("Valid emails:\n", valid_emails)

# =============================================
# 5. Advanced DataFrame Operations
# =============================================
df = pd.DataFrame({
    'name': [' Alice ', 'Bob ', 'Charlie', None],
    'address': ['123 Main St', '456 Oak Ave', '789 Pine Rd', np.nan]
})

print("\n7. DataFrame string operations:")
# Clean entire DataFrame
df_clean = df.apply(lambda col: col.str.strip() if col.dtype == 'object' else col)
df_clean['name'] = df_clean['name'].str.upper()
print("Cleaned DataFrame:\n", df_clean)

# =============================================
# Key Takeaways
# =============================================
"""
STRING MANIPULATION ESSENTIALS:

1. Core Python Methods:
   - split/join/strip for basic cleaning
   - find/index for substring search
   - replace for pattern substitution
   - Case methods: lower/upper/casefold

2. Pandas Enhancements:
   - .str accessor for vectorized operations
   - na_action parameter for missing values
   - Regular expression integration
   - DataFrame-wide operations

3. Regular Expressions:
   - match() for pattern validation
   - extract() for pattern capture
   - replace() with regex substitution

4. Best Practices:
   - Always consider whitespace cleaning
   - Handle missing values explicitly
   - Use vectorized pandas methods instead of loops
   - Precompile regex patterns for repeated use

5. Common Pitfalls:
   - Forgetting to strip whitespace
   - Case sensitivity in searches
   - NaN handling in pandas operations
   - Overlooking regex special characters
"""

In [None]:
import pandas as pd
import numpy as np

# --- Discretization and Binning ---
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# Using pd.cut to bin continuous data
cats = pd.cut(ages, bins)
print("\nCategorical Bins:\n", cats)
print("\nBin Codes:\n", cats.codes)
print("\nBin Categories:\n", cats.categories)
print("\nBin Counts:\n", pd.value_counts(cats))

# Customizing bin labels
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
labeled_cats = pd.cut(ages, bins, labels=group_names)
print("\nLabeled Bins:\n", labeled_cats)

# Using pd.qcut for quantile-based binning
data = np.random.randn(1000)
quartiles = pd.qcut(data, 4)
print("\nQuartile Bins:\n", quartiles)
print("\nQuartile Counts:\n", pd.value_counts(quartiles))

# --- Detecting and Filtering Outliers ---
data_df = pd.DataFrame(np.random.randn(1000, 4))
outliers = data_df[(np.abs(data_df) > 3).any(axis=1)]
print("\nOutliers:\n", outliers)

# Capping values outside the range [-3, 3]
data_df[np.abs(data_df) > 3] = np.sign(data_df) * 3
print("\nCapped Data Summary:\n", data_df.describe())

# --- Permutation and Random Sampling ---
df = pd.DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(5)
permuted_df = df.take(sampler)
print("\nPermuted DataFrame:\n", permuted_df)

# Random sampling without replacement
sampled_df = df.sample(n=3)
print("\nSampled DataFrame (No Replacement):\n", sampled_df)

# Random sampling with replacement
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
print("\nRandom Draws (With Replacement):\n", draws)

# --- Computing Indicator/Dummy Variables ---
df_key = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
dummy_vars = pd.get_dummies(df_key['key'], prefix='key')
df_with_dummy = df_key[['data1']].join(dummy_vars)
print("\nDataFrame with Dummy Variables:\n", df_with_dummy)

# Multi-category indicator variables
movies_data = pd.DataFrame({
    'movie_id': [1, 2, 3],
    'title': ['Movie A', 'Movie B', 'Movie C'],
    'genres': ['Action|Drama', 'Comedy|Romance', 'Action|Thriller']
})
all_genres = set('|'.join(movies_data['genres']).split('|'))
zero_matrix = np.zeros((len(movies_data), len(all_genres)))
dummies = pd.DataFrame(zero_matrix, columns=sorted(all_genres))

for i, genre_list in enumerate(movies_data['genres']):
    indices = dummies.columns.get_indexer(genre_list.split('|'))
    dummies.iloc[i, indices] = 1

movies_with_indicators = movies_data.join(dummies.add_prefix('Genre_'))
print("\nMovies with Genre Indicators:\n", movies_with_indicators)

"""
Key Takeaways:
1. Discretization:
   - pd.cut divides continuous data into bins.
   - pd.qcut creates quantile-based bins for equal-sized groups.
   - Custom labels can be applied using the `labels` parameter.

2. Outlier Handling:
   - Use boolean indexing to detect outliers.
   - Cap values outside a range using np.sign and array operations.

3. Random Sampling:
   - np.random.permutation generates random orderings.
   - df.sample supports sampling with or without replacement.

4. Dummy Variables:
   - pd.get_dummies converts categorical variables into binary indicators.
   - For multi-category data, construct a zero matrix and populate it based on category membership.

5. Best Practices:
   - Use vectorized operations for efficiency.
   - Combine discretization with other transformations (e.g., dummy variables) for feature engineering.
"""

CHAPTER 8: Data Wrangling: Join, Combine, and Reshape

In [None]:
import pandas as pd
import numpy as np

# --- Hierarchical Indexing ---
# Creating a Series with a MultiIndex
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
print("\nSeries with Hierarchical Index:\n", data)

# Accessing subsets using partial indexing
print("\nSubset for 'a':\n", data['a'])
print("\nSubset for 'b':\n", data['b'])

# Unstacking and stacking
unstacked = data.unstack()
print("\nUnstacked DataFrame:\n", unstacked)
restacked = unstacked.stack()
print("\nRestacked Series:\n", restacked)

# --- Hierarchical Index in DataFrame ---
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
print("\nDataFrame with Hierarchical Index:\n", frame)

# Sorting by hierarchical index
frame_sorted = frame.sort_index(level=0)
print("\nLexicographically Sorted DataFrame:\n", frame_sorted)

# Summary statistics by level
level_summary = frame.sum(level=0, axis=0)
print("\nSummary Statistics by Level (Rows):\n", level_summary)

# --- Merging DataFrames ---
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})

# Many-to-one merge
merged_df = pd.merge(df1, df2, on='key')
print("\nMerged DataFrame (Many-to-One):\n", merged_df)

# Merge with different column names
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})
merged_diff_keys = pd.merge(df3, df4, left_on='lkey', right_on='rkey')
print("\nMerged DataFrame (Different Keys):\n", merged_diff_keys)

# --- Setting and Resetting Index ---
frame2 = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                       'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                       'd': [0, 1, 2, 0, 1, 2, 3]})
frame_with_index = frame2.set_index(['c', 'd'])
print("\nDataFrame with Hierarchical Index (set_index):\n", frame_with_index)

# Resetting the index
reset_frame = frame_with_index.reset_index()
print("\nDataFrame After Resetting Index:\n", reset_frame)

"""
Key Takeaways:
1. Hierarchical Indexing:
   - Enables multi-level indexing for both rows and columns.
   - Supports partial indexing and slicing.
   - `unstack()` and `stack()` convert between hierarchical indices and flat structures.

2. Sorting and Aggregation:
   - Sorting by hierarchical levels improves performance.
   - Aggregation functions like `sum()` can operate on specific levels of the index.

3. Merging DataFrames:
   - `pd.merge()` combines DataFrames based on common keys.
   - Handles many-to-one and one-to-many relationships.
   - Allows merging on differently named columns using `left_on` and `right_on`.

4. Setting and Resetting Index:
   - `set_index()` creates hierarchical indices from existing columns.
   - `reset_index()` flattens hierarchical indices back into columns.

5. Best Practices:
   - Use hierarchical indexing for multidimensional data representation.
   - Sort indices before performing operations for better performance.
   - Specify merge keys explicitly to avoid ambiguity.
"""

In [None]:
import pandas as pd
import numpy as np

# --- Merging on Index ---
# Example DataFrames for merging on index
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

print("\nLeft DataFrame (left1):\n", left1)
print("\nRight DataFrame (right1):\n", right1)

# Merge using the index of the right DataFrame as the merge key
merged_index = pd.merge(left1, right1, left_on='key', right_index=True)
print("\nMerge on Index (Inner Join):\n", merged_index)

# Outer join to include all keys
merged_outer = pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
print("\nMerge on Index (Outer Join):\n", merged_outer)

# --- Hierarchical Index Merge ---
lefth = pd.DataFrame({
    'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5.)
})
righth = pd.DataFrame(
    np.arange(12).reshape((6, 2)),
    index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'], [2001, 2000, 2000, 2000, 2001, 2002]],
    columns=['event1', 'event2']
)

print("\nHierarchical Left DataFrame (lefth):\n", lefth)
print("\nHierarchical Right DataFrame (righth):\n", righth)

# Merge on multiple keys with hierarchical index
merged_hierarchical = pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
print("\nMerge on Multiple Keys (Inner Join):\n", merged_hierarchical)

# Outer join for hierarchical index
merged_hierarchical_outer = pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer')
print("\nMerge on Multiple Keys (Outer Join):\n", merged_hierarchical_outer)

# --- Joining DataFrames by Index ---
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])

print("\nLeft DataFrame (left2):\n", left2)
print("\nRight DataFrame (right2):\n", right2)

# Join using index
joined_index = left2.join(right2, how='outer')
print("\nJoin on Index (Outer Join):\n", joined_index)

# Join on a column from one DataFrame and the index of another
joined_on_key = left1.join(right1, on='key')
print("\nJoin Using Column and Index:\n", joined_on_key)

# --- Joining Multiple DataFrames ---
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
print("\nAnother DataFrame (another):\n", another)

# Join multiple DataFrames
joined_multiple = left2.join([right2, another])
print("\nJoin Multiple DataFrames (Inner Join):\n", joined_multiple)

# Outer join for multiple DataFrames
joined_multiple_outer = left2.join([right2, another], how='outer')
print("\nJoin Multiple DataFrames (Outer Join):\n", joined_multiple_outer)

"""
Key Takeaways:
1. Merging on Index:
   - Use `left_index=True` or `right_index=True` to merge on index.
   - Combine hierarchical indices with multiple keys for complex merges.

2. Join Method:
   - `DataFrame.join()` is a convenient way to merge by index.
   - Supports joining multiple DataFrames with overlapping or non-overlapping indices.

3. Handling Missing Data:
   - Outer joins include all keys, filling missing values with NaN.
   - Useful for combining datasets with partial overlaps.

4. Combining Multiple DataFrames:
   - Pass a list of DataFrames to `join()` for merging multiple objects.
   - Specify `how='outer'` to include all rows from all DataFrames.

5. Best Practices:
   - Use `merge()` for general-purpose joins with flexibility.
   - Use `join()` for simpler index-based merges.
   - Handle duplicate indices carefully when performing outer joins.
"""

In [None]:
import pandas as pd
import numpy as np

# --- Concatenating Along an Axis ---

# Example NumPy array concatenation
arr = np.arange(12).reshape((3, 4))
print("\nOriginal NumPy Array:\n", arr)
concatenated_arr = np.concatenate([arr, arr], axis=1)
print("\nConcatenated NumPy Array (axis=1):\n", concatenated_arr)

# --- Concatenating pandas Series ---
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

# Concatenate Series along rows (default axis=0)
concat_series_rows = pd.concat([s1, s2, s3])
print("\nConcatenated Series (axis=0):\n", concat_series_rows)

# Concatenate Series along columns (axis=1)
concat_series_cols = pd.concat([s1, s2, s3], axis=1)
print("\nConcatenated Series (axis=1):\n", concat_series_cols)

# Concatenate with inner join
concat_inner_join = pd.concat([s1, s3], axis=1, join='inner')
print("\nConcatenated Series with Inner Join:\n", concat_inner_join)

# Concatenate with specific join_axes
concat_join_axes = pd.concat([s1, s3], axis=1, join_axes=[['a', 'c', 'b', 'e']])
print("\nConcatenated Series with join_axes:\n", concat_join_axes)

# Concatenate with hierarchical index using keys
concat_with_keys = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
print("\nConcatenated Series with Hierarchical Index:\n", concat_with_keys)

# Unstack hierarchical index
unstacked_result = concat_with_keys.unstack()
print("\nUnstacked Result:\n", unstacked_result)

# Concatenate Series with keys for column headers
concat_series_keys_cols = pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
print("\nConcatenated Series with Keys as Column Headers:\n", concat_series_keys_cols)

# --- Concatenating pandas DataFrames ---
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])

# Concatenate DataFrames along columns (axis=1) with keys
concat_df_cols_keys = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
print("\nConcatenated DataFrames with Hierarchical Columns:\n", concat_df_cols_keys)

# Concatenate DataFrames using a dictionary for keys
concat_dict_keys = pd.concat({'level1': df1, 'level2': df2}, axis=1)
print("\nConcatenated DataFrames Using Dictionary Keys:\n", concat_dict_keys)

# Concatenate DataFrames with named hierarchical levels
concat_named_levels = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['upper', 'lower'])
print("\nConcatenated DataFrames with Named Hierarchical Levels:\n", concat_named_levels)

# Concatenate DataFrames ignoring indexes
df3 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df4 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
concat_ignore_index = pd.concat([df3, df4], ignore_index=True)
print("\nConcatenated DataFrames with Ignored Index:\n", concat_ignore_index)

"""
Key Takeaways:
1. Concatenation Basics:
   - Use `pd.concat()` to combine pandas objects along an axis.
   - Default behavior concatenates along rows (`axis=0`).

2. Handling Non-Overlapping Indices:
   - By default, concatenation uses the union of indices (`outer join`).
   - Use `join='inner'` to retain only overlapping indices.

3. Hierarchical Indexing:
   - Use the `keys` argument to create a hierarchical index on the concatenation axis.
   - Specify `names` to name hierarchical levels.

4. DataFrame Concatenation:
   - When concatenating DataFrames along columns, `keys` become column headers.
   - Use `ignore_index=True` to reset the index when row indices are irrelevant.

5. Best Practices:
   - Use `join_axes` for precise control over non-concatenation axes.
   - Use hierarchical indexing for structured concatenation results.
   - Reset indices when combining DataFrames with meaningless row indices.
"""

In [None]:
import pandas as pd
import numpy as np

# --- Combining Data with Overlap ---
# Example Series with overlapping indices
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

print("\nSeries a:\n", a)
print("\nSeries b:\n", b)

# Combine using NumPy's where function
combined_np = np.where(pd.isnull(a), b, a)
print("\nCombined Using NumPy's where:\n", combined_np)

# Combine using combine_first
combined_series = b[:-2].combine_first(a[2:])
print("\nCombined Using combine_first (Series):\n", combined_series)

# Example DataFrames with overlapping data
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})

print("\nDataFrame df1:\n", df1)
print("\nDataFrame df2:\n", df2)

combined_df = df1.combine_first(df2)
print("\nCombined Using combine_first (DataFrame):\n", combined_df)

# --- Reshaping with Hierarchical Indexing ---
# Reshaping DataFrames using stack and unstack
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'], name='number'))

print("\nOriginal DataFrame:\n", data)

# Stack converts columns into rows
stacked = data.stack()
print("\nStacked Series:\n", stacked)

# Unstack converts rows back into columns
unstacked = stacked.unstack()
print("\nUnstacked DataFrame:\n", unstacked)

# Unstacking a specific level
unstacked_level_0 = stacked.unstack(0)
print("\nUnstacked Level 0:\n", unstacked_level_0)

unstacked_state = stacked.unstack('state')
print("\nUnstacked by State:\n", unstacked_state)

# Handling missing data during stacking and unstacking
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])

print("\nConcatenated Series:\n", data2)

unstacked_data2 = data2.unstack()
print("\nUnstacked Concatenated Series:\n", unstacked_data2)

stacked_data2 = unstacked_data2.stack()
print("\nStacked Data (Default Drop Missing):\n", stacked_data2)

stacked_data2_no_drop = unstacked_data2.stack(dropna=False)
print("\nStacked Data (Keep Missing):\n", stacked_data2_no_drop)

# --- Reshaping MultiIndexed DataFrames ---
df = pd.DataFrame({'left': stacked, 'right': stacked + 5},
                  columns=pd.Index(['left', 'right'], name='side'))
print("\nMultiIndexed DataFrame:\n", df)

# Unstacking a specific level
unstacked_df = df.unstack('state')
print("\nUnstacked by State:\n", unstacked_df)

# Stacking a specific axis
stacked_df = unstacked_df.stack('side')
print("\nStacked by Side:\n", stacked_df)

"""
Key Takeaways:
1. Combining Data with Overlap:
   - Use `combine_first` to patch missing data in one object with values from another.
   - Aligns data based on indices for both Series and DataFrames.

2. Reshaping with Hierarchical Indexing:
   - `stack()` pivots columns into rows, creating a hierarchical index.
   - `unstack()` pivots rows into columns, optionally specifying the level to unstack.
   - Handles missing data gracefully during stacking and unstacking.

3. MultiIndexed DataFrames:
   - Can reshape along specific axes or levels.
   - Provides a flexible way to organize and transform multidimensional data.

4. Best Practices:
   - Use `combine_first` for filling missing data in overlapping datasets.
   - Leverage `stack` and `unstack` for reshaping hierarchical data.
   - Handle missing data explicitly when reshaping to avoid unintended results.
"""

In [None]:
import pandas as pd
import numpy as np

# --- Pivoting "Long" to "Wide" Format ---
# Load example data
data = pd.read_csv('examples/macrodata.csv')

# Create a PeriodIndex for date and select specific columns
periods = pd.PeriodIndex(year=data['year'], quarter=data['quarter'], name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D', 'end')

# Stack the data into long format and reset index
ldata = data.stack().reset_index().rename(columns={0: 'value'})

print("\nLong Format Data (ldata):\n", ldata[:10])

# Pivot the long format data into wide format
pivoted = ldata.pivot('date', 'item', 'value')
print("\nPivoted Wide Format Data:\n", pivoted[:5])

# Add an additional value column and pivot again
ldata['value2'] = np.random.randn(len(ldata))
pivoted_with_value2 = ldata.pivot('date', 'item')
print("\nPivoted with Hierarchical Columns:\n", pivoted_with_value2[:5])

# Alternative method using set_index and unstack
unstacked = ldata.set_index(['date', 'item']).unstack('item')
print("\nUnstacked Data (Alternative Method):\n", unstacked[:5])

# --- Pivoting "Wide" to "Long" Format ---
# Create a sample DataFrame
df = pd.DataFrame({
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
})

print("\nOriginal Wide Format DataFrame:\n", df)

# Melt the DataFrame to long format
melted = pd.melt(df, id_vars=['key'])
print("\nMelted Long Format DataFrame:\n", melted)

# Pivot back to wide format
reshaped = melted.pivot('key', 'variable', 'value')
print("\nReshaped Back to Wide Format:\n", reshaped)

# Reset index to move the row labels back into a column
reshaped_reset = reshaped.reset_index()
print("\nReshaped with Index Reset:\n", reshaped_reset)

# Melt with a subset of value columns
melted_subset = pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])
print("\nMelted with Subset of Value Columns:\n", melted_subset)

# Melt without group identifiers
melted_no_id = pd.melt(df, value_vars=['A', 'B', 'C'])
print("\nMelted Without Group Identifiers:\n", melted_no_id)

"""
Key Takeaways:
1. Pivoting "Long" to "Wide":
   - Use `pivot()` to reshape long-format data into wide format.
   - Specify row and column indices, and optionally a value column to fill the DataFrame.
   - For multiple value columns, hierarchical columns are created.

2. Pivoting "Wide" to "Long":
   - Use `pd.melt()` to transform wide-format data into long format.
   - Specify group indicators (`id_vars`) and value columns (`value_vars`).
   - Melted data can be reshaped back to wide format using `pivot()`.

3. Alternative Methods:
   - `set_index()` followed by `unstack()` achieves the same result as `pivot()`.

4. Best Practices:
   - Use `pivot()` for clean transformations between long and wide formats.
   - Use `pd.melt()` for combining multiple columns into a single column for analysis.
   - Reset the index after pivoting if the original row labels need to be preserved as a column.
"""

CHAPTER 9: Plotting and Visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# --- Simple Line Plot ---
# Create data for plotting
data = np.arange(10)

# Plot the data
plt.plot(data)
plt.title("Simple Line Plot")
plt.show()

# --- Figures and Subplots ---
# Create a new figure
fig = plt.figure(figsize=(8, 6))  # Set figure size

# Add subplots to the figure (2x2 grid)
ax1 = fig.add_subplot(2, 2, 1)  # First subplot
ax2 = fig.add_subplot(2, 2, 2)  # Second subplot
ax3 = fig.add_subplot(2, 2, 3)  # Third subplot

# Plot on the first subplot (histogram)
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)
ax1.set_title("Histogram")

# Plot on the second subplot (scatter plot)
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))
ax2.set_title("Scatter Plot")

# Plot on the third subplot (line plot with random cumulative sum)
ax3.plot(np.random.randn(50).cumsum(), 'k--')
ax3.set_title("Line Plot with Cumulative Sum")

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

# --- Using plt.subplots for Grid of Subplots ---
# Create a figure and a grid of subplots (2x3)
fig, axes = plt.subplots(2, 3, figsize=(10, 6), sharex=True, sharey=True)

# Flatten the 2D array of axes for easier iteration
axes = axes.flatten()

# Example: Plot on each subplot
for i, ax in enumerate(axes):
    ax.plot(np.random.randn(50).cumsum())
    ax.set_title(f"Plot {i+1}")

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

"""
Key Takeaways:
1. Creating Figures and Subplots:
   - Use `plt.figure()` to create a new figure.
   - Use `add_subplot()` to add individual subplots to the figure.

2. Plotting Types:
   - Line plots (`plot`): For visualizing trends or relationships.
   - Histograms (`hist`): For visualizing distributions.
   - Scatter plots (`scatter`): For visualizing correlations between variables.

3. Convenience with `plt.subplots`:
   - Use `plt.subplots(nrows, ncols)` to create a grid of subplots.
   - The returned `axes` array allows easy indexing and customization.

4. Sharing Axes:
   - Use `sharex=True` and `sharey=True` to synchronize x-axis or y-axis across subplots.

5. Best Practices:
   - Use `plt.tight_layout()` to automatically adjust subplot spacing.
   - Customize plots with titles, labels, and styles for better readability.
   - Combine multiple plots in a single cell in Jupyter notebooks for consistent visualization.

6. Additional Resources:
   - Refer to the matplotlib gallery and documentation for advanced features and plot types.
"""

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# --- Adjusting Spacing Around Subplots ---
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(8, 6))

# Plot histograms on each subplot
for i in range(2):
    for j in range(2):
        axes[i, j].hist(np.random.randn(500), bins=50, color='k', alpha=0.5)

# Adjust spacing between subplots
plt.subplots_adjust(wspace=0, hspace=0)
plt.suptitle("Subplots with No Spacing")
plt.show()

# --- Colors, Markers, and Line Styles ---
# Generate random data
data = np.random.randn(30).cumsum()

# Plot with green dashed line and markers
plt.figure(figsize=(8, 4))
plt.plot(data, 'g--', label='Green Dashed Line')
plt.plot(data, color='k', linestyle='dashed', marker='o', label='Explicit Style')
plt.plot(data, 'k-', drawstyle='steps-post', label='Steps-Post')
plt.legend(loc='best')
plt.title("Line Plots with Different Styles")
plt.show()

# --- Ticks, Labels, and Legends ---
# Create a simple plot
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(8, 4))
plt.plot(x, y, label='Sine Wave')

# Adjust ticks and labels using pyplot interface
plt.xlim(0, 10)  # Set x-axis limits
plt.xticks([0, 2.5, 5, 7.5, 10])  # Set custom tick locations
plt.xlabel("X-Axis")  # Add x-axis label
plt.ylabel("Y-Axis")  # Add y-axis label

# Add legend
plt.legend(loc='upper right')

# Alternatively, use object-oriented API
ax = plt.gca()  # Get current AxesSubplot
ax.set_xlim(0, 10)  # Set x-axis limits
ax.set_xticks([0, 2.5, 5, 7.5, 10])  # Set custom tick locations
ax.set_xlabel("X-Axis (Object-Oriented)")  # Add x-axis label
ax.set_ylabel("Y-Axis (Object-Oriented)")  # Add y-axis label

plt.title("Ticks, Labels, and Legends")
plt.show()

"""
Key Takeaways:
1. Adjusting Spacing Around Subplots:
   - Use `plt.subplots_adjust()` to control spacing between subplots.
   - Parameters like `wspace` and `hspace` adjust horizontal and vertical spacing.

2. Colors, Markers, and Line Styles:
   - Specify colors, markers, and line styles using shorthand strings (e.g., 'g--') or explicitly (e.g., `color='g', linestyle='--'`).
   - Use `drawstyle` to change how points are connected (e.g., `steps-post`).

3. Ticks, Labels, and Legends:
   - Use `plt.xlim()`, `plt.xticks()`, and similar methods to control axis limits and ticks.
   - Add labels and legends using `plt.xlabel()`, `plt.ylabel()`, and `plt.legend()`.
   - The object-oriented API provides more explicit control over subplot properties.

4. Best Practices:
   - Use `subplots_adjust` to prevent overlapping elements when working with multiple subplots.
   - Prefer explicit styling for programmatic plots to ensure clarity and maintainability.
   - Use legends to identify different plot components, especially when comparing multiple datasets.
"""

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# --- Random Walk Plot ---
# Create a figure and subplot
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1)

# Generate random walk data
random_walk = np.random.randn(1000).cumsum()
ax.plot(random_walk)

# Customize x-axis ticks and labels
ticks = ax.set_xticks([0, 250, 500, 750, 1000])
labels = ax.set_xticklabels(['One', 'Two', 'Three', 'Four', 'Five'],
                            rotation=30, fontsize='small')

# Set title and x-axis label
ax.set_title('My First Matplotlib Plot')
ax.set_xlabel('Stages')

# Alternatively, batch set properties using a dictionary
props = {
    'title': 'My First Matplotlib Plot',
    'xlabel': 'Stages'
}
ax.set(**props)

# Display the plot
plt.tight_layout()
plt.show()

# --- Adding Legends ---
# Create another figure and subplot
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1)

# Plot multiple lines with labels
ax.plot(np.random.randn(1000).cumsum(), 'k', label='Line One')
ax.plot(np.random.randn(1000).cumsum(), 'k--', label='Line Two')
ax.plot(np.random.randn(1000).cumsum(), 'k.', label='Line Three')

# Add a legend
ax.legend(loc='best')

# Set title and axis labels
ax.set_title('Plot with Legends')
ax.set_xlabel('X-Axis')
ax.set_ylabel('Y-Axis')

# Display the plot
plt.tight_layout()
plt.show()

"""
Key Takeaways:
1. Setting Titles, Axis Labels, Ticks, and Tick Labels:
   - Use `set_title()` to add a title to the plot.
   - Use `set_xlabel()` and `set_ylabel()` to label the axes.
   - Use `set_xticks()` and `set_xticklabels()` to customize tick locations and labels.
   - Use `rotation` and `fontsize` in `set_xticklabels()` to adjust tick label appearance.

2. Batch Setting Properties:
   - Use a dictionary with `set()` to apply multiple properties at once.

3. Adding Legends:
   - Pass the `label` argument when plotting to assign labels to plot elements.
   - Use `ax.legend()` or `plt.legend()` to display the legend.
   - The `loc` parameter controls the legend's position; `'best'` automatically chooses a non-overlapping location.

4. Best Practices:
   - Use `plt.tight_layout()` to prevent overlapping elements in the layout.
   - Customize tick labels for better readability, especially with long labels.
   - Include legends to identify different plot components, especially when comparing multiple datasets.
"""

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

# --- Annotations and Drawing on a Subplot ---
# Load S&P 500 data
data = pd.read_csv('examples/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

# Create a figure and subplot
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)

# Plot the S&P 500 closing prices
spx.plot(ax=ax, style='k-', label='S&P 500')

# Define important dates for annotation
crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy')
]

# Add annotations with arrows
for date, label in crisis_data:
    ax.annotate(
        label,
        xy=(date, spx.asof(date) + 75),  # Arrow point
        xytext=(date, spx.asof(date) + 225),  # Text location
        arrowprops=dict(facecolor='black', headwidth=4, width=2, headlength=4),
        horizontalalignment='left',
        verticalalignment='top'
    )

# Set plot limits and title
ax.set_xlim(['1/1/2007', '1/1/2011'])
ax.set_ylim([600, 1800])
ax.set_title('Important Dates in the 2008-2009 Financial Crisis')
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

# --- Drawing Shapes on a Subplot ---
# Create a new figure and subplot
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(1, 1, 1)

# Define shapes: Rectangle, Circle, and Polygon
rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]], color='g', alpha=0.5)

# Add shapes to the subplot
ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)

# Set axis limits
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_title('Data Visualization with Shapes')

# Display the plot
plt.tight_layout()
plt.show()

# --- Saving Plots to File ---
# Save the first plot to a file
plt.figure(figsize=(10, 6))
spx.plot(style='k-', label='S&P 500')
plt.title('S&P 500 Closing Prices')
plt.legend()

# Save as SVG
plt.savefig('spx_plot.svg')

# Save as PNG with high DPI and tight bounding box
plt.savefig('spx_plot.png', dpi=400, bbox_inches='tight')

# Save to a BytesIO buffer
from io import BytesIO
buffer = BytesIO()
plt.savefig(buffer, format='png')
plot_data = buffer.getvalue()

"""
Key Takeaways:
1. Annotations and Drawing:
   - Use `ax.annotate()` to add text and arrows to highlight specific points on a plot.
   - Customize annotations with properties like `xy`, `xytext`, and `arrowprops`.
   - Use `ax.add_patch()` to draw shapes like rectangles, circles, and polygons.

2. Saving Plots:
   - Use `plt.savefig()` to save plots to files in various formats (e.g., PNG, SVG, PDF).
   - Specify options like `dpi` for resolution and `bbox_inches='tight'` to trim whitespace.
   - Save plots to memory using a `BytesIO` buffer for further processing.

3. Best Practices:
   - Use `set_xlim` and `set_ylim` to manually control plot boundaries.
   - Combine annotations and shapes to create informative and visually appealing visualizations.
   - Save plots in high-resolution formats for publication-quality graphics.
"""

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# --- Matplotlib Configuration ---
# Customize global parameters using plt.rc
plt.rc('figure', figsize=(10, 6))  # Set default figure size
font_options = {
    'family': 'monospace',
    'weight': 'bold',
    'size': 'small'
}
plt.rc('font', **font_options)  # Customize font settings

# --- Line Plots with pandas ---
# Create a Series and plot it
s = pd.Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))
plt.figure(figsize=(8, 4))
s.plot(label='Random Walk', style='ko-', alpha=0.7)
plt.title("Simple Series Plot")
plt.xlabel("Index")
plt.ylabel("Value")
plt.legend()
plt.grid(True)
plt.show()

# Create a DataFrame and plot multiple lines
df = pd.DataFrame(
    np.random.randn(10, 4).cumsum(axis=0),
    columns=['A', 'B', 'C', 'D'],
    index=np.arange(0, 100, 10)
)
plt.figure(figsize=(10, 6))
df.plot(style=['r--', 'g-.', 'b:', 'k-'], alpha=0.7)
plt.title("Simple DataFrame Plot")
plt.xlabel("Index")
plt.ylabel("Value")
plt.legend(title="Legend")
plt.grid(True)
plt.show()

# --- Using Seaborn for Enhanced Aesthetics ---
# Import seaborn to modify default styles
sns.set_style("whitegrid")  # Use a built-in seaborn style

# Plot the same DataFrame with seaborn aesthetics
plt.figure(figsize=(10, 6))
df.plot(style=['r--', 'g-.', 'b:', 'k-'], alpha=0.7)
plt.title("DataFrame Plot with Seaborn Aesthetics")
plt.xlabel("Index")
plt.ylabel("Value")
plt.legend(title="Legend")
plt.grid(True)
plt.show()

"""
Key Takeaways:
1. Matplotlib Configuration:
   - Use `plt.rc` to customize global parameters like figure size, fonts, and grid styles.
   - Modify the matplotlibrc file for persistent configuration changes.

2. Line Plots with pandas:
   - `Series.plot()` creates line plots by default, using the index for the x-axis.
   - `DataFrame.plot()` plots each column as a separate line on the same subplot, with an automatic legend.
   - Additional keyword arguments (e.g., `style`, `alpha`, `xticks`) allow further customization.

3. Seaborn for Enhanced Aesthetics:
   - Importing seaborn modifies default matplotlib styles for better readability and visual appeal.
   - Use `sns.set_style()` to apply built-in themes like "whitegrid" or "darkgrid."

4. Best Practices:
   - Customize plots programmatically using matplotlib's API or pandas' built-in methods.
   - Use seaborn for quick aesthetic improvements without extensive manual configuration.
   - Leverage pandas' `subplots=True` option for plotting DataFrame columns in separate subplots when needed.
"""

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# --- Bar Plots with pandas ---
# Create a Series for bar plots
data = pd.Series(np.random.rand(16), index=list('abcdefghijklmnop'))

# Plot vertical and horizontal bar plots
fig, axes = plt.subplots(2, 1, figsize=(8, 8))
data.plot.bar(ax=axes[0], color='k', alpha=0.7)
data.plot.barh(ax=axes[1], color='k', alpha=0.7)
axes[0].set_title("Vertical Bar Plot")
axes[1].set_title("Horizontal Bar Plot")
plt.tight_layout()
plt.show()

# Create a DataFrame for grouped bar plots
df = pd.DataFrame(
    np.random.rand(6, 4),
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus')
)

# Plot grouped bar plot
plt.figure(figsize=(8, 4))
df.plot.bar(alpha=0.7)
plt.title("DataFrame Grouped Bar Plot")
plt.tight_layout()
plt.show()

# Plot stacked bar plot
plt.figure(figsize=(8, 4))
df.plot.barh(stacked=True, alpha=0.5)
plt.title("DataFrame Stacked Bar Plot")
plt.tight_layout()
plt.show()

# --- Visualizing Value Frequencies ---
# Example: Frequency of values in a Series
s = pd.Series(np.random.choice(['a', 'b', 'c'], size=100))
s.value_counts().plot.bar(color='k', alpha=0.7)
plt.title("Frequency of Values in Series")
plt.tight_layout()
plt.show()

# --- Cross-Tabulation and Normalized Bar Plots ---
# Load tipping dataset
tips = pd.read_csv('examples/tips.csv')

# Create a cross-tabulation of day vs. party size
party_counts = pd.crosstab(tips['day'], tips['size'])

# Focus on party sizes between 2 and 5
party_counts = party_counts.loc[:, 2:5]

# Normalize each row to sum to 1
party_pcts = party_counts.div(party_counts.sum(axis=1), axis=0)

# Plot normalized bar plot
plt.figure(figsize=(8, 4))
party_pcts.plot.bar()
plt.title("Fraction of Parties by Size on Each Day")
plt.tight_layout()
plt.show()

# --- Bar Plots with seaborn ---
# Add tip percentage column
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])

# Bar plot of tip percentage by day
plt.figure(figsize=(8, 4))
sns.barplot(x='tip_pct', y='day', data=tips, orient='h')
plt.title("Tipping Percentage by Day with Error Bars")
plt.tight_layout()
plt.show()

# Bar plot with hue for additional categorical variable (time)
plt.figure(figsize=(8, 4))
sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')
plt.title("Tipping Percentage by Day and Time")
plt.tight_layout()
plt.show()

# Customize seaborn style
sns.set(style="whitegrid")

"""
Key Takeaways:
1. Bar Plots with pandas:
   - Use `plot.bar()` and `plot.barh()` for vertical and horizontal bar plots.
   - For DataFrames, bars are grouped by default; use `stacked=True` for stacked bars.

2. Visualizing Value Frequencies:
   - Use `value_counts()` to compute frequencies and visualize them with `plot.bar()`.

3. Cross-Tabulation and Normalization:
   - Use `pd.crosstab()` to create cross-tabulations.
   - Normalize rows using `div()` and `.sum()` to compare proportions across categories.

4. Bar Plots with seaborn:
   - Seaborn simplifies plotting aggregated data with `sns.barplot()`.
   - Use the `hue` parameter to split bars by an additional categorical variable.
   - Seaborn provides enhanced aesthetics and confidence intervals by default.

5. Best Practices:
   - Use normalization when comparing proportions across categories.
   - Leverage seaborn for quick, aesthetically pleasing visualizations with minimal code.
   - Customize styles using `sns.set()` for consistent appearance across plots.
"""

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# --- Histograms and Density Plots ---
# Load tipping dataset
tips = pd.read_csv('examples/tips.csv')

# Add tip percentage column
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])

# Plot histogram of tip percentages
plt.figure(figsize=(8, 4))
tips['tip_pct'].plot.hist(bins=50, color='k', alpha=0.7)
plt.title("Histogram of Tip Percentages")
plt.xlabel("Tip Percentage")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# Plot density plot of tip percentages
plt.figure(figsize=(8, 4))
tips['tip_pct'].plot.density(color='k')
plt.title("Density Plot of Tip Percentages")
plt.xlabel("Tip Percentage")
plt.ylabel("Density")
plt.tight_layout()
plt.show()

# Use seaborn's distplot for combined histogram and density plot
plt.figure(figsize=(8, 4))
sns.distplot(tips['tip_pct'], bins=50, color='k')
plt.title("Histogram and Density Plot of Tip Percentages")
plt.xlabel("Tip Percentage")
plt.ylabel("Density")
plt.tight_layout()
plt.show()

# --- Scatter Plots ---
# Load macrodata dataset
macro = pd.read_csv('examples/macrodata.csv')
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
trans_data = np.log(data).diff().dropna()

# Scatter plot with regression line
plt.figure(figsize=(8, 4))
sns.regplot('m1', 'unemp', data=trans_data, scatter_kws={'alpha': 0.5})
plt.title("Scatter Plot: Changes in log(m1) vs log(unemp)")
plt.tight_layout()
plt.show()

# --- Pair Plots ---
# Create pair plot matrix
sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})
plt.suptitle("Pair Plot Matrix of Statsmodels Macro Data", y=1.02)
plt.tight_layout()
plt.show()

# --- Facet Grids and Categorical Data ---
# Bar plot with facet grid by smoker and time
plt.figure(figsize=(12, 6))
sns.factorplot(x='day', y='tip_pct', hue='time', col='smoker',
               kind='bar', data=tips[tips.tip_pct < 1])
plt.suptitle("Tipping Percentage by Day/Time/Smoker", y=1.02)
plt.tight_layout()
plt.show()

# Expand facet grid by adding rows for time
plt.figure(figsize=(12, 8))
sns.factorplot(x='day', y='tip_pct', row='time', col='smoker',
               kind='bar', data=tips[tips.tip_pct < 1])
plt.suptitle("Tipping Percentage by Day; Facet by Time/Smoker", y=1.02)
plt.tight_layout()
plt.show()

# Box plot of tip_pct by day
plt.figure(figsize=(8, 4))
sns.factorplot(x='tip_pct', y='day', kind='box',
               data=tips[tips.tip_pct < 0.5])
plt.title("Box Plot of Tip Percentage by Day")
plt.tight_layout()
plt.show()

"""
Key Takeaways:
1. Histograms and Density Plots:
   - Use `plot.hist()` to create histograms for visualizing value frequency.
   - Use `plot.density()` or `sns.kdeplot()` for kernel density estimates (KDE).
   - Combine histograms and KDEs using `sns.distplot()`.

2. Scatter Plots:
   - Use `sns.regplot()` to create scatter plots with linear regression lines.
   - Scatter plots are useful for examining relationships between two variables.

3. Pair Plots:
   - Use `sns.pairplot()` to visualize pairwise relationships in a dataset.
   - Diagonal elements can display histograms or KDEs for individual variables.

4. Facet Grids and Categorical Data:
   - Use `sns.factorplot()` to create faceted plots for categorical data.
   - Facet grids allow splitting data by additional grouping dimensions.
   - Box plots are effective for summarizing distributions with medians, quartiles, and outliers.

5. Best Practices:
   - Use transparency (`alpha`) to improve readability in dense plots.
   - Customize titles, labels, and legends for better interpretability.
   - Leverage seaborn's built-in functions for quick and aesthetically pleasing visualizations.
"""

CHAPTER 10 - Data Aggregation and Group Operations

In [None]:
import pandas as pd
import numpy as np

# Create a sample DataFrame
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

# --- Grouping and Aggregation ---
# Group by a single column and compute the mean
grouped_single = df['data1'].groupby(df['key1'])
print("Grouped by 'key1' and computed mean:\n", grouped_single.mean())

# Group by multiple columns and compute the mean
grouped_multiple = df['data1'].groupby([df['key1'], df['key2']]).mean()
print("\nGrouped by 'key1' and 'key2' and computed mean:\n", grouped_multiple)

# Unstack the hierarchical index
print("\nUnstacked result:\n", grouped_multiple.unstack())

# Group using external arrays
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
grouped_external = df['data1'].groupby([states, years]).mean()
print("\nGrouped by external arrays (states and years):\n", grouped_external)

# --- Aggregating Multiple Columns ---
# Group by a single column and aggregate all numeric columns
grouped_all_numeric = df.groupby('key1').mean()
print("\nGrouped by 'key1' and aggregated all numeric columns:\n", grouped_all_numeric)

# Group by multiple columns and aggregate all numeric columns
grouped_multi_numeric = df.groupby(['key1', 'key2']).mean()
print("\nGrouped by 'key1' and 'key2' and aggregated all numeric columns:\n", grouped_multi_numeric)

# --- Group Sizes ---
# Get group sizes
group_sizes = df.groupby(['key1', 'key2']).size()
print("\nGroup sizes:\n", group_sizes)

# --- Iterating Over Groups ---
# Iterate over groups
print("\nIterating over groups:")
for name, group in df.groupby('key1'):
    print(name)
    print(group)

# Iterate over groups with multiple keys
print("\nIterating over groups with multiple keys:")
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

# Create a dictionary of groups
pieces = dict(list(df.groupby('key1')))
print("\nDictionary of groups:\n", pieces)

# --- Grouping on Other Axes ---
# Group columns by dtype
grouped_by_dtype = df.groupby(df.dtypes, axis=1)
print("\nGrouped columns by dtype:")
for dtype, group in grouped_by_dtype:
    print(dtype)
    print(group)

# --- Selecting Columns for Aggregation ---
# Select a single column for aggregation
grouped_data2 = df.groupby(['key1', 'key2'])['data2'].mean()
print("\nMean of 'data2' grouped by 'key1' and 'key2':\n", grouped_data2)

# Select multiple columns for aggregation
grouped_subset = df.groupby(['key1', 'key2'])[['data2']].mean()
print("\nMean of 'data2' as a DataFrame grouped by 'key1' and 'key2':\n", grouped_subset)

In [None]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# =============================================
# 1. Create Sample DataFrames
# =============================================

# Create main DataFrame with random data
people = pd.DataFrame(
    np.random.randn(8, 6),
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis', 'Alice', 'Bob', 'Carol']
)

# Introduce some NaN values
people.iloc[2:4, [1, 2]] = np.nan
people.iloc[5, 3:] = np.nan

# Create tipping dataset example
data = {
    'total_bill': [16.99, 10.34, 21.01, 23.68, 24.59, 25.29],
    'tip': [1.01, 1.66, 3.50, 3.31, 3.61, 4.71],
    'sex': ['Female', 'Male', 'Male', 'Male', 'Female', 'Female'],
    'smoker': ['No', 'Yes', 'No', 'No', 'No', 'Yes'],
    'day': ['Sun', 'Sun', 'Sun', 'Sun', 'Sun', 'Sun'],
    'time': ['Dinner', 'Dinner', 'Dinner', 'Dinner', 'Dinner', 'Dinner'],
    'size': [2, 3, 3, 2, 4, 4]
}
tips = pd.DataFrame(data)
tips['tip_pct'] = tips['tip'] / tips['total_bill']

# Create MultiIndex DataFrame
arrays = [
    ['US', 'US', 'US', 'JP', 'JP', 'UK'],
    [1, 3, 5, 1, 3, 2]
]
columns = pd.MultiIndex.from_arrays(arrays, names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 6), columns=columns)

# =============================================
# 2. Basic Grouping Operations
# =============================================

# Group columns using dictionary
mapping = {
    'a': 'red', 'b': 'red', 'c': 'blue',
    'd': 'blue', 'e': 'red', 'f': 'green'
}
by_column = people.groupby(mapping, axis=1)

# Group rows by index length
by_name_length = people.groupby(len)

# Group by multiple columns in tipping data
by_day_smoker = tips.groupby(['day', 'smoker'])

# =============================================
# 3. Aggregation Examples
# =============================================

# Basic aggregations
print("Sum by color groups:")
print(by_column.sum())

print("\nMean by name length:")
print(by_name_length.mean())

# Multiple aggregations
print("\nMultiple aggregations by day/smoker:")
print(by_day_smoker['tip_pct'].agg(['mean', 'std', 'count']))

# Custom aggregation function
def data_range(arr):
    return arr.max() - arr.min()

print("\nCustom range aggregation:")
print(by_column.agg(data_range))

# Different aggregations per column
print("\nColumn-specific aggregations:")
print(tips.groupby('sex').agg({
    'total_bill': ['sum', 'mean'],
    'tip': ['max', 'min'],
    'tip_pct': 'std'
}))

# =============================================
# 4. Advanced Grouping Operations
# =============================================

# Grouping with functions
print("\nGroup by first letter of index:")
print(people.groupby(lambda x: x[0]).mean())

# Grouping with multiple keys
keys = ['group1', 'group1', 'group1', 'group2', 'group2', 'group1', 'group2', 'group2']
print("\nGroup by name length and custom keys:")
print(people.groupby([len, keys]).sum())

# Grouping by index level
print("\nGroup by country level:")
print(hier_df.groupby(level='cty', axis=1).mean())

# =============================================
# 5. Apply and Transform Examples
# =============================================

# Top N values per group
def top_n(df, n=3, column='a'):
    return df.sort_values(by=column, ascending=False).head(n)

print("\nTop 3 values per group:")
print(people.groupby(len).apply(top_n))

# Normalize within groups
def zscore(x):
    return (x - x.mean()) / x.std()

print("\nZ-scores within groups:")
print(people.groupby(len).transform(zscore))

# Filter groups
def filter_func(x):
    return x['a'].mean() > 0

print("\nGroups with mean of 'a' > 0:")
print(people.groupby(len).filter(filter_func))

# =============================================
# 6. Additional Examples
# =============================================

# Time-based grouping (if datetime index)
dates = pd.date_range('2023-01-01', periods=8)
people_time = people.copy()
people_time.index = dates
print("\nMonthly resampling:")
print(people_time.resample('M').mean())

# Expanding window operations
print("\nExpanding mean per column:")
print(people.expanding().mean())

# Rolling window operations
print("\nRolling 3-row mean:")
print(people.rolling(3).mean())

# =============================================
# 7. Output All Results to CSV
# =============================================

# Save results to CSV files
by_column.sum().to_csv('groupby_sum.csv')
by_name_length.mean().to_csv('groupby_mean.csv')
tips.groupby(['day', 'smoker']).mean().to_csv('tips_grouped.csv')

print("\nAll operations completed. Results saved to CSV files.")

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns  # Optional for enhanced aesthetics

# --- Sample Data Preparation ---
# Create synthetic datasets to mimic examples
np.random.seed(12345)

# Synthetic 'tips' dataset
tips = pd.DataFrame({
    'total_bill': np.random.uniform(10, 50, 244),
    'tip': np.random.uniform(1, 10, 244),
    'smoker': np.random.choice(['Yes', 'No'], 244),
    'day': np.random.choice(['Sun', 'Sat', 'Thur', 'Fri'], 244),
    'time': np.random.choice(['Lunch', 'Dinner'], 244),
    'size': np.random.randint(1, 6, 244)
})
tips['tip_pct'] = tips['tip'] / tips['total_bill']

# Synthetic 'movies' dataset
movies = pd.DataFrame({
    'movie_id': range(1, 11),
    'title': [f"Movie {i}" for i in range(1, 11)],
    'genres': [
        'Animation|Children|Comedy', 'Adventure|Children|Fantasy',
        'Comedy|Romance', 'Comedy|Drama', 'Comedy',
        'Action|Crime|Thriller', 'Comedy|Romance',
        'Adventure|Children', 'Action', 'Action|Adventure|Thriller'
    ]
})

# --- GroupBy Mechanics ---
# Group by single column and compute mean
grouped_single = tips['tip_pct'].groupby(tips['day'])
print("Grouped by 'day' (mean):\n", grouped_single.mean())

# Group by multiple columns and compute mean
grouped_multiple = tips.groupby(['time', 'smoker'])['tip_pct'].mean()
print("\nGrouped by 'time' and 'smoker' (mean):\n", grouped_multiple)

# Iterate over groups
print("\nIterating over groups:")
for (time, smoker), group in tips.groupby(['time', 'smoker']):
    print(f"Time: {time}, Smoker: {smoker}")
    print(group.head())

# --- Pivot Tables ---
# Basic pivot table with mean aggregation
pivot_table_basic = tips.pivot_table(
    values='tip_pct', 
    index='day', 
    columns='smoker', 
    aggfunc='mean'
)
print("\nBasic Pivot Table:\n", pivot_table_basic)

# Pivot table with margins (partial totals)
pivot_table_margins = tips.pivot_table(
    values='tip_pct', 
    index='day', 
    columns='smoker', 
    aggfunc='mean', 
    margins=True
)
print("\nPivot Table with Margins:\n", pivot_table_margins)

# --- Cross-Tabulation ---
# Cross-tabulation of 'day' and 'smoker'
cross_tab = pd.crosstab(tips['day'], tips['smoker'], margins=True)
print("\nCross-Tabulation:\n", cross_tab)

# Cross-tabulation with multiple groupings
cross_tab_multi = pd.crosstab([tips['time'], tips['day']], tips['smoker'], margins=True)
print("\nCross-Tabulation with Time/Day:\n", cross_tab_multi)

# --- Data Transformation ---
# Create indicator/dummy variables for genres
all_genres = set('|'.join(movies['genres']).split('|'))
dummies = pd.DataFrame(0, index=movies.index, columns=sorted(all_genres))
for i, row in movies.iterrows():
    genres = row['genres'].split('|')
    dummies.loc[i, genres] = 1
movies_with_dummies = pd.concat([movies, dummies], axis=1)
print("\nMovies with Dummy Variables:\n", movies_with_dummies.head())

# Discretization with pd.cut
values = np.random.randn(1000)
cats = pd.cut(values, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
print("\nDiscretized Data (pd.cut):\n", pd.value_counts(cats))

# Detect and filter outliers
data = pd.DataFrame(np.random.randn(1000, 4))
outliers = data[(np.abs(data) > 3).any(axis=1)]
print("\nOutliers Detected:\n", outliers.head())

# --- Merging Data ---
# Many-to-one merge example
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
merged = pd.merge(df1, df2, on='key')
print("\nMerged DataFrame (Many-to-One):\n", merged)

# Merge with suffixes for overlapping columns
df3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data': np.random.rand(7)})
df4 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data': np.random.rand(3)})
merged_suffixes = pd.merge(df3, df4, on='key', suffixes=('_left', '_right'))
print("\nMerged with Suffixes:\n", merged_suffixes)

# --- Visualization (Optional) ---
try:
    # Plot distributions of tip_pct
    sns.histplot(tips['tip_pct'], kde=True, bins=30)
    plt.title("Distribution of Tip Percentage")
    plt.show()

    # Pair plot of transformed macro data
    macro = pd.DataFrame({
        'cpi': np.random.randn(201),
        'm1': np.random.randn(201),
        'tbilrate': np.random.randn(201),
        'unemp': np.random.randn(201)
    })
    sns.pairplot(macro, diag_kind='kde')
    plt.suptitle("Pair Plot of Economic Indicators", y=1.02)
    plt.show()
except NameError:
    print("Seaborn not imported; skipping visualization examples.")

"""
Key Takeaways:
1. **GroupBy Mechanics**:
   - Split data using `groupby`, apply aggregations (mean, sum, etc.), and combine results.
   - Iterate over groups for custom processing.
   - Use hierarchical indexing for multi-level groups.

2. **Pivot Tables**:
   - `pivot_table` simplifies multi-dimensional aggregation.
   - Use `margins=True` to include partial and grand totals.

3. **Cross-Tabulation**:
   - `pd.crosstab` computes frequency tables for categorical variables.
   - Combine multiple columns for faceted analysis.

4. **Data Transformation**:
   - Create dummy variables for categorical data using loops and `pd.DataFrame`.
   - Use `pd.cut`/`pd.qcut` for discretization.
   - Detect outliers with boolean indexing and `np.abs`.

5. **Merging Data**:
   - `pd.merge` handles relational joins (many-to-one, many-to-many).
   - Use `suffixes` to resolve column name conflicts.

6. **Best Practices**:
   - Use `aggfunc` to specify custom aggregation logic.
   - Handle missing data explicitly during transformations.
   - Leverage vectorized operations for efficiency.
"""

In [None]:
import pandas as pd

# Load the tipping dataset
tips = pd.read_csv('examples/tips.csv')

# Add tip percentage column
tips['tip_pct'] = tips['tip'] / tips['total_bill']

# --- Pivot Tables ---
# Basic pivot table: group means arranged by day and smoker
pivot_basic = tips.pivot_table(index=['day', 'smoker'])
print("Basic Pivot Table:\n", pivot_basic)

# Pivot table with selected columns and multiple groupings
pivot_custom = tips.pivot_table(
    ['tip_pct', 'size'], 
    index=['time', 'day'], 
    columns='smoker'
)
print("\nPivot Table with Selected Columns and Multiple Groupings:\n", pivot_custom)

# Pivot table with margins (partial totals)
pivot_margins = tips.pivot_table(
    ['tip_pct', 'size'], 
    index=['time', 'day'], 
    columns='smoker', 
    margins=True
)
print("\nPivot Table with Margins:\n", pivot_margins)

# Pivot table with a different aggregation function (e.g., count or len)
pivot_count = tips.pivot_table(
    'tip_pct', 
    index=['time', 'smoker'], 
    columns='day', 
    aggfunc=len, 
    margins=True
)
print("\nPivot Table with Count Aggregation:\n", pivot_count)

# Pivot table with fill_value to handle missing values
pivot_fill_value = tips.pivot_table(
    'tip_pct', 
    index=['time', 'size', 'smoker'], 
    columns='day', 
    aggfunc='mean', 
    fill_value=0
)
print("\nPivot Table with Fill Value:\n", pivot_fill_value)

# --- Cross-Tabulations ---
# Example survey data
data = pd.DataFrame({
    'Sample': range(1, 11),
    'Nationality': ['USA', 'Japan', 'USA', 'Japan', 'Japan', 'Japan', 'USA', 'USA', 'Japan', 'USA'],
    'Handedness': ['Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 
                   'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed', 
                   'Right-handed', 'Right-handed']
})

# Cross-tabulation of Nationality and Handedness
crosstab_basic = pd.crosstab(data['Nationality'], data['Handedness'], margins=True)
print("\nBasic Cross-Tabulation:\n", crosstab_basic)

# Cross-tabulation with multiple groupings
crosstab_multi = pd.crosstab([tips['time'], tips['day']], tips['smoker'], margins=True)
print("\nCross-Tabulation with Multiple Groupings:\n", crosstab_multi)

CHAPTER 11 - Time Series

In [None]:
# 🕒 **Time Series Basics with Python & pandas** 🕒

import pandas as pd
from datetime import datetime, timedelta
from dateutil.parser import parse  # For flexible date parsing

# --- 1. Working with datetime Objects ---
# Get current date and time
now = datetime.now()
print("Current Date/Time:", now)  # e.g., 2023-10-05 14:30:45.123456

# Extract components (year, month, day)
print("Year:", now.year)
print("Month:", now.month)
print("Day:", now.day)

# Calculate time difference (timedelta)
start_date = datetime(2023, 1, 1)
end_date = start_date + timedelta(days=10, hours=5)
print("10 Days Later:", end_date)  # 2023-01-11 05:00:00

# --- 2. Converting Between Strings and Dates ---
# Parse a string to datetime
date_str = "2023-10-05"
date_obj = datetime.strptime(date_str, "%Y-%m-%d")
print("Parsed Date:", date_obj)  # 2023-10-05 00:00:00

# Convert datetime to string
formatted_date = date_obj.strftime("%A, %B %d, %Y")
print("Formatted Date:", formatted_date)  # Thursday, October 05, 2023

# Use dateutil.parser for convenience
flexible_date = parse("6/12/2011", dayfirst=True)  # Parses as December 6, 2011
print("Parsed with dateutil:", flexible_date)

# --- 3. pandas Timestamps and Time Series ---
# Convert strings to pandas datetime
dates = ["2023-01-01", "2023-01-02", "2023-01-03", None]
ts = pd.to_datetime(dates)
print("pandas DatetimeIndex:\n", ts)
print("Missing Date (NaT):", ts[3])  # NaT = "Not a Time"

# Check for missing dates
print("Is NaT?", pd.isnull(ts[3]))  # True

# --- 4. Time Series Operations ---
# Create a time series DataFrame
df = pd.DataFrame({
    "date": pd.date_range(start="2023-01-01", periods=5, freq="D"),
    "value": [10, 20, 30, 40, 50]
})
df.set_index("date", inplace=True)
print("\nTime Series DataFrame:\n", df)

# Resample data (e.g., monthly average)
monthly_avg = df.resample("M").mean()
print("\nMonthly Average:\n", monthly_avg)

# --- 5. Handling Time Zones (Optional) ---
# Localize to a time zone
df_utc = df.tz_localize("UTC")
print("\nUTC Time Series:\n", df_utc)

# Convert to another time zone
df_eastern = df_utc.tz_convert("US/Eastern")
print("\nUS/Eastern Time Series:\n", df_eastern)

# --- 6. Date Ranges ---
# Generate a range of dates
date_range = pd.date_range(start="2023-01-01", end="2023-01-10", freq="2D")
print("\nDate Range (every 2 days):\n", date_range)

"""
✨ **Key Takeaways** ✨
1. **datetime Module**: 
   - Use `datetime.now()` for current timestamps.
   - `timedelta` handles time differences (e.g., add days/hours).

2. **String Conversion**:
   - `strptime` parses strings to dates (with format codes).
   - `strftime` formats dates into strings.
   - `dateutil.parser.parse` simplifies parsing ambiguous dates.

3. **pandas Time Series**:
   - `pd.to_datetime()` converts lists/columns to datetime.
   - `NaT` represents missing timestamp data.
   - `resample()` aggregates time series (e.g., daily → monthly).

4. **Time Zones**:
   - Use `tz_localize()` and `tz_convert()` for time zone handling.

5. **Common Pitfalls**:
   - `dateutil.parser` may misinterpret strings like '42' as years.
   - Always specify `dayfirst=True` for international dates (e.g., '6/12/2011' → Dec 6).
"""

In [None]:
# 📅 **Time Series Basics in pandas** 📅
import pandas as pd
import numpy as np
from datetime import datetime

# --- Create a Time Series ---
# Generate dates using pandas date_range
dates = pd.date_range(start='2023-01-01', periods=6, freq='D')  # 6 daily dates starting Jan 1, 2023

# Create a time series (Series with DatetimeIndex)
ts = pd.Series(np.random.randn(6), index=dates)
print("Time Series Example:")
print(ts)

# --- Indexing and Selection ---
# Access data using date strings
print("\nValue on 2023-01-03:")
print(ts['2023-01-03'])  # Equivalent to ts[dates[2]]

# Slice by year/month (works for longer time series)
long_ts = pd.Series(np.random.randn(365), index=pd.date_range('2023-01-01', periods=365))
print("\nJanuary 2023 Data:")
print(long_ts['2023-01'])

# Slice between dates (even if exact dates don't exist!)
print("\nData from Jan 3 to Jan 5, 2023:")
print(ts['2023-01-03':'2023-01-05'])

# --- Truncate Time Series ---
# Keep data before/after a specific date
truncated_ts = ts.truncate(before='2023-01-03', after='2023-01-05')
print("\nTruncated Time Series:")
print(truncated_ts)

# --- Time Series in DataFrames ---
# Create a DataFrame with daily dates
df = pd.DataFrame({
    'Value': np.random.rand(5),
    'Category': ['A', 'B', 'A', 'B', 'A']
}, index=pd.date_range('2023-01-01', periods=5))

print("\nTime Series DataFrame:")
print(df)

# Slice DataFrame by year/month
print("\nJanuary 2023 DataFrame Slice:")
print(df.loc['2023-01'])

# --- Time-Based Operations ---
# Shift data (e.g., lag by 1 day)
shifted_ts = ts.shift(1)
print("\nShifted Time Series (1-day lag):")
print(shifted_ts)

# Calculate percentage change
pct_change = ts.pct_change()
print("\nDaily Percentage Change:")
print(pct_change)

# --- Handling Time Zones (Optional) ---
# Convert to a time zone-aware index
ts_utc = ts.tz_localize('UTC')
print("\nUTC Time Series:")
print(ts_utc)

# Convert to another time zone
ts_eastern = ts_utc.tz_convert('US/Eastern')
print("\nUS/Eastern Time Series:")
print(ts_eastern)

In [None]:
# 📅 **Time Series with Duplicate Indices & Date Ranges** 📅
import pandas as pd
import numpy as np

# --- Time Series with Duplicate Indices ---
# Create a time series with duplicate dates
dates = pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-02', '2023-01-02', '2023-01-03'])
dup_ts = pd.Series([10, 20, 30, 40, 50], index=dates)
print("Time Series with Duplicates:")
print(dup_ts)

# Check if the index is unique
print("\nIs the index unique?", dup_ts.index.is_unique)  # False

# Access data (returns scalar if unique, slice if duplicated)
print("\nData for 2023-01-03 (unique):")
print(dup_ts['2023-01-03'])  # 50
print("\nData for 2023-01-02 (duplicated):")
print(dup_ts['2023-01-02'])  # Shows all entries for that date

# Aggregate duplicates (e.g., mean, count)
grouped = dup_ts.groupby(level=0)
print("\nMean of Duplicates:")
print(grouped.mean())
print("\nCount of Duplicates:")
print(grouped.count())

# --- Generating Date Ranges ---
# Daily frequency between two dates
date_range_daily = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')
print("\nDaily Date Range:")
print(date_range_daily)

# Business month end dates for 2023
date_range_monthly = pd.date_range(start='2023-01-01', end='2023-12-31', freq='BM')
print("\nBusiness Month Ends:")
print(date_range_monthly)

# Generate 5 dates starting from a specific datetime (with time)
date_with_time = pd.date_range(start='2023-01-01 12:30:00', periods=5, freq='D')
print("\nDate Range with Time:")
print(date_with_time)

# Normalize times to midnight
date_normalized = pd.date_range(start='2023-01-01 12:30:00', periods=5, normalize=True)
print("\nNormalized Date Range (Midnight):")
print(date_normalized)

# --- Shifting and Frequency Conversion ---
# Convert irregular time series to fixed daily frequency
irregular_dates = pd.to_datetime(['2023-01-01', '2023-01-03', '2023-01-05'])
ts = pd.Series([1.1, 2.2, 3.3], index=irregular_dates)
print("\nOriginal Irregular Time Series:")
print(ts)

# Resample to daily frequency (introduces NaNs for missing dates)
resampled_ts = ts.resample('D').mean()
print("\nResampled to Daily Frequency:")
print(resampled_ts)

# Shift data by 1 day (forward fill NaNs)
shifted_ts = resampled_ts.shift(1, fill_value=0)
print("\nShifted Time Series (1 Day):")
print(shifted_ts)

In [None]:
# 📅 **Working with Date Frequencies and Offsets** 📅
import pandas as pd
import numpy as np
from datetime import datetime
from pandas.tseries.offsets import Day, MonthEnd, Hour, Minute

# --- 1. Date Ranges with Frequencies ---
# Daily frequency (default)
daily_dates = pd.date_range(start='2023-01-01', periods=5, freq='D')
print("Daily Dates:\n", daily_dates)

# 4-hourly frequency
hourly_dates = pd.date_range(start='2023-01-01', periods=5, freq='4H')
print("\n4-Hourly Dates:\n", hourly_dates)

# Custom: Third Friday of each month (WOM-3FRI)
custom_dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='WOM-3FRI')
print("\nThird Fridays of Each Month:\n", custom_dates)

# --- 2. Shifting Data ---
# Create a monthly time series
ts = pd.Series(np.random.rand(4), index=pd.date_range('2023-01-01', periods=4, freq='M'))
print("\nOriginal Time Series:\n", ts)

# Shift data by 1 period (introduces NaNs)
shifted = ts.shift(1)
print("\nShifted by 1 Period:\n", shifted)

# Shift dates by 1 month (preserves frequency)
shifted_dates = ts.shift(1, freq='M')
print("\nShifted Dates by 1 Month:\n", shifted_dates)

# --- 3. Custom Frequencies ---
# Combine Hour and Minute offsets
custom_freq = Hour(2) + Minute(30)  # Equivalent to '2h30min'
print("\nCustom Frequency (2h30min):\n", custom_freq)

# Generate dates with 90-minute frequency
date_range_90min = pd.date_range('2023-01-01', periods=5, freq='90T')
print("\n90-Minute Date Range:\n", date_range_90min)

# --- 4. Anchored Offsets ---
# Roll to end of month
now = datetime(2023, 11, 17)
end_of_month = now + MonthEnd()
print("\nRolled to End of Month:\n", end_of_month)

# Roll back to start of month
start_of_month = now - MonthEnd()
print("\nRolled Back to Start of Month:\n", start_of_month)

# --- 5. Resampling vs. GroupBy ---
# Create irregular time series
ts = pd.Series(np.random.randn(20), index=pd.date_range('2023-01-15', periods=20, freq='4D'))
print("\nOriginal Irregular Time Series:\n", ts.head())

# Group by rolled dates (old method)
grouped = ts.groupby(MonthEnd().rollforward).mean()
print("\nGrouped by Month End (Old Method):\n", grouped)

# Resample to monthly frequency (preferred)
resampled = ts.resample('M').mean()
print("\nResampled to Monthly (Preferred):\n", resampled)

In [None]:
# 🌍 **Time Zone Handling in pandas** 🌍
import pandas as pd
import numpy as np

# --- Create a Time Series ---
# Generate naive datetime index (no time zone)
dates = pd.date_range('2023-03-12 02:00', periods=3, freq='H')
ts = pd.Series(np.random.rand(3), index=dates)
print("Naive Time Series (No Time Zone):")
print(ts)

# --- Localize to a Time Zone ---
# Add time zone awareness (e.g., US/Eastern)
ts_eastern = ts.tz_localize('US/Eastern')
print("\nLocalized to US/Eastern:")
print(ts_eastern)

# --- Convert to Another Time Zone ---
# Convert to UTC (Coordinated Universal Time)
ts_utc = ts_eastern.tz_convert('UTC')
print("\nConverted to UTC:")
print(ts_utc)

# Convert to Europe/Berlin time
ts_berlin = ts_eastern.tz_convert('Europe/Berlin')
print("\nConverted to Berlin Time:")
print(ts_berlin)

# --- Handle Daylight Saving Time (DST) Transitions ---
# Ambiguous time example (fall DST transition)
ambiguous_dates = pd.date_range('2023-11-05 01:00', periods=3, freq='30T', tz='US/Eastern')
print("\nAmbiguous Times During DST Transition:")
print(ambiguous_dates)

# Handle ambiguous times with 'NaT' (Not a Time)
try:
    pd.Timestamp('2023-11-05 01:30', tz='US/Eastern')
except pytz.AmbiguousTimeError:
    print("\nAmbiguousTimeError: Use 'ambiguous='NaT' to handle")

# --- Operations with Time Zone-Aware Data ---
# Combine two time zone-aware series (result is UTC)
ts1 = pd.Series([1], index=pd.date_range('2023-01-01', periods=1, tz='US/Eastern'))
ts2 = pd.Series([2], index=pd.date_range('2023-01-01', periods=1, tz='Europe/London'))
combined = ts1 + ts2
print("\nCombined Time Series (UTC):")
print(combined)

In [None]:
# 📅 **Timestamps ↔ Periods Conversion** 📅
import pandas as pd
import numpy as np

# --- 1. Convert Timestamps to Periods ---
# Create a monthly timestamp index
dates = pd.date_range('2023-01-01', periods=3, freq='M')
ts = pd.Series([1, 2, 3], index=dates)
print("Original Timestamp Series:")
print(ts)

# Convert to PeriodIndex (monthly periods)
periods = ts.to_period()
print("\nConverted to Periods:")
print(periods)

# --- 2. Convert Periods Back to Timestamps ---
# Convert PeriodIndex to timestamps (default: start of period)
timestamps = periods.to_timestamp()
print("\nConverted Back to Timestamps (Start):")
print(timestamps)

# Convert to end-of-period timestamps
timestamps_end = periods.to_timestamp(how='end')
print("\nConverted to End-of-Period Timestamps:")
print(timestamps_end)

# --- 3. Create PeriodIndex from Arrays ---
# Synthetic data with year and quarter columns
data = pd.DataFrame({
    'year': [2021, 2021, 2022],
    'quarter': [1, 3, 4],
    'value': [100, 200, 300]
})

# Create quarterly PeriodIndex (Q-DEC = quarters ending in December)
period_index = pd.PeriodIndex(year=data['year'], quarter=data['quarter'], freq='Q-DEC')
data.set_index(period_index, inplace=True)
print("\nDataFrame with Quarterly PeriodIndex:")
print(data)

In [None]:
# 🔄 **Resampling Time Series Made Simple** 🔄
import pandas as pd
import numpy as np

# --- 1. Create Sample Data ---
# Daily time series (Jan 1-12, 2023)
dates = pd.date_range('2023-01-01', periods=12, freq='D')
ts = pd.Series(np.random.rand(12), index=dates)
print("Daily Data:")
print(ts.head())

# --- 2. Downsampling (Daily → Monthly) ---
# Convert daily data to monthly averages
monthly_avg = ts.resample('M').mean()
print("\nMonthly Averages:")
print(monthly_avg)

# --- 3. Custom Resampling (Minute Data) ---
# Minute-level data (12 minutes starting at 00:00)
minute_dates = pd.date_range('2023-01-01', periods=12, freq='T')
minute_ts = pd.Series(np.arange(12), index=minute_dates)
print("\nMinute-Level Data:")
print(minute_ts.head())

# Resample to 5-minute bins (closed on right, labeled on right)
resampled = minute_ts.resample('5T', closed='right', label='right').sum()
print("\n5-Minute Sum (Right-Closed):")
print(resampled)

# Adjust labels by shifting 1 second earlier
resampled_shifted = minute_ts.resample('5T', closed='right', label='right', loffset='-1s').sum()
print("\n5-Minute Sum (Adjusted Labels):")
print(resampled_shifted)

# --- 4. OHLC Aggregation (Finance Use Case) ---
# Compute Open, High, Low, Close for 5-minute intervals
ohlc = minute_ts.resample('5T').ohlc()
print("\nOHLC Resampling (5-Minute):")
print(ohlc)

# --- 5. Upsampling (Daily → Hourly) ---
# Convert daily data to hourly with forward fill
upsampled = ts.resample('H').ffill()
print("\nUpsampled to Hourly (Forward-Filled):")
print(upsampled.head())

# --- 6. Period Index Resampling ---
# Resample to monthly periods (instead of timestamps)
period_monthly = ts.resample('M', kind='period').sum()
print("\nMonthly Period Resampling:")
print(period_monthly)

In [None]:
# 🔼 **Upsampling Time Series Data** 🔼
import pandas as pd
import numpy as np

# --- 1. Create Weekly Data ---
dates = pd.date_range('2023-01-01', periods=2, freq='W-MON')  # Weekly on Mondays
frame = pd.DataFrame({
    'Sales': [100, 200],
    'Expenses': [50, 75]
}, index=dates)
print("Original Weekly Data:")
print(frame)

# --- 2. Upsample to Daily (No Aggregation) ---
# Convert to daily frequency (introduces NaNs)
daily_na = frame.resample('D').asfreq()
print("\nUpsampled to Daily (NaNs):")
print(daily_na.head())

# --- 3. Forward Fill (ffill) for Interpolation ---
# Fill missing values with previous week's data
daily_ffill = frame.resample('D').ffill()
print("\nForward-Filled Daily Data:")
print(daily_ffill.head())

# Limit forward fill to 2 days
daily_ffill_limited = frame.resample('D').ffill(limit=2)
print("\nForward Fill (Limited to 2 Days):")
print(daily_ffill_limited.head())

# --- 4. Resampling with Periods ---
# Create monthly PeriodIndex data
periods = pd.period_range('2023-01', '2023-12', freq='M')
annual_data = pd.DataFrame({
    'Revenue': np.random.rand(12)
}, index=periods)
print("\nMonthly Period Data:")
print(annual_data.head())

# Resample to quarterly (default: 'start' convention)
quarterly = annual_data.resample('Q').ffill()
print("\nQuarterly Resampled (Start Convention):")
print(quarterly)

# Resample to quarterly (end convention)
quarterly_end = annual_data.resample('Q', convention='end').ffill()
print("\nQuarterly Resampled (End Convention):")
print(quarterly_end)

In [None]:
# 📈 **Moving Window Functions for Time Series** 📈
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- 1. Create Synthetic Stock Data ---
# Generate daily dates for 2023
dates = pd.date_range('2023-01-01', periods=250, freq='B')
prices = pd.DataFrame({
    'AAPL': np.random.normal(150, 20, 250).cumsum() + 150,
    'SPX': np.random.normal(4000, 100, 250).cumsum() + 4000
}, index=dates)

# --- 2. Rolling Window Calculations ---
# Compute 30-day moving average (rolling mean)
rolling_mean = prices['AAPL'].rolling(30).mean()
print("\n30-Day Rolling Mean (First 5):\n", rolling_mean.head())

# Compute 30-day rolling standard deviation
rolling_std = prices['AAPL'].rolling(30, min_periods=10).std()
print("\n30-Day Rolling Std (First 5):\n", rolling_std.head())

# --- 3. Exponentially Weighted Moving Average (EWMA) ---
# EWMA with span=30 (reacts faster to recent changes)
ewma = prices['AAPL'].ewm(span=30).mean()
print("\nEWMA (First 5):\n", ewma.head())

# --- 4. Plotting Moving Averages ---
plt.figure(figsize=(10, 5))
prices['AAPL'].plot(label='Daily Prices', alpha=0.5)
rolling_mean.plot(label='30-Day MA', style='k--')
ewma.plot(label='30-Day EWMA', style='r-')
plt.title("Apple Stock Price vs. Moving Averages")
plt.legend()
plt.show()

# --- 5. Binary Rolling Functions (Correlation) ---
# Compute percentage returns
returns = prices.pct_change()

# Rolling 60-day correlation between AAPL and SPX
rolling_corr = returns['AAPL'].rolling(60).corr(returns['SPX'])
print("\n60-Day Rolling Correlation (First 5):\n", rolling_corr.head())

# Plot the rolling correlation
plt.figure(figsize=(10, 3))
rolling_corr.plot(title="Rolling 60-Day Correlation: AAPL vs. SPX")
plt.axhline(0, color='black', linestyle='--')
plt.show()

# --- 6. Expanding Window (Cumulative Mean) ---
# Compute cumulative mean from start to current date
expanding_mean = prices['AAPL'].expanding().mean()
print("\nExpanding Mean (First 5):\n", expanding_mean.head())

In [None]:
# 📊 **Pandas Time Series & GroupBy Essentials** 📊
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pandas.tseries.offsets import MonthEnd
from scipy.stats import percentileofscore  # For custom rolling function

# --- 1. GroupBy Mechanics ---
# Create a sample DataFrame
df = pd.DataFrame({
    'Category': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Values': np.random.rand(6),
    'Weights': np.random.randint(1, 10, 6)
})

# Group by 'Category' and compute mean
grouped = df.groupby('Category')
print("GroupBy Mean:\n", grouped['Values'].mean())

# Multiple aggregations
print("\nGroupBy Aggregations:\n", grouped.agg(['mean', 'sum']))

# Custom function: Weighted average
def weighted_avg(group):
    return (group['Values'] * group['Weights']).sum() / group['Weights'].sum()

print("\nWeighted Average by Category:\n", grouped.apply(weighted_avg))

# --- 2. Pivot Tables & Cross-Tabulation ---
# Create a synthetic dataset
data = pd.DataFrame({
    'Movie': ['Movie 1', 'Movie 2', 'Movie 3'],
    'Genres': ['Action|Adventure', 'Comedy|Romance', 'Action|Comedy'],
    'Rating': [8.5, 7.2, 6.9]
})

# Cross-tabulation of genres
cross_tab = pd.crosstab(data['Genres'], data['Rating'])
print("\nCross-Tabulation (Genres vs Ratings):\n", cross_tab)

# Pivot table for multi-level aggregation
pivot = data.pivot_table(index='Genres', values='Rating', aggfunc=['mean', 'max'])
print("\nPivot Table (Genres Aggregation):\n", pivot)

# --- 3. Time Series Basics ---
# Generate daily dates
dates = pd.date_range('2023-01-01', periods=5, freq='D')
ts = pd.Series(np.random.rand(5), index=dates)
print("\nTime Series with Daily Index:\n", ts)

# Convert to monthly periods
periods = ts.to_period('M')
print("\nConverted to Monthly Periods:\n", periods)

# --- 4. Time Zone Handling ---
# Localize to UTC and convert to New York time
ts_utc = ts.tz_localize('UTC')
ts_ny = ts_utc.tz_convert('America/New_York')
print("\nTime Zone Conversion (UTC → NY):\n", ts_ny)

# --- 5. Resampling ---
# Resample daily data to monthly mean
monthly = ts.resample('M').mean()
print("\nMonthly Resampled Data:\n", monthly)

# Upsample with forward fill
upsampled = ts.resample('12H').ffill()
print("\nUpsampled to 12-Hourly (Forward-Filled):\n", upsampled.head())

# --- 6. Moving Window Functions ---
# Rolling 3-day mean
rolling_mean = ts.rolling(3).mean()
print("\n3-Day Rolling Mean:\n", rolling_mean)

# Exponentially weighted moving average (EWMA)
ewma = ts.ewm(span=3).mean()
print("\n3-Day EWMA:\n", ewma)

# Custom rolling function: Percentile of score
returns = pd.Series(np.random.normal(0, 0.1, 100), index=pd.date_range('2023-01-01', periods=100, freq='B'))
percentile_rank = returns.rolling(30).apply(lambda x: percentileofscore(x, 0.02))
print("\n30-Day Rolling Percentile Rank (Sample):\n", percentile_rank.head())

# Plot rolling statistics
plt.figure(figsize=(10, 4))
ts.plot(label='Daily Data')
rolling_mean.plot(label='3-Day MA')
ewma.plot(label='3-Day EWMA')
plt.title("Moving Averages vs. Raw Data")
plt.legend()
plt.show()

CHAPTER 12 - Advanced pandas