# Speed Up Phython #

## Python Loop vs List Comprehension ##

In [None]:
import time

def slow_square(nums):
    result = []
    for n in nums:
        result.append(n*n)
    return result

# Timing
nums = list(range(1_000_0000))

start = time.time()
slow_square(nums)
end = time.time()
total_time1 = end - start
print("Slow loop time:", end - start, "seconds")


Slow loop time: 13.22475814819336 seconds


### Optimized Version: List Comprehension

In [11]:
import time

def fast_square(nums):
    return [n*n for n in nums]

# Timing
start = time.time()
fast_square(nums)
end = time.time()
total_time2 = end - start
print("List comprehension time:", end - start, "seconds")

print("Speedup percentage:", (total_time1-total_time2) *100/ total_time1)

List comprehension time: 6.795704126358032 seconds
Speedup percentage: 48.61377387618694


### Pure Python vs NumPy Vectorization

In [14]:
import time
import numpy as np

nums = np.arange(1_000_0000)

# -------- SLOW --------
def slow_square(nums):
    out = []
    for x in nums:
        out.append(x * x)
    return out

start = time.time()
slow_square(nums)
t1=time.time() - start
print("Pure Python:", time.time() - start, "sec")

# -------- FAST (NumPy) --------
start = time.time()
nums * nums
t2=time.time() - start
print("NumPy vectorized:", time.time() - start, "sec")
print("Speedup percentage:", (t1 - t2) * 100 / t1)


  out.append(x * x)


Pure Python: 8.030144453048706 sec
NumPy vectorized: 0.011869668960571289 sec
Speedup percentage: 99.85218610910461


### Using map() vs traditional loops

In [19]:
import time

nums = list(range(1_000_0000))

# SLOW
start = time.time()
out1 = []
for x in nums:
    out1.append(x + 5)
print("Loop:", time.time() - start)
t1=time.time() - start
# FAST
start = time.time()
list(map(lambda x: x + 5, nums))
t2=time.time() - start
print("map():", time.time() - start)
print("Speedup percentage:", (t1 - t2) * 100 / t1)

Loop: 2.0451040267944336
map(): 0.7907052040100098
Speedup percentage: 61.33667560914306


### Caching Results (functools.lru_cache)

In [None]:
import time
from functools import lru_cache
n1=40
# Expensive Fibonacci
def fib_slow(n):
    if n < 2:
        return n
    return fib_slow(n-1) + fib_slow(n-2)

start = time.time()
fib_slow(n1)  # slow recursive
t1=time.time() - start
print("Slow fib:", time.time() - start)

# FAST (cached)
@lru_cache(None)
def fib_fast(n):
    if n < 2:
        return n
    return fib_fast(n-1) + fib_fast(n-2)

start = time.time()
fib_fast(n1)
t2=time.time() - start
print("Fast fib (cached):", time.time() - start)
print("Speedup percentage:", (t1 - t2) * 100 / t1)

Slow fib: 30.153449296951294
Fast fib (cached): 0.0
Speedup percentage: 100.0


### Loop optimization using local variables

In [3]:
import time

nums = list(range(1_000_0000))

# SLOW: global lookup inside loop
def slow_global():
    out = []
    for x in nums:
        out.append(x * 3)
    return out

# FAST: copy functions/vars into local scope
def fast_local():
    out = []
    append = out.append     # local binding
    for x in nums:
        append(x * 3)
    return out

start = time.time()
slow_global()
t1= time.time() - start
print("Global lookup:", time.time() - start)

start = time.time()
fast_local()
t2= time.time() - start
print("Local variable bind:", time.time() - start)
print("Speedup percentage:", (t1 - t2) * 100 / t1)

Global lookup: 0.8975656032562256
Local variable bind: 0.7576634883880615
Speedup percentage: 15.586840043849875


### Generator vs List (Memory + Speed)

In [6]:
import time
n1=10_000_000
# SLOW: List (builds full memory)
start = time.time()
sum([i for i in range(n1)])
print("List:", time.time() - start)
t1=time.time() - start
# FAST: generator (streaming)
start = time.time()
sum(i for i in range(n1))
print("Generator:", time.time() - start)
t2=time.time() - start
print("Speedup percentage:", (t1 - t2) * 100 / t1)


List: 0.6271848678588867
Generator: 0.5285592079162598
Speedup percentage: 15.725133847587854


### Multiprocessing for CPU-heavy tasks

In [None]:
# import time
# import numpy as np
# from multiprocessing import Pool, cpu_count

# # --- Configuration ---
# # Choose a matrix size that makes the calculation long enough (e.g., 500x500)
# MATRIX_SIZE = 3000
# # Use all available CPU cores for maximum speed-up demonstration
# NUM_PROCESSES = cpu_count()
# print(f"Detected CPU cores (processes to use): {NUM_PROCESSES}")

# # --- The Computation Function ---
# def multiply_matrices(chunk):
#     """
#     Function to be executed by each process.
#     It takes a chunk of rows from matrix A and multiplies it 
#     by the entire matrix B.
#     """
#     A_chunk, B = chunk
#     # NumPy's matmul (@) is highly optimized for this
#     return A_chunk @ B

# # --- Main Execution Block ---
# if __name__ == "__main__":
#     # 1. Data Preparation (NumPy matrices are efficient)
#     print("Preparing data: Generating large matrices...")
#     A = np.random.rand(MATRIX_SIZE, MATRIX_SIZE)
#     B = np.random.rand(MATRIX_SIZE, MATRIX_SIZE)
    
#     # Calculate the number of rows each process should handle
#     rows_per_process = MATRIX_SIZE // NUM_PROCESSES
    
#     # Split Matrix A into chunks (rows)
#     chunks = []
#     for i in range(NUM_PROCESSES):
#         start_row = i * rows_per_process
#         # Handle the remainder in the last chunk
#         end_row = (i + 1) * rows_per_process if i < NUM_PROCESSES - 1 else MATRIX_SIZE
        
#         # Each item in the iterable must be the input for the worker function
#         # We pass the relevant rows of A and the whole matrix B
#         chunks.append((A[start_row:end_row, :], B))

#     # --- 2. Single-Process Execution (Baseline) ---
#     start_single = time.perf_counter()
#     # The multiplication is done directly in the main process
#     C_single = A @ B
#     end_single = time.perf_counter()
#     single_time = end_single - start_single
    
#     print("\n--- Single Process Results ---")
#     print(f"Matrix size: {MATRIX_SIZE}x{MATRIX_SIZE}")
#     print(f"Time Taken: {single_time:.4f} seconds")

#     # --- 3. Multi-Process Execution ---
#     start_multi = time.perf_counter()
    
#     # Use the Pool of worker processes
#     with Pool(processes=NUM_PROCESSES) as pool:
#         # map() distributes the chunks iterable to the workers
#         partial_results = pool.map(multiply_matrices, chunks)
        
#     # Reconstruct the final result matrix from the partial results
#     C_multi = np.concatenate(partial_results, axis=0)
    
#     end_multi = time.perf_counter()
#     multi_time = end_multi - start_multi
    
#     print("\n--- Multi Process Results ---")
#     print(f"Time Taken: {multi_time:.4f} seconds (using {NUM_PROCESSES} processes)")
    
#     # --- 4. Validation and Summary ---
#     # Check if the results are numerically close
#     is_correct = np.allclose(C_single, C_multi)
    
#     speed_up = single_time / multi_time
    
#     print(f"\nResult Validation: {'Passed' if is_correct else 'Failed'}")
#     print("--- Summary ---")
#     print(f"Speed-up Factor: **{speed_up:.2f}x**")
#     print(f"This demonstrates parallel processing overcoming the Python GIL on a CPU-bound task.")

Detected CPU cores (processes to use): 12
Preparing data: Generating large matrices...

--- Single Process Results ---
Matrix size: 3000x3000
Time Taken: 0.3543 seconds


### Numba JIT Compilation (Massive Speedup)

In [1]:
import time
from numba import njit

# SLOW pure python
def slow_compute(n):
    s = 0
    for i in range(n):
        s += i * 2
    return s

start = time.time()
slow_compute(50_000_000)
print("Python:", time.time() - start)
t1=time.time() - start
# FAST with JIT
@njit
def fast_compute(n):
    s = 0
    for i in range(n):
        s += i * 2
    return s

# First call compiles (slow)
fast_compute(1)

start = time.time()
fast_compute(50_000_000)
print("Numba JIT:", time.time() - start)
t2=time.time() - start
print("Speedup percentage:", (t1 - t2) * 100 / t1)


Python: 3.217660427093506
Numba JIT: 0.0
Speedup percentage: 100.0


### Using sets for O(1) lookup

In [4]:
import time

items = list(range(1_000_0000))
lookup = 999_99999

# SLOW: list search O(n)
start = time.time()
found = lookup in items
print("List lookup:", time.time() - start)
t1=time.time() - start
# FAST: set search O(1)
s = set(items)
start = time.time()
found = lookup in s
print("Set lookup:", time.time() - start)
t2=time.time() - start
print("Speedup percentage:", (t1 - t2) * 100 / t1)


List lookup: 0.08551478385925293
Set lookup: 0.0009996891021728516
Speedup percentage: 98.83097511674914


### Using join() instead of + for strings

In [5]:
import time

words = ["hello"] * 300000

# SLOW
start = time.time()
s = ""
for w in words:
    s += w  # string concatenation (expensive!)
print("String + concat:", time.time() - start)
t1=time.time() - start
# FAST
start = time.time()
"".join(words)
print("join():", time.time() - start)
t2=time.time() - start
print("Speedup percentage:", (t1 - t2) * 100 / t1)


String + concat: 0.14478397369384766
join(): 0.002000093460083008
Speedup percentage: 98.61856709064202
