In [6]:
import numpy as np
import pandas as pd

# Seed NumPy's global (legacy) random generator so every run draws the same numbers
np.random.seed(seed=42)



In [7]:
# Draw 10 integer ages; randint's upper bound is exclusive, so values lie in 18..59
ages = np.random.randint(low=18, high=60, size=10)

# Income follows a linear trend in age (20000 + 1000 per year) ...
base_income = 1000 * ages + 20000
# ... perturbed by Gaussian noise with mean 0 and standard deviation 5000
noise = np.random.normal(loc=0, scale=5000, size=10)
income = noise + base_income

# Blank out two entries (positions 2 and 7) to simulate missing data
income[[2, 7]] = np.nan

# Assemble the two columns into a DataFrame
df = pd.DataFrame(dict(age=ages, income=income))

print(df)


   age        income
0   56  77395.206461
1   46  71052.576424
2   32           NaN
3   25  42374.150964
4   38  55143.099171
5   56  71379.585811
6   36  42937.254937
7   40           NaN
8   28  52082.225405
9   28  40380.620012


In [8]:
income_col = df['income']

# (a) Arithmetic mean of income; pandas skips NaN entries by default
mean_income = income_col.mean()

# (b) Median of income (NaN entries are likewise skipped)
median_income = income_col.median()

# (c) Mean income weighted by age, restricted to rows with a known income:
#     sum(income * age) / sum(age)
valid = income_col.notna()
known_income = df.loc[valid, 'income']
known_age = df.loc[valid, 'age']
weighted_mean_income = (known_income * known_age).sum() / known_age.sum()

print("Mean income:", round(mean_income, 2))
print("Median income:", round(median_income, 2))
print("Age-weighted mean income:", round(weighted_mean_income, 2))


Mean income: 56593.09
Median income: 53612.66
Age-weighted mean income: 60349.2


**Problem 2:** Standardize income (z-score). Report how many incomes are outliers using the rule |z| > 3. Handle NaNs correctly (do not drop entire rows unnecessarily).

In [9]:
import numpy as np
import pandas as pd

# Rebuild the same 10-row dataset; re-seeding reproduces the earlier draws exactly
np.random.seed(42)
ages = np.random.randint(18, 60, size=10)
noise = np.random.normal(0, 5000, size=10)
base_income = 20000 + ages * 1000
income = base_income + noise
income[[2, 7]] = np.nan   # same two missing incomes as before

df = pd.DataFrame({'age': ages, 'income': income})

# --- Problem 2: Standardization (z-score) ---
# Mean and standard deviation are computed over the non-missing incomes only;
# no rows are dropped from the DataFrame.
income_col = df['income']
mean_income = income_col.mean(skipna=True)
std_income = income_col.std(skipna=True)

# z = (x - mean) / std; rows with a missing income keep a NaN z-score
df['income_z'] = income_col.sub(mean_income).div(std_income)

# Flag |z| > 3. Comparisons against NaN evaluate to False, so missing incomes
# are never counted as outliers.
# NOTE: pandas' std() defaults to the sample estimate (ddof=1); with only 8
# non-missing values |z| is bounded by (n - 1) / sqrt(n) ≈ 2.47, so this rule
# cannot flag anything on a sample this small.
outliers = df['income_z'].abs().gt(3)
n_outliers = outliers.sum()

print("Mean income:", round(mean_income, 2))
print("Std income:", round(std_income, 2))
print("Number of outliers (|z| > 3):", n_outliers)
print("\nOutlier rows:\n", df.loc[outliers])


Mean income: 56593.09
Std income: 14805.4
Number of outliers (|z| > 3): 0

Outlier rows:
 Empty DataFrame
Columns: [age, income, income_z]
Index: []


**Problem 3:**
Create age bins [18, 25), [25, 35), [35, 45), [45, 60) and compute for each bin:
- count of observations,
- mean income,
- median score.

Show the result as a tidy DataFrame sorted by age bin.

In [11]:
import numpy as np
import pandas as pd

# Rebuild the dataset with 15 rows and an extra 'score' column.
# The RNG is re-seeded, then drawn from in a fixed order: ages, income noise, scores.
np.random.seed(42)
ages = np.random.randint(18, 60, size=15)
noise = np.random.normal(0, 5000, size=15)
base_income = 20000 + ages * 1000
income = base_income + noise
income[[2, 7]] = np.nan   # two missing incomes

# Scores: normal draws (mean 70, std 15) clamped into the 0–100 range
scores = np.random.normal(70, 15, size=15).clip(0, 100)
scores[4] = np.nan   # one missing score

df = pd.DataFrame(dict(age=ages, income=income, score=scores))

print(df.head(10))


   age        income      score
0   56  73652.628070  48.628777
1   46  68712.800218  61.834259
2   32           NaN  71.663839
3   25  42671.351232  52.735096
4   38  59209.811358        NaN
5   56  66433.598777  60.990420
6   36  47375.410837  65.624594
7   40           NaN  60.974401
8   28  42935.844398  97.784173
9   28  49571.236663  69.797542


In [12]:
# Bin edges and labels for the half-open age intervals
# [18, 25), [25, 35), [35, 45), [45, 60)
bins = [18, 25, 35, 45, 60]
labels = ['18-25', '25-35', '35-45', '45-60']

# Assign each person to an age bin; right=False makes every bin include its
# left edge and exclude its right edge (so age 25 falls in '25-35').
df['age_bin'] = pd.cut(df['age'], bins=bins, right=False, labels=labels)

# Group by age_bin and calculate:
# - count of observations
# - mean income (ignoring NaNs)
# - median score (ignoring NaNs)
# 'age_bin' is categorical, so `observed` is passed explicitly: leaving it
# unset triggers a pandas FutureWarning because its default is changing.
# observed=False keeps the current behaviour (every defined bin gets a row,
# even one with no members). Groups come out in category (bin) order.
summary = df.groupby('age_bin', observed=False).agg(
    count_obs = ('age', 'count'),
    mean_income = ('income', 'mean'),
    median_score = ('score', 'median')
).reset_index()

# Round values for readability
summary['mean_income'] = summary['mean_income'].round(2)
summary['median_score'] = summary['median_score'].round(2)

print(summary)


  age_bin  count_obs  mean_income  median_score
0   18-25          1     40337.64         40.60
1   25-35          4     45059.48         70.73
2   35-45          5     55729.06         63.30
3   45-60          5     71813.15         60.99


  summary = df.groupby('age_bin').agg(


In [13]:
import numpy as np

# A nested sequence (one inner list per row) gives a 2-D array of shape (3, 3),
# not a flat 1-D vector
A = np.asarray([
    [4, 7, 2],
    [3, 5, 6],
    [1, 0, 8],
])

print("Matrix A:\n", A)


Matrix A:
 [[4 7 2]
 [3 5 6]
 [1 0 8]]


In [14]:
# Dimensions as a (rows, cols) tuple, followed by the total element count
print("Shape of A:", A.shape)
print("Size of A:", A.size)

# Transposing swaps the row and column axes
print("Transpose of A:\n", A.transpose())

# flatten() returns a 1-D copy of the elements in row-major (C) order
print("Flattened A:", A.flatten(order='C'))


Shape of A: (3, 3)
Size of A: 9
Transpose of A:
 [[4 3 1]
 [7 5 0]
 [2 6 8]]
Flattened A: [4 7 2 3 5 6 1 0 8]


In [15]:
# Negative indices count from the end: A[-1, -1] is the last row, last column
print("Last element A[-1, -1]:", A[-1, -1])

# Out-of-range integer indexing raises IndexError. (An out-of-range *slice*
# such as A[5:] does not raise; it just yields an empty array.)
# Only IndexError is caught so that any other, unexpected failure still surfaces.
try:
    print("Trying to access A[5]:", A[5])  # row index 5 doesn't exist
except IndexError as e:
    print("Error caught:", type(e).__name__, "-", e)


Last element A[-1, -1]: 8
Error caught: IndexError - index 5 is out of bounds for axis 0 with size 3


In [16]:
# Broadcasting: the length-3 vector is added to every row of the 3x3 matrix
vec = np.array((10, 20, 30))
print("Broadcasting A + vec:\n", A + vec)

# Matrix multiplication: (3x3) times (3x2) gives a (3x2) result
B = np.array([
    [2, 1],
    [0, 3],
    [4, 5],
])
dot_result = A @ B
print("Dot Product A·B:\n", dot_result)


Broadcasting A + vec:
 [[14 27 32]
 [13 25 36]
 [11 20 38]]
Dot Product A·B:
 [[16 35]
 [30 48]
 [34 41]]


In [17]:
import numpy.linalg as la

# Determinant (reported for reference)
det_A = la.det(A)
print("Determinant of A:", round(det_A, 2))

# Inverse (only if A is non-singular).
# Invertibility is decided from the numerical rank rather than by comparing
# |det| with a fixed cutoff: the determinant scales with the size of the
# entries, so a perfectly invertible matrix with small entries (e.g.
# 1e-5 * identity, det = 1e-15) would be misreported as singular by an
# absolute threshold such as 1e-12. matrix_rank uses a tolerance relative to
# the largest singular value, so it is not affected by overall scaling.
if la.matrix_rank(A) < A.shape[0]:
    print("Matrix A is singular (no inverse).")
else:
    inv_A = la.inv(A)
    print("Inverse of A:\n", inv_A)

    # Check: A · A⁻¹ ≈ Identity matrix (rounded to hide floating-point residue)
    print("A * A⁻¹ (should be Identity):\n", np.round(A.dot(inv_A), 2))


Determinant of A: 24.0
Inverse of A:
 [[ 1.66666667 -2.33333333  1.33333333]
 [-0.75        1.25       -0.75      ]
 [-0.20833333  0.29166667 -0.04166667]]
A * A⁻¹ (should be Identity):
 [[ 1. -0. -0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]
