In [2]:
import numpy as np
iris_data = np.loadtxt('data/irisdata.txt', comments="%")
x = iris_data[0:50, 0:4]

# checking data dimensions
n_feat = x.shape[1]
n_obs = x.shape[0]
print(f"Number of features: {n_feat} and number of observations: {n_obs}")


Number of features: 4 and number of observations: 50


In [3]:
# Exploring the data
# vectors of individual features
sepal_length = x[:, 0]
sepal_width = x[:, 1]
petal_length = x[:, 2]
petal_width = x[:, 3]

#computed variance
var_sepal_length = sepal_length.var(ddof=1)
var_sepal_width = sepal_width.var(ddof=1)
var_petal_length = petal_length.var(ddof=1)
var_petal_width = petal_width.var(ddof=1)

print(f"Variance of sepal length: {var_sepal_length}")
print(f"Variance of sepal width: {var_sepal_width}")
print(f"Variance of petal length: {var_petal_length}")
print(f"Variance of petal width: {var_petal_width}")

Variance of sepal length: 0.12424897959183677
Variance of sepal width: 0.1436897959183674
Variance of petal length: 0.030159183673469384
Variance of petal width: 0.01110612244897959


# Variance vs. Covariance

## 🔹 Variance
- **Definition:** A measure of how much a single variable spreads out around its mean.  
- **Formula (for a variable X):**
\[
\text{Var}(X) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})^2
\]
- **Intuition:**
  - If all values of \(X\) are close to the mean → low variance.
  - If they are spread out → high variance.
- **Example:** For people’s heights, variance tells you how much individual heights differ from the average height.

---

## 🔹 Covariance
- **Definition:** A measure of how two variables change together.  
- **Formula (for variables X and Y):**
\[
\text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})
\]
- **Intuition:**
  - **Positive covariance:** When \(X\) is above its mean, \(Y\) tends to be above its mean (they move together).
  - **Negative covariance:** When \(X\) is above its mean, \(Y\) tends to be below its mean (they move oppositely).
  - **Covariance ≈ 0:** No consistent relationship.

---

## 🔹 Relationship
- Variance is a **special case of covariance**:
\[
\text{Var}(X) = \text{Cov}(X, X)
\]

---

## 🔹 Visual intuition
- **Variance:** Spread along one axis.  
- **Covariance:** Whether clouds of points tilt together in 2D space.  

Example: Scatterplot of height vs. weight:  
- Taller people tend to weigh more → positive covariance.  
- Taller people tend to weigh less → negative covariance.  
- No relationship → covariance ≈ 0.  

In [7]:
# computed covariance between lepal length and sepal width
cov_sepal_length_width = np.cov(sepal_length, sepal_width, ddof=1)[0, 1]
print(f"Covariance between sepal length and sepal width: {cov_sepal_length_width}")     

#and using $$\sigma^2 = \frac{1}{N-1} \sum_i a_i b_i$$
cov_sepal_length_width_manual = np.sum((sepal_length - sepal_length.mean()) * (sepal_width - sepal_width.mean())) / (n_obs - 1)
print(f"Covariance between sepal length and sepal width (manual calculation): {cov_sepal_length_width_manual}")


Covariance between sepal length and sepal width: 0.09921632653061224
Covariance between sepal length and sepal width (manual calculation): 0.09921632653061223


In [None]:
#graphical exploration using seaborn and pandas
import seaborn as sns
import pandas as pd

# transform data to pandas dataframe
d = pd.DataFrame(x, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
                             
sns.pairplot(d)
plt.show()

NameError: name 'plt' is not defined