## Exercise 8: Wine

In [2]:
import numpy as np

# 1. Load the dataset
data = np.genfromtxt("winequality-red.csv", 
                     delimiter=";", 
                     skip_header=1,   # exclude header
                     dtype=np.float32)

print("Data shape:", data.shape)  
print("Array size in bytes:", data.nbytes)  # should be 76800


Data shape: (1599, 12)
Array size in bytes: 76752


In [3]:
# Display the 2nd, 7th, and 12th rows
rows = data[[1, 6, 11], :]   # Python indexing starts at 0
rows = rows[~np.isnan(rows)] # remove NaN if any
print("Selected rows:\n", rows)


Selected rows:
 [7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
 9.968e-01 3.200e+00 6.800e-01 9.800e+00 5.000e+00 7.900e+00 6.000e-01
 6.000e-02 1.600e+00 6.900e-02 1.500e+01 5.900e+01 9.964e-01 3.300e+00
 4.600e-01 9.400e+00 5.000e+00 7.500e+00 5.000e-01 3.600e-01 6.100e+00
 7.100e-02 1.700e+01 1.020e+02 9.978e-01 3.350e+00 8.000e-01 1.050e+01
 5.000e+00]


In [4]:
# Any wine with alcohol > 20%
alcohol_column = data[:, -2]   # alcohol is the second last column
print("Any wine with alcohol > 20%:", np.any(alcohol_column > 20))


Any wine with alcohol > 20%: False


In [5]:
# Average alcohol percentage
avg_alcohol = np.nanmean(alcohol_column)  # ignores NaN
print("Average alcohol %:", avg_alcohol)


Average alcohol %: 10.422984


In [6]:
# Statistical measures for pH values
ph_column = data[:, 8]   # pH is the 9th column (index 8)

stats = {
    "min": np.nanmin(ph_column),
    "max": np.nanmax(ph_column),
    "25%": np.nanpercentile(ph_column, 25),
    "50% (median)": np.nanpercentile(ph_column, 50),
    "75%": np.nanpercentile(ph_column, 75),
    "mean": np.nanmean(ph_column)
}
print("pH stats:", stats)


pH stats: {'min': np.float32(2.74), 'max': np.float32(4.01), '25%': np.float32(3.21), '50% (median)': np.float32(3.31), '75%': np.float32(3.4), 'mean': np.float32(3.3111134)}


In [7]:
# Average quality score of wines with lowest 20% sulphates
sulphates = data[:, 9]   # sulphates is the 10th column (index 9)
quality = data[:, -1]    # quality is last column

threshold = np.percentile(sulphates, 20)  # 20th percentile
mask = sulphates < threshold
avg_quality_low_sulphates = np.mean(quality[mask])
print("Avg quality (lowest 20% sulphates):", avg_quality_low_sulphates)


Avg quality (lowest 20% sulphates): 5.1854305


In [9]:
# Mean of all variables for best vs worst quality wines
best_quality = np.max(quality)
worst_quality = np.min(quality)

mean_best = np.mean(data[quality == best_quality], axis=0)
mean_worst = np.mean(data[quality == worst_quality], axis=0)

print("Mean values for best quality wines:\n", mean_best)
print("\n")
print("Mean values for worst quality wines:\n", mean_worst)


Mean values for best quality wines:
 [ 8.566666    0.4233333   0.39111114  2.5777776   0.06844445 13.277778
 33.444443    0.99521226  3.2672222   0.76777774 12.094444    8.        ]


Mean values for worst quality wines:
 [ 8.359999    0.8845      0.17099999  2.6350002   0.12249999 11.
 24.9         0.997464    3.398       0.57000005  9.955       3.        ]
