In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns

In this notebook you will work with astronomical observation data used to classify celestial objects. (Some missing values are given as $-9999$, and those are removed here.)

In [2]:
astro = pd.read_csv("star_classification.csv")
astro.replace(-9999, np.nan, inplace=True)
astro.dropna(inplace=True)
astro.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


## 1.
(3.1) Determine how many instances are in the frame for each different label in the *class* column.

In [None]:
class_counts = None
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
print(class_counts)

In [None]:
# TESTS
assert type(class_counts) == pd.Series, "result must be a series"
assert len(class_counts) == 3, "wrong length"
assert class_counts.sum() == len(astro["class"]), "wrong values"
print("OK")

In [None]:
# Intentionally left blank--do not delete

## 2.
(3.1) Let `X` be a subset frame with the columns named *u*, *g*, *r*, *i*, *z*. Let `y` be the column *class*.

In [3]:
X = None
y = None

features = ["alpha", "delta", "redshift"]

x = astro[features]
y = astro["cam_col"]


In [None]:
X.head(7)

In [None]:
y.head(7)

In [None]:
# TESTS
assert type(X)==pd.DataFrame, "result must be a frame"
assert X.shape[1]==5, "result must have 5 columns"
print("OK")

In [None]:
# Intentionally left blank--do not delete

## 3.
(3.4) Each row of `X` is a 5-dimensional vector. Let `v` be the difference between the second row and the first row of `X`. Find the 1-norm and 2-norm of `v`.

In [9]:
one_norm, two_norm = None, None

v = x.iloc[1]-x.iloc[0] 

v_norm_1 = np.linalg.norm(v, ord=1)
v_norm_2 = np.linalg.norm(v, ord=2)

one_norm = v_norm_1
two_norm = v_norm_2


print(one_norm)
print(two_norm)

10.501783291870794
9.219272414662132


In [10]:
print(f"Vector has 2-norm {two_norm:.5f} and 1-norm {one_norm:.5f}.")

Vector has 2-norm 9.21927 and 1-norm 10.50178.


In [None]:
# TESTS
assert 4 < two_norm < 5, "wrong value"
assert 8 < one_norm < 9, "wrong value"
print("OK")

## 4.
(3.2) Split the data into training set `X_train,y_train` with 80% of the data, and `X_test,y_test` for testing. 

**IMPORTANT**: Make sure the split order is randomized starting from random state 3383.

In [None]:
X_train, y_train, X_test, y_test = None,None,None,None

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, shuffle=True, random_state=5)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
# TESTS
assert np.isclose( X_train["u"].sum(), 1766124 ), "wrong rows in training features"
assert (y_test=="GALAXY").sum() == 11999, "wrong rows in test labels"
print("OK")

## 5.
(3.4) Train a kNN classifier with $k=11$ neighbors on the training set. Find its $F_1$ score on the test set using `"macro"` averaging.

In [None]:
knn_f1 = None

import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
knn = KNeighborsClassifier(n_neighbors=11)
knn_f1 = f1_score(y_test, yhat, average="macro")

In [None]:
print(f"F1 score for knn = {knn_f1:.6f}")

In [None]:
# TESTS
assert 0.77 < knn_f1 < 0.85, "wrong value"
print("OK")

In [None]:
# Intentionally left blank--do not delete

## 6.
(3.4) Display (as a plot) the confusion matrix on the test set for the classifier trained in step 5.

In [None]:
# Result is a plot

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

C = confusion_matrix(y_test, yhat)
ConfusionMatrixDisplay(C).plot()

## 7. 
(3.4) Repeat step 5 with a pipeline that adds a standardization scaler before the kNN classifier. (There will not be much change, since the original columns have similar summary statistics.)

In [None]:
pipe_f1 = None

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
print(f"F1 score for pipeline = {pipe_f1:.6f}")

In [None]:
# TESTS
assert 0.8 < pipe_f1 < 0.85, "wrong value"
print("OK")

In [None]:
# Intentionally left blank--do not delete

## 8. 
(3.4) Retrain the pipeline from step 7 starting with a feature frame `X` that also includes the *redshift* column. Display the confusion matrix on the test set.

In [None]:
# Result is a plot

# YOUR CODE HERE
raise NotImplementedError()