In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Pulsars are detected by scanning for radio frequencies from deep space. However, the vast majority of collected signals are actually terrestrial noise. The following dataset collects 8 statistical features from radio signals and their classifications as noise (class 0) or pulsar (class 1).

In [2]:
signals = pd.read_csv("pulsars.csv")
signals


Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis,class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


## 1.

Let `y` be the `class` column of the dataset and let `X` be all the remaining columns. Find the proportion of the samples that are actually pulsars.

In [3]:
X, y = None, None
pulsar_fraction = None
X = signals.drop(columns=['class'])
y = signals['class']

pulsar_fraction = (y == 1).mean()

pulsar_fraction

np.float64(0.09157447759526204)

In [4]:
X

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306
...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910


In [5]:
print(f"{pulsar_fraction:.1%} of the samples are pulsars.")

9.2% of the samples are pulsars.


In [6]:
# TESTS
assert X.shape == (17898, 8)
assert y.shape == (17898,)
assert 0 < pulsar_fraction < 1, "Pulsar fraction value is wrong"
print("OK")


OK


## 2.

Using a random state `19716`, split the data 80% / 20% into testing and training sets.

In [7]:
X_train, X_test = None, None
y_train, y_test = None, None
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=19716
)

In [8]:
X_train

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis
13749,119.492188,54.379453,0.120607,-0.519653,8.030936,36.662885,4.686481,21.277059
11637,117.046875,44.226487,0.379639,0.364920,2.249164,13.485213,9.933788,133.656497
14588,134.328125,42.006902,-0.223974,0.174592,2.209866,15.243517,9.729774,113.742010
15239,134.945312,46.698218,-0.135601,-0.069129,4.323579,24.283577,6.012013,37.614791
16246,129.796875,54.614467,0.038839,-0.221016,1.417224,14.110967,12.300153,171.274870
...,...,...,...,...,...,...,...,...
13600,109.960938,44.949521,0.253431,0.566471,1.885452,12.371997,10.834576,167.878306
4080,112.632812,52.333944,0.403222,0.023184,2.458194,13.678551,9.009352,117.517856
1775,98.359375,51.190626,1.057578,0.759414,134.582776,94.924652,-0.481317,-1.497141
17102,117.023438,64.912377,0.425613,-0.541779,2.830268,17.946514,8.996621,91.315831


In [9]:
# TESTS
assert X_train.shape == (14318, 8)
assert y_test.sum() == 326

## 3.
Fit a pipeline with standardization scaling and a kNN classifier with $k=8$ to the training data. Since we want to minimize false positives, find its precision score on the test set.

In [None]:
signals = pd.read_csv("pulsars.csv")

X = signals.drop(columns=['class'])
y = signals['class']

pulsar_fraction = (y == 1).mean()
print(f"Proportion of pulsars: {pulsar_fraction:.4f}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=19716
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=8))
])

pipeline.fit(X_train, y_train)

Proportion of pulsars: 0.0916


In [23]:
print(f"Test score: {knn_score:.2%}")

Test score: 97.74%


In [22]:
# TESTS
assert 0.9 < knn_score < 0.94
print("OK")


AssertionError: 

## 4.

(4.4) Perform a grid search on the pipeline defined in step 3, including a search over $k$ from 3 through 20 and over `'uniform'` and `'distance'` for `weights`, using 6 folds in cross-validation and precision as the scoring. Find the best parameters and find the precision score on the test set.

In [30]:
best_params = None
grid_score = None

from sklearn.model_selection import cross_val_score

# Cross-validate original model
original_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=8))
])

original_cv = cross_val_score(original_pipeline, X, y, cv=6, scoring='precision').mean()
grid_cv = grid.best_score_

# Get test precision from the best model
y_pred = grid.best_estimator_.predict(X_test)
grid_score = precision_score(y_test, y_pred)


In [31]:
print(f"Best parameters: {best_params}")
print(f"Test score: {grid_score:.2%}")

Best parameters: None
Test score: 93.68%


In [32]:
print(f"Original kNN precision: {knn_score:.4f}")
print(f"GridSearch precision:   {grid_score:.4f}")

Original kNN precision: 0.9774
GridSearch precision:   0.9368


In [33]:
# TESTS
assert type(best_params) == dict, "Get the best parameters from the fitted model"
assert grid_score > knn_score, "Score should have improved"
print("OK")


AssertionError: Get the best parameters from the fitted model

## 5.

(4.3) Using the best model from step 4, make a bagging classifier using 200 estimators, 50% max features and max samples, and random state `302`. Find its precision score.

In [35]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import precision_score

# Best model from GridSearchCV
best_model = grid.best_estimator_

# Bagging classifier (fixed for newer sklearn)
bagging = BaggingClassifier(
    estimator=best_model,
    n_estimators=200,
    max_samples=0.5,
    max_features=0.5,
    random_state=302,
    n_jobs=-1
)

# Fit the ensemble
bagging.fit(X_train, y_train)

# Predict and evaluate precision
y_pred_ensemble = bagging.predict(X_test)
ensemble_score = precision_score(y_test, y_pred_ensemble)

In [36]:
print(f"Test score: {ensemble_score:.2%}")

Test score: 95.22%


In [37]:
# TESTS
assert ensemble_score > grid_score, "Score should have improved"
print("OK")


OK
