In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, confusion_matrix

In [2]:
# Read the pulsar_stars_new CSV (expected in the working directory) and
# preview the first rows.
csv_path = 'asset-v1-ITMOUniversity+INTROMLADVML+fall_2023_ITMO_mag+type@asset+block@pulsar_stars_new.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC,TG
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [5]:
# Print the column dtypes and non-null counts of the raw frame
# (quick check for missing values and unexpected types).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MIP     17898 non-null  float64
 1   STDIP   17898 non-null  float64
 2   EKIP    17898 non-null  float64
 3   SIP     17898 non-null  float64
 4   MC      17898 non-null  float64
 5   STDC    17898 non-null  float64
 6   EKC     17898 non-null  float64
 7   SC      17898 non-null  float64
 8   TG      17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [14]:
# Keep only the rows whose MIP falls inside a class-specific closed interval
# (both bounds inclusive), then stack the two per-class subsets.
tg_is_0 = data['TG'] == 0
tg_is_1 = data['TG'] == 1
mip = data['MIP']

sample_1 = data[tg_is_0 & mip.between(59.203125, 173.7421875)]
sample_2 = data[tg_is_1 & mip.between(7.828125, 115.2421875)]

# Original row labels are kept, so the index of data_new is not contiguous.
data_new = pd.concat([sample_1, sample_2])
data_new.head()

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC,TG
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [30]:
# Print dtypes and non-null counts of the filtered frame to see how many
# rows survived the MIP filtering.
data_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17793 entries, 0 to 17876
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MIP     17793 non-null  float64
 1   STDIP   17793 non-null  float64
 2   EKIP    17793 non-null  float64
 3   SIP     17793 non-null  float64
 4   MC      17793 non-null  float64
 5   STDC    17793 non-null  float64
 6   EKC     17793 non-null  float64
 7   SC      17793 non-null  float64
 8   TG      17793 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.4 MB


In [15]:
# Mean of the SIP column over the filtered sample, rounded to 3 decimals.
sip_mean = data_new['SIP'].mean()
round(sip_mean, 3)

1.65

In [16]:
# Order the filtered rows by SIP, smallest first (ascending is the default).
data_new_sort = data_new.sort_values('SIP')
data_new_sort.head()

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC,TG
2137,135.859375,81.503042,0.023921,-1.791886,3.655518,19.638924,7.228962,63.401192,0
5291,101.242188,91.808628,0.290115,-1.781888,195.795987,63.624015,-1.85257,2.544999,0
3509,113.265625,98.778911,0.179404,-1.764717,145.361204,64.773789,-0.154935,-1.374633,0
10625,141.757812,91.206475,-0.056413,-1.755332,55.243311,75.793948,1.047847,-0.255939,0
370,107.539062,86.951396,-0.008928,-1.676724,211.948997,54.01031,-2.556795,5.768458,0


In [17]:
# Split the sorted frame into the target column (TG) and the feature matrix
# (every other column).
target_col = 'TG'
y = data_new_sort[target_col]
X = data_new_sort.drop(target_col, axis=1)

In [18]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
    stratify=y,
)

In [19]:
# Mean of the EKC feature on the training split, rounded to 3 decimals.
ekc_train_mean = X_train['EKC'].mean()
round(ekc_train_mean, 3)

8.362

In [20]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test-set statistics are
# used when learning the scaling.
scaler = MinMaxScaler()
features = X_train.columns
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# By default the scaler hands back plain NumPy arrays. Rebuild the DataFrames
# with the original column names AND the original row labels: without
# index=..., the frames would get a fresh 0..n-1 index and would no longer
# align by label with y_train / y_test in any pandas-side operation.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)

In [21]:
# Mean of the MC feature after min-max scaling (training split), 3 decimals.
mc_scaled_mean = X_train_scaled['MC'].mean()
round(mc_scaled_mean, 3)

0.053

In [22]:
# Baseline model: logistic regression trained on the scaled training split.
baseline = LogisticRegression(random_state=41).fit(X_train_scaled, y_train)

# Confusion matrix on the test split; for binary labels, ravel() yields the
# counts in the order tn, fp, fn, tp.
baseline_pred = baseline.predict(X_test_scaled)
baseline_cm = confusion_matrix(y_test, baseline_pred)
tn, fp, fn, tp = baseline_cm.ravel()
(tn, fp, fn, tp)

(3238, 6, 73, 241)

In [23]:
# F1 score of the logistic-regression baseline on the test split.
baseline_f1 = f1_score(y_test, baseline.predict(X_test_scaled))
print(f'f1 score for the test dataset: {round(baseline_f1, 3)}')

f1 score for the test dataset: 0.859


In [24]:
# k-nearest-neighbours classifier (k = 3) trained on the scaled training split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Confusion matrix on the test split; for binary labels, ravel() yields the
# counts in the order tn, fp, fn, tp.
knn_pred = knn.predict(X_test_scaled)
knn_cm = confusion_matrix(y_test, knn_pred)
tn, fp, fn, tp = knn_cm.ravel()
(tn, fp, fn, tp)

(3225, 19, 60, 254)

In [25]:
# F1 score of the kNN classifier on the test split.
knn_f1 = f1_score(y_test, knn.predict(X_test_scaled))
print(f'f1 score for the test dataset: {round(knn_f1, 3)}')

f1 score for the test dataset: 0.865
