In [33]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the pulsar-stars CSV (relative path, resolved against the kernel's
# working directory) and preview the first five rows.
csv_path = "asset-v1-ITMOUniversity+INTROMLADVML+fall_2023_ITMO_mag+type@asset+block@pulsar_stars_new.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)
data.head(5)

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC,TG
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [3]:
# Print a structural summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MIP     17898 non-null  float64
 1   STDIP   17898 non-null  float64
 2   EKIP    17898 non-null  float64
 3   SIP     17898 non-null  float64
 4   MC      17898 non-null  float64
 5   STDC    17898 non-null  float64
 6   EKC     17898 non-null  float64
 7   SC      17898 non-null  float64
 8   TG      17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every
# numeric column, including the 0/1 target TG.
data.describe()

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC,TG
count,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0
mean,111.079968,46.549532,0.477857,1.770279,12.6144,26.326515,8.303556,104.857709,0.091574
std,25.652935,6.843189,1.06404,6.167913,29.472897,19.470572,4.506092,106.51454,0.288432
min,5.8125,24.772042,-1.876011,-1.791886,0.213211,7.370432,-3.13927,-1.976976,0.0
25%,100.929688,42.376018,0.027098,-0.188572,1.923077,14.437332,5.781506,34.960504,0.0
50%,115.078125,46.947479,0.22324,0.19871,2.801839,18.461316,8.433515,83.064556,0.0
75%,127.085938,51.023202,0.473325,0.927783,5.464256,28.428104,10.702959,139.30933,0.0
max,192.617188,98.778911,8.069522,68.101622,223.392141,110.642211,34.539844,1191.000837,1.0


In [5]:
# Keep only the rows whose MIP lies in the closed interval [10, 100]
# (`between` includes both endpoints by default), then show how many
# rows survive the filter.
mip_in_range = data['MIP'].between(10, 100)
data = data[mip_in_range]
len(data)

4218

In [7]:
# Report the mean and the minimum of MIP on the filtered data, rounded to 3 decimals.
mip_column = data['MIP']
mip_mean = round(mip_column.mean(), 3)
mip_min = round(mip_column.min(), 3)
print(f'sample mean for the column MIP: {mip_mean}\nminimum value from the column MIP: {mip_min}')

sample mean for the column MIP: 77.053
minimum value from the column MIP: 10.008


In [8]:
# Order the rows by SIP, smallest value first (ascending is the default).
data = data.sort_values('SIP')

In [18]:
# Target vector and a full working copy of the frame. Note that X still
# contains the TG column; it is dropped only for the feature matrix that is
# handed to the splitter.
y = data['TG']
X = data.copy()

# 80/20 split with a fixed seed, stratified on the target labels.
features = X.drop(columns='TG')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=33,
    stratify=y,
)

In [11]:
# Largest STDC value found in the (still unscaled) training features, rounded to 3 decimals.
stdc_max = round(X_train['STDC'].max(), 3)
print(f'maximal value of the column STDC from the training sample: {stdc_max}')

maximal value of the column STDC from the training sample: 109.655


In [20]:
# Remember the feature names first: X_train is rebound to the scaler's output
# below, and the next cell needs the labels to rebuild a DataFrame.
columns = X_train.columns

# Learn the per-feature min/max on the training split only, then rescale it.
# NOTE: this cell expects the unscaled DataFrame from the split cell; it is
# not meant to be re-run on its own output.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [21]:
# Wrap the scaled training values back into a DataFrame carrying the original
# feature names (the row index restarts at 0), then preview the first rows.
X_train = pd.DataFrame(data=X_train, columns=columns)
X_train.head(5)

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC
0,0.53963,0.262537,0.321231,0.150966,0.09088,0.424381,0.149904,0.008344
1,0.996441,0.327414,0.031842,0.017488,0.007554,0.05382,0.391557,0.150499
2,0.66221,0.230554,0.11942,0.056781,0.009783,0.072543,0.348471,0.112724
3,0.990711,0.206057,0.001224,0.043113,0.010634,0.120162,0.309159,0.07077
4,0.839396,0.202019,0.081787,0.050389,0.004276,0.032344,0.489895,0.258204


In [22]:
# Rescale the test split with the min/max already learned from the training
# split (transform only, no refit), restore the column labels, and preview.
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=columns)
X_test.head(5)

Unnamed: 0,MIP,STDIP,EKIP,SIP,MC,STDC,EKC,SC
0,0.597969,0.249879,0.130854,0.057166,0.005402,0.051303,0.418484,0.170221
1,0.767428,0.214288,0.105393,0.053862,0.004073,0.054431,0.445064,0.180652
2,0.955465,0.207856,0.048451,0.046221,0.003611,0.046095,0.491063,0.231576
3,0.05747,0.154206,0.68738,0.486569,0.442514,0.532695,0.085714,0.001599
4,0.139422,0.092024,0.760695,0.61568,0.179061,0.502447,0.12537,0.00464


In [23]:
# Mean of the STDIP feature in the training split after min-max scaling,
# rounded to 3 decimals. The message now names the column it reports on
# (the original text had an empty gap where the column name belongs).
print(f'sample mean for column STDIP from the training sample (after normalization): {round(X_train.STDIP.mean(), 3)}')

sample mean for column  from the training sample (after normalization): 0.256


In [25]:
# Logistic regression with the library's default hyperparameters, trained on
# the scaled training features. The fit call stays as the last expression so
# the cell still displays the fitted estimator.
model_LGR = LogisticRegression()
model_LGR.fit(X=X_train, y=y_train)

In [28]:
# Predict labels for the held-out test split with the logistic model and
# tabulate them against the true labels (rows: actual class, columns: predicted).
pred = model_LGR.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[539,  18],
       [ 41, 246]])

In [29]:
# F1 score of the predictions currently held in `pred`, rounded to 3 decimals.
f1_value = f1_score(y_true=y_test, y_pred=pred)
print(f'f1-score: {round(f1_value, 3)}')

f1-score: 0.893


In [35]:
# k-nearest-neighbours classifier with k = 3, trained on the scaled training
# features. The fit call stays as the last expression so the cell still
# displays the fitted estimator.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=y_train)

In [36]:
# Predict labels for the held-out test split with the k-NN model (this rebinds
# `pred`, replacing the logistic-regression predictions) and tabulate them
# against the true labels (rows: actual class, columns: predicted).
pred = knn.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[540,  17],
       [ 29, 258]])

In [37]:
# F1 score of the predictions currently held in `pred`, rounded to 3 decimals.
f1_value = f1_score(y_true=y_test, y_pred=pred)
print(f'f1-score: {round(f1_value, 3)}')

f1-score: 0.918
