In [1]:
# Pulsars are a rare type of Neutron star that produce radio emission detectable here on Earth. 
# They are of considerable scientific interest as probes of space-time, the inter-stellar medium, and states of matter. 
# Machine learning tools are now being used to automatically label pulsar candidates to facilitate rapid analysis. 
# Classification systems in particular are being widely adopted,which treat the candidate data sets as binary classification problems.

In [2]:
import pandas as pd

In [3]:
ps = pd.read_csv(r'F:\project\Pulsar.csv')

In [4]:
ps.head()

Unnamed: 0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [5]:
ps.shape

(17898, 9)

In [6]:
ps.isnull().sum()

Mean_Integrated         0
SD                      0
EK                      0
Skewness                0
Mean_DMSNR_Curve        0
SD_DMSNR_Curve          0
EK_DMSNR_Curve          0
Skewness_DMSNR_Curve    0
Class                   0
dtype: int64

In [53]:
ps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Mean_Integrated       17898 non-null  float64
 1   SD                    17898 non-null  float64
 2   EK                    17898 non-null  float64
 3   Skewness              17898 non-null  float64
 4   Mean_DMSNR_Curve      17898 non-null  float64
 5   SD_DMSNR_Curve        17898 non-null  float64
 6   EK_DMSNR_Curve        17898 non-null  float64
 7   Skewness_DMSNR_Curve  17898 non-null  float64
 8   Class                 17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [7]:
from sklearn.model_selection import train_test_split
train_ps , test_ps = train_test_split(ps,test_size=.3)

In [8]:

train_ps_x = train_ps.iloc[:,:-1]
train_ps_y = train_ps.iloc[:,-1]

test_ps_x = test_ps.iloc[:,:-1]
test_ps_y = test_ps.iloc[:,-1]

In [9]:
# # with over sampling
# ps1 = train_ps[train_ps.Class ==1]

In [10]:
# ps1.shape

In [11]:
# train_ps1 = pd.concat([train_ps,ps1,ps1,ps1,ps1,ps1])

In [12]:

# train_ps_x = train_ps1.iloc[:,:-1]
# train_ps_y = train_ps1.iloc[:,-1]

# using SMOT for class imbalance

In [13]:
#pip install imblearn

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
se = SMOTE()


In [16]:
se.fit(train_ps_x,train_ps_y)

In [17]:
psx_train , psy_train = se.fit_resample(train_ps_x,train_ps_y)

In [18]:
psy_train.value_counts()

0    11414
1    11414
Name: Class, dtype: int64

# Logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

In [20]:
lg.fit(psx_train , psy_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
pred_lg = lg.predict(test_ps_x)

In [22]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score,accuracy_score

In [23]:
confusion_matrix(test_ps_y,pred_lg)

array([[4705,  140],
       [  43,  482]], dtype=int64)

In [24]:
accuracy_score(test_ps_y,pred_lg)

0.9659217877094972

In [25]:
precision_score(test_ps_y,pred_lg)

0.77491961414791

In [26]:
recall_score(test_ps_y,pred_lg)

0.9180952380952381

# with feature sampling

In [61]:
lg.fit(train_pl_x , train_pl_y)

In [62]:
pred_lgf = lg.predict(test_pl_x)

In [63]:
confusion_matrix(test_pl_y,pred_lgf)

array([[3229,   27],
       [  70,  254]], dtype=int64)

In [64]:
accuracy_score(test_pl_y,pred_lgf)

0.9729050279329609

# decision tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [28]:
dt.fit(train_ps_x,train_ps_y)

In [29]:
pred_dt = dt.predict(test_ps_x)

In [30]:
confusion_matrix(test_ps_y,pred_dt)

array([[4773,   72],
       [  79,  446]], dtype=int64)

In [31]:
accuracy_score(test_ps_y,pred_dt)

0.9718808193668529

In [32]:
precision_score(test_ps_y,pred_dt)

0.861003861003861

In [33]:
recall_score(test_ps_y,pred_dt)

0.8495238095238096

# Feture selection

In [49]:
fe = pd.DataFrame()

In [50]:
fe['Columns'] = train_ps_x.columns
fe['Importance'] = dt.feature_importances_

In [52]:
fe.sort_values(by='Importance',ascending=False)

Unnamed: 0,Columns,Importance
2,EK,0.781404
5,SD_DMSNR_Curve,0.04273
4,Mean_DMSNR_Curve,0.037634
0,Mean_Integrated,0.034487
6,EK_DMSNR_Curve,0.029888
1,SD,0.026035
7,Skewness_DMSNR_Curve,0.025158
3,Skewness,0.022663


In [56]:
pl = ps[['EK','SD_DMSNR_Curve','Mean_DMSNR_Curve','Mean_Integrated','Class']]

In [57]:
train ,test = train_test_split(pl,test_size=.2)

In [58]:

train_pl_x = train.iloc[:,:-1]
train_pl_y = train.iloc[:,-1]

test_pl_x = test.iloc[:,:-1]
test_pl_y = test.iloc[:,-1]

# Random forest


In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [35]:
rf.fit(train_ps_x,train_ps_y)

In [36]:
pred_rf = rf.predict(test_ps_x)

In [37]:
confusion_matrix(test_ps_y,pred_rf)

array([[4821,   24],
       [  82,  443]], dtype=int64)

In [38]:
accuracy_score(test_ps_y,pred_rf)

0.9802607076350093

In [39]:
precision_score(test_ps_y,pred_rf)

0.9486081370449678

In [40]:
recall_score(test_ps_y,pred_rf)

0.8438095238095238

# KNN

In [41]:
from sklearn.neighbors import KNeighborsClassifier

In [42]:
kn = KNeighborsClassifier()

In [43]:
kn.fit(train_ps_x,train_ps_y)

In [44]:
pred_kn = kn.predict(test_ps_x)

In [45]:
confusion_matrix(test_ps_y,pred_kn)

array([[4800,   45],
       [ 107,  418]], dtype=int64)

In [46]:
accuracy_score(test_ps_y,pred_kn)

0.9716945996275606

In [47]:
precision_score(test_ps_y,pred_kn)

0.9028077753779697

In [48]:
recall_score(test_ps_y,pred_kn)

0.7961904761904762