In [1]:
%pylab inline
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix

Populating the interactive namespace from numpy and matplotlib


In [2]:
full_df = pd.read_csv('full_ML_data.csv', index_col=0)
full_df = full_df[full_df.flux_75.notnull()]

df_shuffle = full_df.sample(frac = 1)

train_len = int(0.6*len(full_df))
val_len = int(0.8*len(full_df))

gal_train, gal_val, gal_test = df_shuffle[0:train_len], df_shuffle[train_len:val_len], df_shuffle[val_len:]

In [3]:
print(len(gal_train), len(gal_val), len(gal_test))
print(gal_train.tail(5))
print(gal_val.head(5))

1642 548 548
      galaxy_id  is_AGN    flux_75    flux_25    vmed_75    vmed_25  \
707   7975-1901     0.0  10.055858  10.055858  23.028712  23.028712   
1655  8549-6101     0.0   2.662383   2.662383 -32.041042 -32.041042   
1306  8602-3703     0.0  10.240530  10.240530 -39.168100 -39.168100   
932   8078-1902     0.0   0.593262   0.593262  30.768499  30.768499   
2221  8626-9101     0.0   0.482920   0.482920  16.158447  16.158447   

          W80_75      W80_25  mean_gauss  
707   258.201852  258.201852    1.410982  
1655  466.853035  466.853035    1.512821  
1306  203.075080  203.075080    1.307692  
932   403.286044  403.286044    1.457064  
2221  218.296459  218.296459    1.075085  
       galaxy_id  is_AGN   flux_75   flux_25     vmed_75     vmed_25  \
1930   8134-6103     0.0  1.593012  1.593012    9.655353    9.655353   
2134   8458-6104     0.0  1.664851  1.664851   18.523583   18.523583   
2047   8459-6104     0.0  3.061105  3.061105   11.541534   11.541534   
324   8616-127

In [4]:
X_train = gal_train.drop(['is_AGN','galaxy_id'],axis=1)
Y_train = gal_train['is_AGN']
X_val = gal_val.drop(['is_AGN','galaxy_id'],axis=1).copy()
Y_val = gal_val['is_AGN']

In [5]:
X_train

Unnamed: 0,flux_75,flux_25,vmed_75,vmed_25,W80_75,W80_25,mean_gauss
2009,1.131377,1.131377,26.722164,26.722164,207.472810,207.472810,1.155386
745,1.793358,1.793358,20.544222,20.544222,213.293092,213.293092,1.240088
1329,1.562498,1.562498,4.938704,4.938704,270.815564,270.815564,1.226244
2688,1.648742,1.648742,21.921141,21.921141,314.920246,314.920246,1.276438
2694,1.119096,1.119096,21.830843,21.830843,483.720008,483.720008,1.536913
1527,0.173014,0.173014,177.050053,177.050053,351.304579,351.304579,1.444444
2651,0.179047,0.179047,-47.303859,-47.303859,205.205762,205.205762,1.153409
227,1.013952,1.013952,25.609948,25.609948,205.033343,205.033343,1.130390
2457,0.346860,0.346860,33.086787,33.086787,367.245284,367.245284,1.335616
519,0.420932,0.420932,15.091239,15.091239,195.456500,195.456500,1.123077


In [6]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_val)
acc_log = round(logreg.score(X_val, Y_val) * 100, 2)
print(acc_log)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

91.06
0.9105839416058394


In [7]:
gal_train.columns.delete([0,8])

Index(['is_AGN', 'flux_75', 'flux_25', 'vmed_75', 'vmed_25', 'W80_75',
       'W80_25'],
      dtype='object')

In [8]:
coeff_df = pd.DataFrame(gal_train.columns.delete([0,1]))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
2,vmed_75,0.003026
3,vmed_25,0.003026
4,W80_75,0.001376
5,W80_25,0.001376
0,flux_75,0.000849
1,flux_25,0.000849
6,mean_gauss,-1.830811


In [9]:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_val)
acc_svc = round(svc.score(X_val, Y_val) * 100, 2)
print(acc_svc)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

90.88
0.9087591240875912


In [10]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_val)
acc_knn = round(knn.score(X_val, Y_val) * 100, 2)
print(acc_knn)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

89.05
0.8905109489051095


In [11]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_val)
acc_gaussian = round(gaussian.score(X_val, Y_val) * 100, 2)
print(acc_gaussian)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

90.15
0.9014598540145985


In [12]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_val)
acc_perceptron = round(perceptron.score(X_val, Y_val) * 100, 2)
print(acc_perceptron)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

87.04
0.8704379562043796




In [13]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_val)
acc_linear_svc = round(linear_svc.score(X_val, Y_val) * 100, 2)
print(acc_linear_svc)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

91.06
0.9105839416058394


In [14]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_val)
acc_sgd = round(sgd.score(X_val, Y_val) * 100, 2)
print(acc_sgd)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

88.87
0.8886861313868613




In [15]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_val)
acc_decision_tree = round(decision_tree.score(X_val, Y_val) * 100, 2)
print(acc_decision_tree)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

82.3
0.822992700729927


In [16]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_val)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_val, Y_val) * 100, 2)
print(acc_random_forest)
print(1 - sum(abs(Y_val-Y_pred))/len(gal_val))

90.15
0.9014598540145985


In [17]:
X_test =gal_test.drop(['is_AGN','galaxy_id'],axis=1).copy()
Y_test = gal_test['is_AGN']

Y_pred = logreg.predict(X_test)
print(1 - sum(abs(Y_test-Y_pred))/len(gal_test))
print(len(gal_test) - sum(abs(Y_test-Y_pred)))

0.8941605839416058
490.0


In [18]:
confusion_matrix(Y_test,Y_pred)

array([[490,   0],
       [ 58,   0]])

full_df = pd.read_csv('full_ML_data.csv', index_col=0)
full_df = full_df[full_df.flux_75.notnull()]

full_df.flux_75 /= max(full_df['flux_75'])
full_df.flux_25 /= max(full_df['flux_25'])
full_df.W80_75 /= max(full_df['W80_75'])
full_df.W80_25 /= max(full_df['W80_25'])
full_df.vmed_75 /= max(full_df['vmed_75'])
full_df.vmed_25 /= max(full_df['vmed_25'])
full_df.mean_gauss /= max(full_df['mean_gauss'])

df_shuffle = full_df.sample(frac = 1)

train_len = int(0.6*len(full_df))
val_len = int(0.8*len(full_df))

gal_train, gal_val, gal_test = df_shuffle[0:train_len], df_shuffle[train_len:val_len], df_shuffle[val_len:]