In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

### Import the data from database in "output" directory

Here, ns-maps are "no-signal maps" and ws-maps are "with signal maps".

The shape is as follows: (samples, features), so that if we have (10,200), it means we have 10 samples with 200 features. Here, the features are the pixel values, and since each map is 400x400 pixels, there are 160000 features. The number of samples is chosen through the argument `n` of the get_data() function. 

get_data(n) should then output X,y where X.shape = $(2n, 160000)$ and y.shape = $(2n,)$. 

In [47]:
def get_data(n, amplitude=1, dataset='dataset_1', data_dir='../data', random_state=0):
    '''
    returns the maps with and without cosmic string signal with appropriate labels (index matching).
    
    Parameters:
    --------------
    n : number of maps to be fetched for each type of map (thus it returns 2*n maps)
    
    amplitude : multiplicative factor used to augment the cosmic string signal
    
    dataset : the dataset (sub-directory of data_dir) from which the data is pulled
    
    data_dir : directory holding the dataset folders; the default is the previously hard-coded "../data"
    
    random_state : seed forwarded to sklearn.utils.shuffle; the default is the previously hard-coded 0
    
    Returns:
    --------------
    X : array holding the 2*n flattened maps, one map per row
    
    y : array of 2*n labels (0 = no signal, 1 = with signal), shuffled consistently with the rows of X
    
    '''
    def load_flat(prefix):
        # files are named <prefix>0.npy ... <prefix>(n-1).npy; every map is flattened into one row
        return np.array([np.load(f"{prefix}{i}.npy").flatten() for i in range(n)])
    
    #load the data
    path_ns = f"{data_dir}/{dataset}/ns-maps/"
    path_ws = f"{data_dir}/{dataset}/ws-maps/"
    
    #no signal maps (y=0), one flattened map per row
    arr_ns = load_flat(path_ns + "ns-map")
    
    #the with-signal maps are stored as separate noise and signal components
    arr_ws_noise = load_flat(path_ws + "noise/noise")
    arr_ws_signal = load_flat(path_ws + "signal/signal")
    
    #combine signal and noise; only the signal part is scaled by amplitude. with signal maps (y=1)
    arr_ws = amplitude*arr_ws_signal + arr_ws_noise
    
    #labels: n zeros for the no-signal maps followed by n ones for the with-signal maps
    y = np.concatenate((np.zeros(n), np.ones(n)))
    
    #the first n rows contain no signal and the last n do, matching the label order above
    X = np.concatenate((arr_ns, arr_ws), axis=0)
    
    #we return a shuffled version of the data (X and y are permuted together)
    return shuffle(X, y, random_state=random_state)
    

### Applying machine learning

In [105]:
def model_check_reg(n=100, n_splits=5, n_repeats=3, amplitude=10, dataset='dataset_1'):
    '''
    fits a standardised logistic regression model to the data using repeated k-fold
    cross validation and returns the scores together with their mean and deviation.
    
    Parameters:
    --------------
    n : number of maps of each type, passed to get_data
    
    n_splits : number of folds in each repetition of the k-fold split
    
    n_repeats : number of times the k-fold split is repeated
    
    amplitude : multiplicative factor for the cosmic string signal, passed to get_data
    
    dataset : the dataset from which the data is pulled, passed to get_data
    
    Returns:
    --------------
    scores : array of accuracy scores returned by cross_val_score
    
    mean : np.mean of the scores
    
    std : np.std of the scores
    
    '''
    X, y = get_data(n, amplitude=amplitude, dataset=dataset)
    
    # the scaler is part of the pipeline, so it is fitted together with the model
    # by cross_val_score rather than once on the full data set
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', LogisticRegression()),
    ])
    
    # fixed random_state keeps the fold assignment the same from run to run
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)

    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    
    return scores, np.mean(scores), np.std(scores)

In [107]:
# 100 maps per class, signal scaled by 100; n_splits and n_repeats keep their defaults (5 and 3)
model_check_reg(n=100, amplitude=100)

(array([0.9  , 0.9  , 0.975, 0.75 , 0.8  , 0.825, 0.875, 0.95 , 0.85 ,
        0.85 , 0.9  , 0.9  , 0.85 , 0.925, 0.925]),
 0.8783333333333335,
 0.05691123692987958)

In [108]:
# 80 maps per class, signal scaled by 100; n_splits and n_repeats keep their defaults (5 and 3)
model_check_reg(n=80, amplitude=100)

(array([0.875  , 0.84375, 0.78125, 0.78125, 0.90625, 0.75   , 0.84375,
        0.9375 , 0.84375, 0.84375, 0.6875 , 0.84375, 0.875  , 0.96875,
        0.65625]),
 0.8291666666666667,
 0.08296794896558284)

In [109]:
# 40 maps per class, signal scaled by 100; n_splits and n_repeats keep their defaults (5 and 3)
model_check_reg(n=40, amplitude=100)

(array([1.    , 0.8125, 0.9375, 0.5   , 0.9375, 0.875 , 0.75  , 0.9375,
        0.75  , 0.75  , 0.9375, 0.75  , 0.875 , 0.9375, 0.8125]),
 0.8375,
 0.12247448713915891)