# icc-MAGIC-Example

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [40]:
# Column names, assigned positionally when the data file is read.
# The last entry ("class") is the label column; the spellings below are the
# exact keys used to index the frame, so they are kept unchanged.
cols = [
    "flength",
    "fwidtth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fM3long",
    "fM3Trans",
    "fAlpha",
    "fdist",
    "class",
]

In [41]:
# CSV = comma-separated values. Column names are taken from `cols` rather than
# from the file, so the first line of the file is read as data.
df = pd.read_csv("magic04.data", header=None, names=cols)

# Print the first five rows.
print(df.head(5))

# Prints all the rows whose class is labelled 0 (only matches once the class
# column has been converted from letters to integers):
# print(df[df["class"]==0])

    flength   fwidtth   fSize   fConc  fConc1     fAsym  fM3long  fM3Trans  \
0   28.7967   16.0021  2.6449  0.3918  0.1982   27.7004  22.0110   -8.2027   
1   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238   -9.9574   
2  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580  -45.2160   
3   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633   -7.1513   
4   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525   21.8393   

    fAlpha     fdist class  
0  40.0920   81.8828     g  
1   6.3609  205.2610     g  
2  76.9600  256.7880     g  
3  10.4490  116.7370     g  
4   4.6480  356.4620     g  


In [42]:
# Encode the label as an integer so the models can work with it:
# "g" becomes 1 and every other label becomes 0.
# The dtype guard makes this cell safe to re-run: without it, a second run
# would compare the already-encoded integers with "g" and set every label to 0.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)


In [43]:
# # for each feature column (every column except the last one, "class")
# for label in cols[:-1]:
    
#     plt.figure()
#     plt.hist(df[df["class"]==1][label], color='blue', label='gamma', alpha=0.7,density=True)
#     plt.hist(df[df["class"]==0][label], color='red', label='hadron', alpha=0.7,density=True)
#     plt.title(label)
#     plt.ylabel("Probability")
#     plt.xlabel(label)
#     plt.legend()
   
# plt.show()   

In [44]:
# CREATE TRAIN, VALIDATION AND TEST DATASETS
# df.sample(frac=1) returns every row in shuffled order (frac=0.5 would return
# a random half of the rows). random_state pins the shuffle so that re-running
# the notebook reproduces the same split.
# The shuffled frame is cut by position instead of with np.split, because
# np.split on a DataFrame goes through the deprecated DataFrame.swapaxes.
#   rows   0% - 60%  -> train
#   rows  60% - 80%  -> valid
#   rows  80% - 100% -> test
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# john = train[train.columns[0:-1]].values

# scaler = StandardScaler()
# john = scaler.fit_transform(john) #scales to unit variance 
 
# print(john)

In [45]:
# Standardise the feature columns so that they are on a comparable scale.
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
    """Split a frame into scaled features and labels, optionally oversampling.

    Every column except the last is treated as a feature; the last column is
    the label.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column holds the labels.
    oversample : bool, default False
        When True, resample with RandomOverSampler after scaling.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is applied with
        ``transform`` only, so validation/test data can be scaled with the
        statistics learned from the training data. When omitted, a new
        StandardScaler is fitted on ``dataframe`` itself (the original
        behaviour).
    random_state : int, optional
        Passed to RandomOverSampler to make the resampling reproducible.

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the labels appended as the final column.
    X : numpy.ndarray
        Scaled (and possibly resampled) features.
    y : numpy.ndarray
        Labels matching the rows of ``X``.
    """
    X = dataframe[dataframe.columns[:-1]].values  # all columns except the last
    y = dataframe[dataframe.columns[-1]].values   # the last column (labels)

    if scaler is None:
        # No fitted scaler supplied: learn the scaling from this frame.
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    else:
        # Reuse the supplied statistics instead of re-fitting on this frame.
        X = scaler.transform(X)

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X, y = ros.fit_resample(X, y)

    # Put the labels back as a single trailing column next to the features.
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

In [46]:

# Scale each split and separate features from labels. Only the training split
# is oversampled; validation and test are left as they are.
# Each name on the left is rebound: train/valid/test stop being DataFrames and
# become the combined arrays returned by scale_dataset.
# NOTE(review): each call fits its own scaler on the split it is given, so
# valid/test are not scaled with the training statistics - confirm intended.
train, X_train, Y_train = scale_dataset(train, oversample=True)
valid, X_valid, Y_valid = scale_dataset(valid, oversample=False)
test, X_test, Y_test = scale_dataset(test, oversample=False)

# kNN

In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [65]:
# Build a k-nearest-neighbours classifier that uses 5 neighbours and fit it on
# the scaled, oversampled training features and labels.
knn_model = KNeighborsClassifier(n_neighbors = 5)
knn_model.fit(X_train, Y_train)

In [69]:
# Predict labels for the test features, then print precision, recall, F1 and
# support per class by comparing the predictions with the true test labels.
y_pred = knn_model.predict(X_test)
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.72      0.74      1306
           1       0.86      0.89      0.87      2498

    accuracy                           0.83      3804
   macro avg       0.81      0.80      0.81      3804
weighted avg       0.83      0.83      0.83      3804



[1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
