In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling as pp
import matplotlib.pyplot as plt
sns.set()
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# **Data Exploration**

In [2]:
df = pd.read_csv('/content/magic_gamma_telescope04_.csv')

In [3]:
df.head(10)

Unnamed: 0,flength,fwidth,fsize,fconc,fconc1,fsym,fm3long,fm3trans,falpha,dist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g
5,51.624,21.1502,2.9085,0.242,0.134,50.8761,43.1887,9.8145,3.613,238.098,g
6,48.2468,17.3565,3.0332,0.2529,0.1515,8.573,38.0957,10.5868,4.792,219.087,g
7,26.7897,13.7595,2.5521,0.4236,0.2174,29.6339,20.456,-2.9292,0.812,237.134,g
8,96.2327,46.5165,4.154,0.0779,0.039,110.355,85.0486,43.1844,4.854,248.226,g
9,46.7619,15.1993,2.5786,0.3377,0.1913,24.7548,43.8771,-6.6812,7.875,102.251,g


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19020 entries, 0 to 19019
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   flength   19020 non-null  float64
 1   fwidth    19020 non-null  float64
 2   fsize     19020 non-null  float64
 3   fconc     19020 non-null  float64
 4   fconc1    19020 non-null  float64
 5   fsym      19020 non-null  float64
 6   fm3long   19020 non-null  float64
 7   fm3trans  19020 non-null  float64
 8   falpha    19020 non-null  float64
 9   dist      19020 non-null  float64
 10  class     19020 non-null  object 
dtypes: float64(10), object(1)
memory usage: 1.6+ MB


In [5]:
x = df.drop(['class'], axis=1)
y = df['class']

In [6]:
sns.pairplot(x)
plt.show()

In [7]:
le = LabelEncoder()
y = le.fit_transform(y)

# **Train-Test Split**

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

# **Feature Scaling**

In [9]:
scaler = StandardScaler()
x_train, x_test = scaler.fit_transform(x_train), scaler.transform(x_test)

# **Model Building and Evaluation**

In [10]:
accuracies = {}

# **Random Forest**

In [11]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100, 
    criterion='entropy', 
    random_state=0
)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
sns.heatmap(confusion_matrix(y_test, y_pred), cmap='YlGnBu', annot=True)
plt.show()
accuracies['RandomForest'] = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracies['RandomForest']:.2f}%\n")
print(classification_report(y_test, y_pred, target_names=['gamma', 'hadron']))

Accuracy: 88.56%

              precision    recall  f1-score   support

       gamma       0.88      0.95      0.92      3079
      hadron       0.90      0.76      0.82      1676

    accuracy                           0.89      4755
   macro avg       0.89      0.86      0.87      4755
weighted avg       0.89      0.89      0.88      4755



# **Logistic Regression**

In [12]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=0)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
sns.heatmap(confusion_matrix(y_test, y_pred), cmap='YlGnBu', annot=True)
plt.show()
accuracies['LogReg'] = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracies['LogReg']:.2f}%\n")
print(classification_report(y_test, y_pred, target_names=['gamma', 'hadron']))

Accuracy: 78.84%

              precision    recall  f1-score   support

       gamma       0.80      0.89      0.85      3079
      hadron       0.75      0.59      0.66      1676

    accuracy                           0.79      4755
   macro avg       0.78      0.74      0.75      4755
weighted avg       0.78      0.79      0.78      4755



# **Support Vector Machine**

In [13]:
from sklearn.svm import SVC

model = SVC(C=57, random_state=0)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
sns.heatmap(confusion_matrix(y_test, y_pred), cmap='YlGnBu', annot=True)
plt.show()
accuracies['SVM'] = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracies['SVM']:.2f}%\n")
print(classification_report(y_test, y_pred, target_names=['gamma', 'hadron']))

Accuracy: 87.19%

              precision    recall  f1-score   support

       gamma       0.86      0.96      0.91      3079
      hadron       0.90      0.71      0.80      1676

    accuracy                           0.87      4755
   macro avg       0.88      0.84      0.85      4755
weighted avg       0.87      0.87      0.87      4755



# **Decision Tree**

In [14]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
sns.heatmap(confusion_matrix(y_test, y_pred), cmap='YlGnBu', annot=True)
plt.show()
accuracies['DT'] = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracies['DT']:.2f}%\n")
print(classification_report(y_test, y_pred, target_names=['gamma', 'hadron']))

Accuracy: 81.43%

              precision    recall  f1-score   support

       gamma       0.86      0.85      0.86      3079
      hadron       0.73      0.75      0.74      1676

    accuracy                           0.81      4755
   macro avg       0.80      0.80      0.80      4755
weighted avg       0.82      0.81      0.81      4755



# **Neural Network**

In [15]:
import tensorflow as tf

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [16]:
x_train_nn, x_cv_nn, y_train_nn, y_cv_nn = train_test_split(x_train, y_train, test_size=0.25, random_state=0)

In [17]:
num_epochs = 6
history = model.fit(
    x_train_nn, y_train_nn, epochs=num_epochs, 
    validation_data=(x_cv_nn, y_cv_nn),
    steps_per_epoch=x_train.shape[0] // num_epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=2, verbose=2)
    ]
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.


In [18]:
loss_train = history.history['loss']
loss_validation = history.history['val_loss']
epochs = range(1, num_epochs + 1)
plt.plot(epochs, loss_train, 'g', label='Training')
plt.plot(epochs, loss_validation, 'b', label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss')
plt.legend()
plt.show()

In [19]:
acc_train = history.history['accuracy']
acc_validation = history.history['val_accuracy']
epochs = range(1, num_epochs + 1)
plt.plot(epochs, acc_train, 'g', label='Training')
plt.plot(epochs, acc_validation, 'b', label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy')
plt.legend()
plt.show()

In [20]:
y_pred = model.predict(x_test)
y_pred = [np.argmax(y) for y in y_pred]
sns.heatmap(confusion_matrix(y_test, y_pred), cmap='YlGnBu', annot=True)
plt.show()
accuracies['NN'] = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracies['NN']:.2f}%\n")
print(classification_report(y_test, y_pred, target_names=['gamma', 'hadron']))

Accuracy: 87.15%

              precision    recall  f1-score   support

       gamma       0.88      0.93      0.90      3079
      hadron       0.86      0.76      0.81      1676

    accuracy                           0.87      4755
   macro avg       0.87      0.85      0.86      4755
weighted avg       0.87      0.87      0.87      4755



# **Results**

In [21]:
ax = sns.barplot(list(accuracies.keys()), list(accuracies.values()))
for p in ax.patches:
    ax.annotate(
        f'{p.get_height():2.2f}%', 
        (p.get_x() + p.get_width() / 2., p.get_height()), 
        ha = 'center', va = 'center', 
        xytext = (0, -20), textcoords = 'offset points'
    )
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.show()

