# Importing libraries and dependencies

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

import time #Recording execution times

In [None]:
# Start of the wall-clock timer for the whole notebook run; the matching
# end timestamp is taken in the final cell and the difference is printed.
programstart = time.time()

In [None]:
# Load the solar flare spreadsheet, then report whether any cell in it is
# missing (True means at least one null value is present).
sdf = pd.read_excel('Solar_Flares_Dataset.xlsx')
sdf.isna().to_numpy().any()

In [None]:
# Clean up column names that carry stray non-breaking spaces (and, for one
# column, a leading '. ') so they can be referenced by their plain names.
sdf = sdf.rename(columns={
    '\xa0Activity': 'Activity',
    'Evolution\xa0': 'Evolution',
    'Previous 24 hour flare activity code\xa0': 'Previous 24 hour flare activity code',
    '. Area of the largest spot\xa0': 'Area of the largest spot',
    'M-class flares\xa0': 'M-class flares',
})

In [None]:
# Encode the three letter-coded sunspot columns as ordinal numbers.
#
# Each column lists its letter codes in rank order; the first letter becomes
# 1.0, the second 2.0, and so on.
#
# The encoded column is assigned back to the frame. The previous form,
# `sdf[col].replace(..., inplace=True)`, mutates a temporary Series: pandas
# deprecates it and it leaves `sdf` unchanged once Copy-on-Write is active.
# The codes are real floats rather than the strings '1.0', '2.0', ... so the
# columns are numeric.
letter_codes = {
    'modified Zurich class': 'ABCDEFH',
    'largest spot size': 'XRSAHK',
    'spot distribution': 'XOIC',
}

for column, letters in letter_codes.items():
    ranks = {letter: float(rank) for rank, letter in enumerate(letters, start=1)}
    # Values that are not listed pass through unchanged, exactly as they did
    # with the former chain of replace() calls.
    sdf[column] = sdf[column].map(lambda value, ranks=ranks: ranks.get(value, value))

In [None]:
#sdf.drop(index=sdf[sdf['X-class flares'] == 0].index, inplace=True) #Discards 0 values

In [None]:
# How often each value occurs in the three flare-count columns
for flare_column in ('C-Class Flares', 'M-class flares', 'X-class flares'):
    print(sdf[flare_column].value_counts())

In [None]:
# Split the frame into the label and the model inputs.
# y is the C-class flare column (the value to predict).
# X is every remaining column once the three flare-count columns are removed.
y = sdf['C-Class Flares']
X = sdf.drop(columns=['C-Class Flares', 'X-class flares', 'M-class flares'])

# Standardising the Data (Converting DF Attributes to Arrays)

In [None]:
# Start of the training timer. Everything executed between here and the
# `trainend` timestamp (scaling, the train/test split and the fit) is counted.
trainstart = time.time()

# Scaler instance; it is fitted on X in the next cell and reused later to
# transform the manually entered sample.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from X and apply them in a single step.
# The scaled features come back as an array rather than a DataFrame/list.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split further down, so test rows influence the scaling.
standardized_data = scaler.fit_transform(X)

In [None]:
# Displaying the standardized data (the cell's last expression is echoed)
standardized_data 

In [None]:
# X is the standardized feature matrix that is fed to the classifier
# y (assigned earlier) holds the labels the classifier learns to predict

X = standardized_data

In [None]:
# Show the features and the labels, separated by one blank line
print(X, '', y, sep='\n')

# Train, Test split

In [None]:
# Storing a train and test variable for both X and y (4 variables)
# test_size=0.4 holds out 40% of the rows for testing
# random_state=42 fixes the shuffle, so every run produces the same split
# NOTE(review): stratify=y is intentionally not passed; it raises an error
# when a class has fewer than two samples. Check the label counts printed
# earlier before enabling it.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
# Preview the training features as a table. X_train is a plain array after
# scaling, so the column captions are supplied by hand.
# NOTE(review): the list must match the column order of the frame the scaler
# was fitted on — confirm against the spreadsheet.
pd.DataFrame(X_train, columns = 
             ['modified Zurich class',
                 'largest spot size',
                 'spot distribution',
                 'Activity',
                 'Evolution',
                 'Previous 24 hour flare activity code',
                 'Historically-complex',
                 "Did region become historically complex on this pass across the sun's disk",
                 'Area',
                 'Area of the largest spot',
                 ])

In [None]:
# Preview the training labels as a one-column table
y_train.to_frame(name='C-Class Flares')

In [None]:
# Shapes, in order: the entire dataset, the part used for training and the
# part used for testing
print(*(split.shape for split in (X, X_train, X_test)))

# Model Training

In [None]:
# Support vector classifier with a linear kernel.
# Only the estimator is created here; it is trained in the next cell.
classifier = svm.SVC(kernel='linear') 

In [None]:
# Train the SVM on the training features and their matching labels
classifier.fit(X_train, y_train)

In [None]:
# End of the training timer; prints the seconds elapsed since `trainstart`
trainend = time.time()
print("Training execution time:", trainend - trainstart)

# Testing the model

In [None]:
# Start of the evaluation timer
teststart = time.time()

# Accuracy on the data the model was trained on: predict a label for every
# row of X_train and compare the predictions with the true labels in y_train.
# accuracy_score takes (y_true, y_pred), so the true labels are passed first.

X_train_prediction = classifier.predict(X_train)
training_accuracy = accuracy_score(y_train, X_train_prediction)

In [None]:
# Accuracy on the held-out test rows.
# accuracy_score takes (y_true, y_pred), so the true labels are passed first.
X_test_prediction = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_prediction)

In [None]:
# Report the accuracy on the training rows and on the held-out test rows
print('Accuracy score of training data: ', training_accuracy)
print('Accuracy score of testing data: ', test_accuracy)

In [None]:
# End of the evaluation timer; prints the seconds elapsed since `teststart`
testend = time.time()
print("Testing execution time:", testend - teststart)

# Creating Confusion Matrix Graph - Heat Map Representation

In [None]:
def plotting2(y_true, y_pred):
    """Draw the confusion matrix of a classifier as an annotated heat map.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model, in the same order as ``y_true``.

    Returns
    -------
    The object returned by ``seaborn.heatmap`` for the drawn matrix.
    """
    # Take the labels from the arguments (not from the notebook-level y_test)
    # and from both arrays, so the matrix and its row/column captions always
    # agree, even when a label occurs in only one of them.
    labels = unique_labels(y_true, y_pred)
    column = [f'Predicted {label}' for label in labels]
    indices = [f'Actual {label}' for label in labels]

    # Passing the same label list pins the row/column order of the matrix.
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    table = pd.DataFrame(matrix, columns=column, index=indices)

    return sns.heatmap(table, annot=True, fmt='d', cmap='cividis')

In [None]:
# Distinct label values that occur in the test labels
unique_labels(y_test)

In [None]:
# Confusion-matrix heat map: true test labels against the model's predictions
plotting2(y_test, X_test_prediction)

In [None]:
# Text report of the per-class metrics for the test-set predictions
print(classification_report(y_test, X_test_prediction))

# Predicting Solar Flares from Dataset Information

In [None]:
# Feature values of one sunspot region to classify.
# input_data order =
    #           1. 'modified Zurich class',
    #           2. 'largest spot size',
    #           3. 'spot distribution',
    #           4. 'Activity',
    #           5. 'Evolution',
    #           6. 'Previous 24 hour flare activity code',
    #           7. 'Historically-complex',
    #           8. "Did region become historically complex on this pass across the sun's disk",
    #           9. 'Area',
    #           10. 'Area of the largest spot'


input_data = (7.0,4.0,1.0,1,3,1,1,1,1,1)

# Changing input_data to a numpy array
numpy_array_input_data = np.asarray(input_data)

# Reshape to a single row: the scaler and the model expect 2-D input with
# one row per sample and one column per feature
reshaped_input_data = numpy_array_input_data.reshape(1, -1)

# The scaler was fitted on a DataFrame. Re-attach the column names it recorded
# (when it recorded any) so transform() does not warn about missing feature
# names; without recorded names the default integer captions are used.
input_frame = pd.DataFrame(
    reshaped_input_data,
    columns=getattr(scaler, 'feature_names_in_', None),
)

# Standardize the input data with the statistics learned during fitting
std_data = scaler.transform(input_frame)
print(std_data)

# Predicted class for the single sample (an array with one element)
prediction = classifier.predict(std_data)
print(prediction)

In [None]:
# A predicted class of 0 means no solar flare; any other class means a flare.
print('There are no solar flares' if prediction[0] == 0
      else 'There is a solar flare present')

In [None]:
# Repeat the accuracy figures computed earlier as a closing summary
print('Accuracy score of training data: ', training_accuracy)
print('Accuracy score of testing data: ', test_accuracy)

In [None]:
# End of the whole-run timer; prints the seconds elapsed since `programstart`
programend = time.time()
print("Program Execution Time:", programend - programstart)