#### **Section 1: Data Loading**

In [6]:
#1.1 Data Loading
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from torch.utils.data import DataLoader, Dataset, random_split


# Set random seeds for reproducibility.
# Python, NumPy and PyTorch each keep their own RNG state, so seed all three.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Raw string: the backslashes of the Windows path must not be parsed as escape sequences.
df = pd.read_csv(r'F:\Abdillah\LAB\Matlab\FaultDetection\DariYoutube\Simulink\Result.csv')

# # Check for missing values and duplicates
# df.dropna(inplace=True)
# df.drop_duplicates(inplace=True)

# Display the first few rows of the dataset
# print("First few rows of the dataset:")
# print(df.head())

# Show number of columns and rows
# num_rows, num_columns = df.shape
# print(f"Number of rows: {num_rows}")
# print(f"Number of columns: {num_columns}")

df

ModuleNotFoundError: No module named 'matplotlib'

In [7]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   No                                240 non-null    int64  
 1   Phase-to-phase Voltage
(Vrms) kV  240 non-null    int64  
 2   Phase angle                       240 non-null    int64  
 3   Frequency                         240 non-null    int64  
 4   Fault ground
Resistance Ω         240 non-null    float64
 5   FaultLoc                          240 non-null    object 
 6   FaultType                         240 non-null    object 
 7   MaxCoefA                          240 non-null    float64
 8   MaxCoefB                          240 non-null    float64
 9   MaxCoefC                          240 non-null    float64
 10  MaxCoefGnd                        240 non-null    float64
 11  A                                 240 non-null    int64  
 12  B       

In [8]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(240, 15)

In [9]:
# First we want to determine the numbers and rates of faulty and normal observations in the data set.
# A row is "normal" (True) only when none of the four flags A, B, C and G is set;
# every flag must be checked, including A.
no_faults = (df[["A", "B", "C", "G"]] == 0).all(axis=1).value_counts()
no_faults

False    220
True      20
Name: count, dtype: int64

In [10]:
cmap = ["#3274a1", "#e1812c", "#3a923a", "#c03d3e", "#857aab", "#8d7866"]
# value_counts() orders its result by frequency, so the slice order would change with the data.
# Pin it to [False, True] (False = at least one flag set = faulty, True = normal) so the
# hard-coded labels and colors below always describe the right slice.
plt.pie(x = no_faults.reindex([False, True], fill_value=0),
        explode = [0.1, 0.1],
        labels = ["faulty", "normal"],
        colors = ["#c03d3e", "#3a923a"],
        autopct = "%.1f%%",
        shadow = True)
plt.title("General Fault Rate");

NameError: name 'plt' is not defined

In [11]:
# Second, separate the observations by the ground flag G
# (count of rows per distinct value of the G column).
gnd_faults = df.loc[:, "G"].value_counts()
gnd_faults

G
1    140
0    100
Name: count, dtype: int64

In [13]:

# value_counts() sorts by frequency, which put G == 1 first and attached the "phase_fault"
# label to the ground-fault slice. Fix the order to [0, 1] so that G == 0 is drawn as
# "phase_fault" and G == 1 as "gnd_fault".
# NOTE(review): the G == 0 slice also contains the fault-free rows — confirm this is intended.
plt.pie(x = gnd_faults.reindex([0, 1], fill_value=0),
        explode = [0.1, 0.1],
        labels = ["phase_fault", "gnd_fault"],
        colors = ["#857aab", "#8d7866"],
        autopct = "%.1f%%",
        shadow = True)
plt.title("Ground Faults");

NameError: name 'plt' is not defined

In [19]:
# Collapse the four flag columns into a single fault-type code, FType:
# the string forms of A, B, C and G concatenated in that order (e.g. "1101").
# It is intended to serve as the target column later on.
df["FType"] = df["A"].astype("str").str.cat(
    [df[flag].astype("str") for flag in ("B", "C", "G")]
)
df

Unnamed: 0,No,Phase-to-phase Voltage\n(Vrms) kV,Phase angle,Frequency,Fault ground\nResistance Ω,FaultLoc,FaultType,MaxCoefA,MaxCoefB,MaxCoefC,MaxCoefGnd,A,B,C,G,FType
0,1,250,0,60,0.1,Before transmission line,ABC-G,16097055.71,40725475.36,16097055.71,79893.30,1,1,1,1,1111
1,2,250,0,60,0.1,Before transmission line,ABC,16097055.71,40725475.36,16097055.71,0.01,1,1,1,0,1110
2,3,250,0,60,0.1,Before transmission line,AB-G,10803035.42,20464068.14,103.98,86421.48,1,1,0,1,1101
3,4,250,0,60,0.1,Before transmission line,AC-G,20261370.35,103.98,8624427.56,202614.07,1,0,1,1,1011
4,5,250,0,60,0.1,Before transmission line,BC-G,103.98,40725475.36,7176136.72,99363.73,0,1,1,1,0111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,236,250,0,60,1.0,After transmission line,B-C,115.72,594.60,735.53,0.01,0,1,1,0,0110
236,237,250,0,60,1.0,After transmission line,A-G,852.78,75.29,102.68,778.38,1,0,0,1,1001
237,238,250,0,60,1.0,After transmission line,B-G,114.87,421.49,107.07,659.07,0,1,0,1,0101
238,239,250,0,60,1.0,After transmission line,C-G,131.07,131.07,846.38,591.62,0,0,1,1,0011


#### **Section 2: Features Engineering**

1. Data Processing (Data Cleaning, Data Selection, Feature Selection)
   1. Data View
   2. Data Cleaning
   3. Data Selection
   4. Data Formatting
   5. Features and Label Selection
2. Splitting
3. Scaling (Normalization, Standardization, etc.)
4. Tensor Initialization

In [2]:
#2.1.1-2 Data View, Data Cleaning
# Remove the listed columns from the frame.
# `df` is rebound instead of mutated with inplace=True, and errors="ignore" skips columns
# that are already gone, so re-running this cell no longer raises KeyError.
df = df.drop(
    columns=["No", "Phase-to-phase Voltage\n(Vrms) kV", "Phase angle", "Frequency", "Fault ground\nResistance Ω"],
    errors="ignore",
)
df

Unnamed: 0,FaultLoc,FaultType,MaxCoefA,MaxCoefB,MaxCoefC,MaxCoefGnd,FaultA,FaultB,FaultC,FaultGnd
0,Before transmission line,ABC-G,16097055.71,40725475.36,16097055.71,79893.30,1,1,1,1
1,Before transmission line,ABC,16097055.71,40725475.36,16097055.71,0.01,1,1,1,0
2,Before transmission line,AB-G,10803035.42,20464068.14,103.98,86421.48,1,1,0,1
3,Before transmission line,AC-G,20261370.35,103.98,8624427.56,202614.07,1,0,1,1
4,Before transmission line,BC-G,103.98,40725475.36,7176136.72,99363.73,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
235,After transmission line,B-C,115.72,594.60,735.53,0.01,0,1,1,0
236,After transmission line,A-G,852.78,75.29,102.68,778.38,1,0,0,1
237,After transmission line,B-G,114.87,421.49,107.07,659.07,0,1,0,1
238,After transmission line,C-G,131.07,131.07,846.38,591.62,0,0,1,1


In [3]:
#2.1.1-2 Data View, Data Cleaning
# Columns treated as categorical; every other column of df is collected in `nums`.
cats = [
    "FaultLoc",
    "FaultType",
    "FaultA",
    "FaultB",
    "FaultC",
    "FaultGnd",
]
nums = [column for column in df if column not in cats]

In [4]:
#2.1.3 Data Selection, 2.1.4 Data Formatting

def _encode_label_column(frame, column):
    """Label-encode ``frame[column]`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column is overwritten with integer codes.
    column : str
        Name of the column to encode.

    Returns
    -------
    tuple
        ``(encoder, mapping)``: the fitted ``LabelEncoder`` and a dict
        ``{integer code: original label}``.
    """
    encoder = LabelEncoder()
    frame[column] = encoder.fit_transform(frame[column].values)
    # The encoder assigns code i to classes_[i], so enumerating classes_ yields the
    # code -> label mapping without a second transform() call.
    mapping = dict(enumerate(encoder.classes_))
    return encoder, mapping


# Encode categorical variables.
# NOTE: the original string columns are overwritten; re-running this cell without
# reloading df would fit the encoders on the integer codes instead of the labels.
le_FaultLoc, FaultLocMap = _encode_label_column(df, "FaultLoc")
print("FaultLoc:", FaultLocMap)

le_FaultType, FaultTypeMap = _encode_label_column(df, "FaultType")
print("FaultType:", FaultTypeMap)

# TODO(review): the columns "Target" and "Failure Type" do not appear in the frame shown
# above — adapt these lines to this dataset or remove them.
# # Prepare the multilabel targets
# y_multilabel = df[["Target", "Failure Type"]].values
# X = df.drop(columns=["Target", "Failure Type"]).values

KeyError: 'Type'

In [None]:

#2.1.3 Data Selection

In [None]:
#2.1.4 Data Formatting

In [None]:
#2.1.5 Feature and Target Selection

In [None]:
#2.2 Splitting the Data

In [None]:
#2.3 Scaling

#### **Section 3: Create Dataset & DataLoader**

In [None]:
#3.1 Create Dataset and DataLoader

#### **Section 4 : The Architecture**

In [None]:
#4.1 Network Architectures

#### **Section 5: Model Training**

1. Model Preparation
   1. Model
   2. Criterion
   3. Optimizer

2. Model Training
3. Training History Plot

In [None]:
#5.1 Model Preparation

In [None]:
#5.2 Model Training

In [None]:
#5.3 History of training and validation loss

In [None]:
#5.3 History of training and validation loss

#### **Section 6: Testing and Evaluation**

1. Load Best Model
2. Calculate Metrics Performance
3. Visualization

##### Classification Metrics

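##### 1. R-Square (Coefficient of Determination)
*Note: $R^2$ is a regression metric and is repeated under "Regression Metrics" below; for class-label predictions use metrics 2–5.*

$R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2}$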

- $y_i$: Actual values
- $\hat{y}_i$: Predicted values
- $\bar{y}$: Mean of the actual values

##### 2. Accuracy
Measures the ratio of correctly predicted instances out of the total instances:

$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$
- TP: True Positives
- TN: True Negatives
- FP: False Positives
- FN: False Negatives
  
##### 3. Precision
Ratio of true positive predictions to the total predicted positives:

$\text{Precision} = \frac{TP}{TP + FP}$

##### 4. Recall
Ratio of true positive predictions to the total actual positives:

$\text{Recall} = \frac{TP}{TP + FN}$

##### 5. F1 Score
Harmonic mean of Precision and Recall:

$F1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$

##### Regression Metrics

##### 1. R-Square (Coefficient of Determination)
$R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2}$

- $y_i$: Actual values
- $\hat{y}_i$: Predicted values
- $\bar{y}$: Mean of the actual values

##### 2. Mean Absolute Error (MAE)
Average of the absolute differences between actual and predicted values:

$\text{MAE} = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i|$

- $n$: Number of observations

##### 3. Mean Square Error (MSE)
Average of the squared differences between actual and predicted values:

$\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$

- $n$: Number of observations

In [None]:
#6.1-6.2 Load the best model, Calculate Metrics Performance

In [None]:
#6.3 Visualization

#### **Section 7: Saving Data Prediction to CSV**