## Normalizer (L2 Norm) Scaling
### How it works: 
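Scales each sample (row) independently so that it has unit norm (by default the L2 norm, i.e., a Euclidean length of 1). Unlike column-wise scalers, it does not learn any statistics from the dataset.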
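#### Formula: L2 norm
For each row vector $x$: $x_{\text{scaled}} = \dfrac{x}{\lVert x \rVert_2}$, where $\lVert x \rVert_2 = \sqrt{\sum_i x_i^2}$.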
#### Purpose:

##### Used when the direction of the data matters more than the magnitude (e.g., text classification, clustering).
##### Less common in general machine learning tasks than the other two scaling methods.

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Path to the cleaned dataset CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced by a configurable/relative location.
cleaned_csv_path = "C:/Users/DilshodbekMX/PycharmProjects/Cyber/cleaned_data.csv"

# low_memory=False makes pandas read the file in one pass rather than in
# chunks, which avoids mixed-dtype guesses within a column.
df = pd.read_csv(cleaned_csv_path, low_memory=False)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,719755,43954,1498,52041,3679,443,6,9798,60563280,...,20,137317.5,163232.777429,470515.0,70629.0,9956533.0,106214.9,10007210.0,9742028.0,1
1,1,4935948,32886,1803,51857,1361,53,17,8551,1823,...,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,254968,11299,2788,34202,1308,443,6,3329,116047291,...,0,67765.083333,90368.037567,354722.0,41489.0,9602842.0,1386646.0,10018221.0,5199876.0,1
3,3,2350397,85811,1873,80,1999,37762,6,6669,4437256,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,3522931,1371,1067,52081,1590,3389,6,36034,1884423,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [16]:
import numpy as np
from sklearn.preprocessing import Normalizer

In [17]:
# Backup df so the unscaled frame remains available after normalization
df_copy = df.copy()

# Create a Normalizer object (not a MinMaxScaler): it rescales every row
# independently to unit norm (L2 by default) and learns nothing in fit().
normalizer = Normalizer()

# Normalize every column except the last one, which holds the 'Label' target.
# index=df.index keeps the result aligned with df; without it the new frame
# gets a fresh 0..n-1 index and the Label assignment below would misalign
# (or yield NaN) whenever df does not carry a default RangeIndex.
# NOTE(review): the leading 'Unnamed: *' columns and the identifier-like
# columns (Flow ID, Src IP, Timestamp, ...) are part of the row norm here and
# appear to dominate it for some rows -- confirm they are meant to be features.
normalized_data = pd.DataFrame(
    normalizer.fit_transform(df.iloc[:, :-1]),
    columns=df.columns[:-1],
    index=df.index,
)

# Add the (unscaled) Label column back to the normalized data
normalized_data['Label'] = df['Label']

# Display the normalized data
normalized_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.006649,0.000406,1.4e-05,0.000481,3.4e-05,4e-06,5.542772e-08,9.1e-05,0.559481,...,1.847591e-07,0.001269,0.001508,0.004347,0.000652,0.091978,0.000981,0.092446,0.089996,1
1,2.025488e-07,0.99977,0.006661,0.000365,0.010504,0.000276,1.1e-05,3.44333e-06,0.001732,0.000369,...,1.620391e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1.001782e-08,0.001277,5.7e-05,1.4e-05,0.000171,7e-06,2e-06,3.005347e-08,1.7e-05,0.581271,...,0.0,0.000339,0.000453,0.001777,0.000208,0.0481,0.006946,0.05018,0.026046,1
3,3.091427e-07,0.242203,0.008843,0.000193,8e-06,0.000206,0.003891,6.182854e-07,0.000687,0.457248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,7.810609e-07,0.687906,0.000268,0.000208,0.01017,0.00031,0.000662,1.171591e-06,0.007036,0.367962,...,3.905304e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [18]:
import numpy as np
# Flag, per column, whether at least one entry is +inf or -inf.
# The membership test runs once over the whole frame instead of once per column.
has_infinite = normalized_data.isin([np.inf, -np.inf]).any()

# Report the affected columns in their original column order.
for column_name in has_infinite[has_infinite].index:
    print(f"Column '{column_name}' contains infinite values.")

In [19]:
# Turn +inf / -inf entries into NaN, in place, so that a single NaN-based
# cleanup can handle them afterwards. Only exact infinities are replaced;
# large finite values are left untouched.
infinite_values = [np.inf, -np.inf]
normalized_data.replace(to_replace=infinite_values, value=np.nan, inplace=True)

In [20]:
# Flag, per column, whether any value is missing (this includes NaNs that
# were substituted for infinities).
nan_flags = normalized_data.isna().any()
for column_name in nan_flags[nan_flags].index:
    print(f"Column '{column_name}' contains NaN values.")

# Remove, in place, every row that has at least one NaN. This runs
# unconditionally; surviving rows keep their original index labels.
normalized_data.dropna(axis=0, how="any", inplace=True)
print("drop Nan")

# Summarize the remaining rows, columns and dtypes.
normalized_data.info()

drop Nan
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172009 entries, 0 to 172008
Data columns (total 78 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0.1       172009 non-null  float64
 1   Unnamed: 0         172009 non-null  float64
 2   Flow ID            172009 non-null  float64
 3   Src IP             172009 non-null  float64
 4   Src Port           172009 non-null  float64
 5   Dst IP             172009 non-null  float64
 6   Dst Port           172009 non-null  float64
 7   Protocol           172009 non-null  float64
 8   Timestamp          172009 non-null  float64
 9   Flow Duration      172009 non-null  float64
 10  Tot Fwd Pkts       172009 non-null  float64
 11  Tot Bwd Pkts       172009 non-null  float64
 12  TotLen Fwd Pkts    172009 non-null  float64
 13  TotLen Bwd Pkts    172009 non-null  float64
 14  Fwd Pkt Len Max    172009 non-null  float64
 15  Fwd Pkt Len Min    172009 non-null  float6

In [21]:
# Preview the first five rows of the normalized frame after the inf/NaN cleanup.
normalized_data.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.006649,0.000406,1.4e-05,0.000481,3.4e-05,4e-06,5.542772e-08,9.1e-05,0.559481,...,1.847591e-07,0.001269,0.001508,0.004347,0.000652,0.091978,0.000981,0.092446,0.089996,1
1,2.025488e-07,0.99977,0.006661,0.000365,0.010504,0.000276,1.1e-05,3.44333e-06,0.001732,0.000369,...,1.620391e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1.001782e-08,0.001277,5.7e-05,1.4e-05,0.000171,7e-06,2e-06,3.005347e-08,1.7e-05,0.581271,...,0.0,0.000339,0.000453,0.001777,0.000208,0.0481,0.006946,0.05018,0.026046,1
3,3.091427e-07,0.242203,0.008843,0.000193,8e-06,0.000206,0.003891,6.182854e-07,0.000687,0.457248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,7.810609e-07,0.687906,0.000268,0.000208,0.01017,0.00031,0.000662,1.171591e-06,0.007036,0.367962,...,3.905304e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [22]:
# Persist the normalized frame to the working directory.
# index=False keeps the row index out of the file: when the index is written,
# a later pd.read_csv() loads it back as an extra 'Unnamed: 0' feature column
# (presumably how the input file accumulated 'Unnamed: 0', 'Unnamed: 0.1', ...).
normalized_data.to_csv("NormalizerScaling.csv", index=False)