###  MinMax Scaling
#### How it works: Scales each feature to a specified range, typically 0 to 1.
##### Formula:
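$$X_{\text{scaled}} = \frac{X - X_{\min}}{X_{\max} - X_{\min}}$$

where $X_{\min}$ and $X_{\max}$ are the minimum and maximum of each feature (column), i.e. `X_scaled = (X - X.min()) / (X.max() - X.min())`.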
##### Purpose:
- Useful when you want all features to share the same scale while preserving the relationships between values within each feature.
- Good for algorithms that are sensitive to feature magnitudes (e.g., k-Nearest Neighbors, Neural Networks).
- Does not change the shape of the original distribution, because it is only a linear rescaling.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the cleaned dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists — consider a configurable data directory.
DATA_PATH = "C:/Users/DilshodbekMX/PycharmProjects/Cyber/cleaned_data.csv"

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,719755,43954,1498,52041,3679,443,6,9798,60563280,...,20,137317.5,163232.777429,470515.0,70629.0,9956533.0,106214.9,10007210.0,9742028.0,1
1,1,4935948,32886,1803,51857,1361,53,17,8551,1823,...,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,254968,11299,2788,34202,1308,443,6,3329,116047291,...,0,67765.083333,90368.037567,354722.0,41489.0,9602842.0,1386646.0,10018221.0,5199876.0,1
3,3,2350397,85811,1873,80,1999,37762,6,6669,4437256,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,3522931,1371,1067,52081,1590,3389,6,36034,1884423,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [14]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [22]:
# Keep an untouched copy of the raw frame.
df_copy = df.copy()

# Every column except the target is treated as a feature. Selecting by name
# (rather than "all but the last column") keeps the target out of the scaler
# regardless of where it sits in the frame.
# NOTE(review): this also scales identifier-like columns such as "Unnamed: 0",
# "Unnamed: 0.1" and "Flow ID" — confirm they are meant to be features.
feature_columns = df.columns.drop('Label')

# MinMaxScaler maps each feature linearly onto the requested range.
normalizer = MinMaxScaler(feature_range=(0, 1))

# fit_transform returns a bare array, so rebuild a DataFrame around it.
# Reusing df's index matters: the Label assignment below aligns on index, and
# a fresh 0..n-1 index would yield NaN/misaligned labels whenever df's index
# is not the default RangeIndex.
normalized_data = pd.DataFrame(
    normalizer.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)

# Re-attach the unscaled target column.
normalized_data['Label'] = df['Label']

# Display the normalized data
normalized_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.09108,0.328101,0.239987,0.794107,0.604403,0.00676,0.352941,0.246193,0.504694,...,0.454545,0.001253,0.002184,0.004293,0.000644,0.083017,0.001615,0.083439,0.081228,1
1,6e-06,0.624611,0.245482,0.28885,0.791299,0.223591,0.000809,1.0,0.21486,1.5e-05,...,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1.2e-05,0.032264,0.084343,0.446652,0.521897,0.214884,0.00676,0.352941,0.083647,0.967061,...,0.0,0.000618,0.001209,0.003237,0.000379,0.080068,0.021086,0.083531,0.043356,1
3,1.7e-05,0.297427,0.640548,0.300064,0.001221,0.328405,0.57622,0.352941,0.167571,0.036977,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2.3e-05,0.445803,0.010234,0.170939,0.794717,0.261212,0.051714,0.352941,0.905422,0.015704,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [26]:
import numpy as np

# Flag every cell that is +inf or -inf, then report the affected columns.
# NOTE(review): MinMaxScaler is expected to reject infinite inputs, so this
# check may never fire on its output — confirm whether it is still needed.
infinite_mask = normalized_data.isin([np.inf, -np.inf])
columns_with_inf = normalized_data.columns[infinite_mask.any()]
for col in columns_with_inf:
    print(f"Column '{col}' contains infinite values.")

In [27]:
# Turn +inf / -inf into NaN, in place. Only exact infinities are replaced;
# finite values, however large, are left as they are.
normalized_data.replace(
    to_replace=[np.inf, -np.inf],
    value=np.nan,
    inplace=True,
)

In [28]:
# Report each column that holds at least one NaN (this includes any values
# that were infinities before the replacement step).
has_nan = normalized_data.isna().any()
for col in normalized_data.columns[has_nan]:
    print(f"Column '{col}' contains NaN values.")

# Remove, in place, every row that has a NaN in any column.
normalized_data.dropna(axis=0, how="any", inplace=True)
print("drop Nan")

# Summarise the remaining rows, columns and dtypes.
normalized_data.info()

drop Nan
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172009 entries, 0 to 172008
Data columns (total 78 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0.1       172009 non-null  float64
 1   Unnamed: 0         172009 non-null  float64
 2   Flow ID            172009 non-null  float64
 3   Src IP             172009 non-null  float64
 4   Src Port           172009 non-null  float64
 5   Dst IP             172009 non-null  float64
 6   Dst Port           172009 non-null  float64
 7   Protocol           172009 non-null  float64
 8   Timestamp          172009 non-null  float64
 9   Flow Duration      172009 non-null  float64
 10  Tot Fwd Pkts       172009 non-null  float64
 11  Tot Bwd Pkts       172009 non-null  float64
 12  TotLen Fwd Pkts    172009 non-null  float64
 13  TotLen Bwd Pkts    172009 non-null  float64
 14  Fwd Pkt Len Max    172009 non-null  float64
 15  Fwd Pkt Len Min    172009 non-null  float6

In [29]:
# Preview the first five rows of the cleaned, scaled frame.
normalized_data.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.09108,0.328101,0.239987,0.794107,0.604403,0.00676,0.352941,0.246193,0.504694,...,0.454545,0.001253,0.002184,0.004293,0.000644,0.083017,0.001615,0.083439,0.081228,1
1,6e-06,0.624611,0.245482,0.28885,0.791299,0.223591,0.000809,1.0,0.21486,1.5e-05,...,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1.2e-05,0.032264,0.084343,0.446652,0.521897,0.214884,0.00676,0.352941,0.083647,0.967061,...,0.0,0.000618,0.001209,0.003237,0.000379,0.080068,0.021086,0.083531,0.043356,1
3,1.7e-05,0.297427,0.640548,0.300064,0.001221,0.328405,0.57622,0.352941,0.167571,0.036977,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2.3e-05,0.445803,0.010234,0.170939,0.794717,0.261212,0.051714,0.352941,0.905422,0.015704,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [30]:
# Persist the scaled dataset next to the notebook.
# index=False keeps the row index out of the file: a written index comes back
# from read_csv as an extra "Unnamed: 0" column. The "Unnamed: 0" and
# "Unnamed: 0.1" columns already present in this data look like leftovers of
# earlier exports of that kind (NOTE(review): confirm), and they end up being
# scaled as if they were features.
normalized_data.to_csv("MinMaxScaling.csv", index=False)