In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [21]:
file_path = "original_data_ser.csv"
df = pd.read_csv(file_path)

In [22]:
print("Initial Dataset Info:\n")
df.info()
display(df.head())

Initial Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 10 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Current number of packets in queue      149250 non-null  float64
 1   Remaining space to store the packets    150000 non-null  int64  
 2   Current queue size in percentage (%)    150000 non-null  int64  
 3   Remaining queue size in percentage (%)  150000 non-null  int64  
 4   Old Average                             150000 non-null  object 
 5   Current Average                         150000 non-null  float64
 6   Prediction                              150000 non-null  int64  
 7   Queue Utilization Ratio                 150000 non-null  float64
 8   Packet Drop Probability                 150000 non-null  float64
 9   Congestion Level                        150000 non-null  object 
dtypes: float64(4), int64(

Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Remaining queue size in percentage (%),Old Average,Current Average,Prediction,Queue Utilization Ratio,Packet Drop Probability,Congestion Level
0,38.0,12,76,24,27.870661857309,28.158006,1,0.76,0.154205,High
1,28.0,22,56,44,2.73261646717438,2.692447,0,0.56,0.107559,Medium
2,14.0,36,28,72,21.6606174160322,21.892917,0,0.28,0.096912,Low
3,42.0,8,84,16,4.06378125743397,4.204988,1,0.84,0.102336,High
4,7.0,43,14,86,0.601011062510428,0.167125,0,0.14,0.149147,Low


In [23]:
df_original = df.copy()

In [24]:
print("\nStep 1: Handling Missing Values\n")



Step 1: Handling Missing Values



In [25]:
missing_rows = df[df['Current number of packets in queue'].isnull()]
print("Missing Rows (Before Dropping):\n")
display(missing_rows.head(5))


Missing Rows (Before Dropping):



Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Remaining queue size in percentage (%),Old Average,Current Average,Prediction,Queue Utilization Ratio,Packet Drop Probability,Congestion Level
51,,25,50,50,25.6145544668571,26.293161,0,0.5,0.125669,Medium
316,,33,34,66,17.9238416743728,18.613591,0,0.34,0.03918,Low
420,,6,88,12,6.88251951258622,6.352895,1,0.88,0.122765,High
492,,39,22,78,17.8450924570323,17.565701,0,0.22,0.074069,Low
520,,46,8,92,20.9511614589215,21.834688,0,0.08,0.019296,Low


In [26]:
df.dropna(subset=['Current number of packets in queue'], inplace=True)


In [27]:
print("Shape before dropping:", df_original.shape, "| Shape after dropping:", df.shape)


Shape before dropping: (150000, 10) | Shape after dropping: (149250, 10)


In [28]:
print("\nStep 2: Fixing Wrong Data Types\n")



Step 2: Fixing Wrong Data Types



In [29]:
print("Old Average before conversion (non-numeric sample):")
display(df_original['Old Average'].head(5))


Old Average before conversion (non-numeric sample):


0      27.870661857309
1     2.73261646717438
2     21.6606174160322
3     4.06378125743397
4    0.601011062510428
Name: Old Average, dtype: object

In [30]:
df['Old Average'] = pd.to_numeric(df['Old Average'], errors='coerce')


In [31]:
label_encoder = LabelEncoder()
df['Congestion Level'] = label_encoder.fit_transform(df['Congestion Level'])

In [32]:
print("Old Average after conversion:")
display(df['Old Average'].head(5))
print("\nEncoded 'Congestion Level' sample mapping:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


Old Average after conversion:


0    27.870662
1     2.732616
2    21.660617
3     4.063781
4     0.601011
Name: Old Average, dtype: float64


Encoded 'Congestion Level' sample mapping:
{'Hgh': 0, 'High': 1, 'Low': 2, 'Medium': 3}


In [33]:
print("\nStep 3: Removing Redundant Feature\n")



Step 3: Removing Redundant Feature



In [34]:
print("Before Dropping:")
display(df_original[['Current queue size in percentage (%)',
                     'Remaining queue size in percentage (%)']].head(5))

Before Dropping:


Unnamed: 0,Current queue size in percentage (%),Remaining queue size in percentage (%)
0,76,24
1,56,44
2,28,72
3,84,16
4,14,86


In [35]:
df.drop(columns=['Remaining queue size in percentage (%)'], inplace=True)


In [36]:
print("Columns after dropping:", df.columns.tolist())


Columns after dropping: ['Current number of packets in queue', 'Remaining space to store the packets', 'Current queue size in percentage (%)', 'Old Average', 'Current Average', 'Prediction', 'Queue Utilization Ratio', 'Packet Drop Probability', 'Congestion Level']


In [37]:
print("\nStep 4: Handling Outliers\n")



Step 4: Handling Outliers



In [38]:
def find_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] < lower_bound) | (data[column] > upper_bound)]


In [39]:
def remove_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

In [40]:
outlier_cols = [
    'Queue Utilization Ratio',
    'Packet Drop Probability',
    'Current number of packets in queue'
]

In [41]:

for col in outlier_cols:
    # Find outliers before removal
    outliers = find_outliers_iqr(df, col)
    print(f"\nOutliers detected in '{col}': {len(outliers)}")
    if len(outliers) > 0:
        display(outliers[[col]].head(5))  # Show first 5 outliers
    
    before = len(df)
    df = remove_outliers_iqr(df, col)
    after = len(df)
    print(f"Removed {before - after} rows for '{col}'")


Outliers detected in 'Queue Utilization Ratio': 0
Removed 0 rows for 'Queue Utilization Ratio'

Outliers detected in 'Packet Drop Probability': 517


Unnamed: 0,Packet Drop Probability
790,0.235489
833,0.235848
1091,0.280542
2388,0.272691
2735,0.287755


Removed 517 rows for 'Packet Drop Probability'

Outliers detected in 'Current number of packets in queue': 0
Removed 0 rows for 'Current number of packets in queue'


In [42]:
print("\nStep 5: Fixing Inconsistent Data\n")



Step 5: Fixing Inconsistent Data



In [43]:
invalid_ratio = df[(df['Queue Utilization Ratio'] < 0) | (df['Queue Utilization Ratio'] > 1)]
invalid_prob = df[(df['Packet Drop Probability'] < 0) | (df['Packet Drop Probability'] > 1)]


In [44]:
print("Invalid Queue Utilization Ratio rows (if any):")
display(invalid_ratio.head(5))

Invalid Queue Utilization Ratio rows (if any):


Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Old Average,Current Average,Prediction,Queue Utilization Ratio,Packet Drop Probability,Congestion Level


In [45]:
print("Invalid Packet Drop Probability rows (if any):")
display(invalid_prob.head(5))

Invalid Packet Drop Probability rows (if any):


Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Old Average,Current Average,Prediction,Queue Utilization Ratio,Packet Drop Probability,Congestion Level


In [46]:
df['Queue Utilization Ratio'] = df['Queue Utilization Ratio'].clip(0, 1)
df['Packet Drop Probability'] = df['Packet Drop Probability'].clip(0, 1)

In [47]:
print("Sample after clipping:")
display(df[['Queue Utilization Ratio', 'Packet Drop Probability']].head(5))


Sample after clipping:


Unnamed: 0,Queue Utilization Ratio,Packet Drop Probability
0,0.76,0.154205
1,0.56,0.107559
2,0.28,0.096912
3,0.84,0.102336
4,0.14,0.149147


In [48]:
print("\nStep 6: Scaling Numeric Features\n")


Step 6: Scaling Numeric Features



In [49]:

numeric_cols = [
    'Current number of packets in queue',
    'Remaining space to store the packets',
    'Current queue size in percentage (%)',
    'Old Average',
    'Current Average',
    'Queue Utilization Ratio',
    'Packet Drop Probability'
]

In [50]:
print("Before Scaling:")
display(df[numeric_cols].head(5))

Before Scaling:


Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Old Average,Current Average,Queue Utilization Ratio,Packet Drop Probability
0,38.0,12,76,27.870662,28.158006,0.76,0.154205
1,28.0,22,56,2.732616,2.692447,0.56,0.107559
2,14.0,36,28,21.660617,21.892917,0.28,0.096912
3,42.0,8,84,4.063781,4.204988,0.84,0.102336
4,7.0,43,14,0.601011,0.167125,0.14,0.149147


In [51]:
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [52]:
print("After Scaling:")
display(df[numeric_cols].head(5))

After Scaling:


Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Old Average,Current Average,Queue Utilization Ratio,Packet Drop Probability
0,0.76,0.309091,0.506667,0.929032,0.912312,0.76,0.655254
1,0.56,0.490909,0.373333,0.091086,0.115102,0.56,0.457043
2,0.28,0.745455,0.186667,0.722028,0.716181,0.28,0.411801
3,0.84,0.236364,0.56,0.135459,0.162453,0.84,0.43485
4,0.14,0.872727,0.093333,0.020032,0.036046,0.14,0.63376


In [53]:
print("\nStep 7: Handling Label Issues\n")


Step 7: Handling Label Issues



In [54]:
print("Unique values in Prediction:", df['Prediction'].unique())
print("Unique values in Congestion Level:", df['Congestion Level'].unique())

Unique values in Prediction: [1 0]
Unique values in Congestion Level: [1 3 2 0]


In [55]:
df.reset_index(drop=True, inplace=True)


In [56]:
print("\nFinal Dataset Shape:", df.shape)
print("Final Dataset Info:")
df.info()
display(df.head())


Final Dataset Shape: (148733, 9)
Final Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148733 entries, 0 to 148732
Data columns (total 9 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Current number of packets in queue    148733 non-null  float64
 1   Remaining space to store the packets  148733 non-null  float64
 2   Current queue size in percentage (%)  148733 non-null  float64
 3   Old Average                           147988 non-null  float64
 4   Current Average                       148733 non-null  float64
 5   Prediction                            148733 non-null  int64  
 6   Queue Utilization Ratio               148733 non-null  float64
 7   Packet Drop Probability               148733 non-null  float64
 8   Congestion Level                      148733 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 10.2 MB


Unnamed: 0,Current number of packets in queue,Remaining space to store the packets,Current queue size in percentage (%),Old Average,Current Average,Prediction,Queue Utilization Ratio,Packet Drop Probability,Congestion Level
0,0.76,0.309091,0.506667,0.929032,0.912312,1,0.76,0.655254,1
1,0.56,0.490909,0.373333,0.091086,0.115102,0,0.56,0.457043,3
2,0.28,0.745455,0.186667,0.722028,0.716181,0,0.28,0.411801,2
3,0.84,0.236364,0.56,0.135459,0.162453,1,0.84,0.43485,1
4,0.14,0.872727,0.093333,0.020032,0.036046,0,0.14,0.63376,2


In [57]:
output_path = "cleaned_dataset.csv"
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved as: {output_path}")


Cleaned dataset saved as: cleaned_dataset.csv


In [58]:
print(f"\nThank You")


Thank You


In [59]:
import os
print("Current Working Directory:", os.getcwd())

Current Working Directory: /home/ankit


In [60]:
rows, cols = df.shape
print(f"Number of rows: {rows}")
print(f"Number of columns: {cols}")

Number of rows: 148733
Number of columns: 9
