In [None]:
import pandas as pd
from urllib.parse import urlparse, unquote

# Load the raw dataset from disk.
df = pd.read_csv('../data/csic_database.csv')

# Show the column names exactly as they appear in the raw file.
print("Original columns:", list(df.columns))

# Step 0: Drop sparse columns, i.e. every column whose fraction of missing
# values is strictly greater than `threshold` (0.7 -> more than 70% missing).
# `columns_to_drop` is kept as a pandas Index because later steps test
# membership against it.
threshold = 0.7
missing_ratio = df.isna().mean()  # per-column fraction of NaN cells
columns_to_drop = missing_ratio.loc[missing_ratio > threshold].index
df = df.drop(columns=columns_to_drop)
print(f"Dropped columns with >{threshold*100}% missing values: {list(columns_to_drop)}")

# Step 1: Remove the 'Unnamed: 0' column when present (presumably a leftover
# index column from an earlier CSV export). errors='ignore' makes this a no-op
# if the column does not exist or was already dropped in Step 0.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Step 2: Determine normal host (for reference, not adding as a feature)
def get_host(url):
    """Return the network-location part (``host[:port]``) of *url*.

    The argument is converted with ``str()`` before parsing, so non-string
    values (``None``, NaN, numbers) are parsed as their string form. Input
    without a ``//`` authority section yields an empty string.
    """
    parsed = urlparse(str(url))
    return parsed.netloc
normal_host = df['URL'].apply(get_host).mode()[0]
print(f"Most common host (for reference): {normal_host}")

# Remember the raw row count for the final report. Nothing up to this point
# removes rows (only columns have been dropped), so len(df) still equals the
# number of rows in the source CSV -- no need to re-parse the file later just
# to count them. Keep this line ahead of any step that filters rows.
original_row_count = len(df)

# Step 3: Rename 'lenght' to 'length' if it exists and was not dropped
if 'lenght' in columns_to_drop:
    print("'lenght' column was dropped due to high missing values and cannot be renamed.")
elif 'lenght' in df.columns:
    df = df.rename(columns={'lenght': 'length'})
else:
    print("'lenght' column does not exist in the DataFrame.")

# Step 4: Standardize 'connection'
# Collapse the full header form "Connection: close" (any letter case) to the
# bare value "close", then trim surrounding whitespace.
if 'connection' in df.columns:
    df['connection'] = df['connection'].str.replace('Connection: close', 'close', case=False).str.strip()

# Step 5: Decode encoded data in URL
# Percent-decode every non-missing URL; missing values are passed through.
df['URL'] = df['URL'].apply(lambda x: unquote(str(x)) if pd.notna(x) else x)

# Step 6: Normalize remaining missing values
# Every remaining NaN, in every column, becomes an empty string.
df = df.fillna('')

# Step 7: Remove duplicates
# NOTE(review): no de-duplication is actually performed -- there is no code for
# this step, so cleaning never changes the row count. Add
# `df = df.drop_duplicates()` here if duplicate rows are meant to be removed.

# Step 8: Validate data types
if 'length' in df.columns:
    # Non-numeric values (including the '' placeholders from Step 6) are
    # coerced to NaN and then stored as 0.
    df['length'] = pd.to_numeric(df['length'], errors='coerce').fillna(0).astype(int)
else:
    print("'length' column does not exist, skipping validation for this column.")
# Raises if any 'classification' value cannot be converted to int.
df['classification'] = df['classification'].astype(int)

# Save cleaned dataset
df.to_csv('../data/fully_cleaned_dataset_no_anomaly.csv', index=False)
print(f"Cleaned dataset saved. Original rows: {original_row_count}, After cleaning: {len(df)}")

# Reload from disk so `df` reflects what was written.
# NOTE(review): with read_csv defaults, the empty strings written by Step 6 are
# parsed back as NaN, so the reloaded frame contains missing values again.
df = pd.read_csv('../data/fully_cleaned_dataset_no_anomaly.csv')
print("Cleaned dataset columns:", df.columns.tolist())

Original columns: ['Unnamed: 0', 'Method', 'User-Agent', 'Pragma', 'Cache-Control', 'Accept', 'Accept-encoding', 'Accept-charset', 'language', 'host', 'cookie', 'content-type', 'connection', 'lenght', 'content', 'classification', 'URL']
Dropped columns with >70.0% missing values: ['content-type', 'lenght', 'content']
Most common host (for reference): localhost:8080
'lenght' column was dropped due to high missing values and cannot be renamed.
'length' column does not exist, skipping validation for this column.
Cleaned dataset saved. Original rows: 61065, After cleaning: 61065
Cleaned dataset columns: ['Method', 'User-Agent', 'Pragma', 'Cache-Control', 'Accept', 'Accept-encoding', 'Accept-charset', 'language', 'host', 'cookie', 'connection', 'classification', 'URL']
