In [None]:
pip install seaborn polars pandas numpy scikit-learn ydata_profiling -U -q #u = update , q = reduce output displayed

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
import polars.selectors as cs
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ydata_profiling import ProfileReport


# Walk the Kaggle input mount recursively and print the full path of every
# file found, one path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV into a Polars DataFrame; later cells work on `df`.
train_csv_path = "/kaggle/input/csai-253-project-phase-2/train.csv"
df = pl.read_csv(train_csv_path)

In [None]:
"""
Id: Unique identifier for each row in the test set.
flow_time: Duration of network flow.
header_size: Size of the packet headers.
packet_duration: Duration of individual packets.
overall_rate: Overall data transfer rate.
src_rate: Data transfer rate from the source.
dst_rate: Data transfer rate towards the destination.
fin_packets: Number of packets flagged as finished (FIN).
urg_packets: Number of urgent packets (URG).
rst_packets: Number of reset packets (RST).
max_value: Maximum value observed in packet data.
syn_flags: Number of SYN (synchronize) flags detected.
rst_flags: Number of RST (reset) flags detected.
psh_flags: Number of PSH (push) flags detected.
ack_flags: Number of ACK (acknowledge) flags detected.
protocol_http: Indicates HTTP protocol usage.
protocol_https: Indicates HTTPS protocol usage.
protocol_tcp: Indicates TCP protocol usage.
protocol_udp: Indicates UDP protocol usage.
protocol_icmp: Indicates ICMP protocol usage.
label (only in train.csv): Type of cyber-attack to be predicted (e.g., DDoS, DoS, benign).
"""
df.schema

In [None]:
# Summary statistics for every column of `df`, rendered as the cell output.
df.describe()

In [None]:
# How many classes do we have?
# Case-fold the labels *before* counting, so that spellings differing only in
# letter case are counted as one class and the number printed here agrees
# with the per-class table built below.
df = df.with_columns(pl.col("label").str.to_lowercase())  # pl.col("label") is an expression object
print(df["label"].n_unique(), " unique labels")

# Are the classes unbalanced?
# One row per label with its row count and its share of the data set in percent.
# sort=True orders the table by descending count; without it the row order
# (and therefore the bar order in the plot that uses this table) is unspecified.
quickOVERVIEW = df["label"].value_counts(sort=True).with_columns(
    (pl.col("count") / df.height * 100).alias("percentage")
)
quickOVERVIEW

In [None]:
# Bar chart of the class distribution: one bar per label, height = percentage
# of rows carrying that label (taken from `quickOVERVIEW`).
LABEL_COLORS = {
    "ddos": "red",
    "recon": "blue",
    "benigntraffic": "green",
    "mitm": "orange",
    "dos": "purple",
    "mirai": "cyan",
}

class_names = quickOVERVIEW["label"].to_list()
# A label that is missing from the palette falls back to gray. Mapping with
# Series.replace would leave such a label unchanged, and matplotlib would then
# reject the label text as an invalid colour.
bar_colors = [LABEL_COLORS.get(name, "gray") for name in class_names]

fig, ax = plt.subplots(figsize=(20, 5), dpi=500)
ax.bar(class_names, quickOVERVIEW["percentage"].to_list(), color=bar_colors)
ax.set(title="Class distribution", xlabel="label", ylabel="share of rows (%)")
plt.show()  # also keeps the BarContainer repr out of the cell output

In [None]:
# Build a ydata-profiling report from a pandas conversion of the Polars frame.
report_options = dict(
    title="Profiling Report",
    explorative=True,
    lazy=True,
    minimal=True,
)
profile = ProfileReport(df.to_pandas(), **report_options)

# Render the report inline in the notebook (Kaggle should display it automatically).
profile.to_notebook_iframe()

In [None]:
# Box plot of every numeric column after standardisation.
# The scaled values are computed inside this cell instead of being read from
# `ScaledFeatures`: that name is only assigned further down the notebook, so
# on a fresh kernel ("Restart & Run All") it does not exist yet at this point.
numeric_frame = df.select(cs.numeric())
boxplot_features = pd.DataFrame(
    StandardScaler().fit_transform(numeric_frame),
    columns=numeric_frame.columns,  # keep the column names so each box is labelled on the x-axis
)

fig, ax = plt.subplots(figsize=(20, 5), dpi=500)

sns.boxplot(
    data=boxplot_features,  # numeric columns only
    showfliers=False,  # hide extreme outliers
    boxprops=dict(facecolor="lightblue", linewidth=2),  # box styling
    whiskerprops=dict(color="blue", linewidth=2, linestyle="--"),
    medianprops=dict(color="darkred", linewidth=2),
    capprops=dict(color="blue", linewidth=2),
    ax=ax,
)
ax.tick_params(axis="x", rotation=45)  # column names are long; rotate so they do not overlap
ax.set(title="Standardised numeric features", ylabel="standardised value")

plt.show()

In [None]:
# Summary statistics restricted to the columns at positions 0, 3, 4 and 9.
# Positions that do not exist in `df` are simply skipped.
# NOTE(review): if the column order matches the glossary these are presumably
# Id, packet_duration, overall_rate and max_value - confirm against df.schema.
positions_of_interest = {0, 3, 4, 9}
chosen_columns = [
    name
    for position, name in enumerate(df.columns)
    if position in positions_of_interest
]
df[chosen_columns].describe()

In [None]:
# Standardise every numeric column with StandardScaler and rebuild `df` as
# [scaled numeric columns] + [the untouched `label` column].
# Columns that are neither numeric nor `label` are not carried over.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before any
# train/test split, so test rows influence the scaling statistics - confirm
# this is acceptable for the evaluation done later.
numeric_columns = df.select(cs.numeric()).columns  # selected once, reused for data and schema
ScaledFeatures = StandardScaler().fit_transform(df.select(numeric_columns))
df = pl.concat(
    items=[
        pl.DataFrame(ScaledFeatures, schema=numeric_columns),
        df.select("label"),
    ],
    how="horizontal",
)
# Show a preview of the rebuilt frame (the cell used to end with a stray,
# undefined name `d`, which raised NameError).
df.head()

In [None]:
# Feature selection: score every numeric column against the label with
# f_classif and keep the 10 best-scoring ones.
# SelectKBest and f_classif come from sklearn.feature_selection, imported in
# the notebook's import cell.
# NOTE(review): an identifier column such as `Id` is a candidate too if it is
# numeric - consider excluding it before fitting.
numeric_features = df.select(cs.numeric())  # selected once, reused for fitting and name lookup

# Never ask for more features than exist; SelectKBest rejects k > n_features.
selector = SelectKBest(score_func=f_classif, k=min(10, numeric_features.width))

selector.fit(numeric_features, df["label"])

# Positional indices of the kept columns, then mapped back to column names.
selected_indices = selector.get_support(indices=True)
print(selected_indices , len(selected_indices),"features")
selected_features = [numeric_features.columns[i] for i in selected_indices]
print("\nSelected features: \n", selected_features)

In [None]:
# Unlike phase 1, this time we will test undersampling and oversampling.
# The idea was sanity-checked with a generative pre-trained transformer (LLM); its answer is kept below for reference.

"""You're dealing with a pretty classic **class imbalance problem**, especially with `"ddos"` being massively overrepresented and `"benigntraffic"`, `"mitm"`, and `"recon"` heavily underrepresented.

Here's a breakdown of your idea, and some suggestions:

---

### ✅ Your Plan (Under + Over Sampling):
Using **undersampling on the majority ("ddos")** and then **oversampling the minority classes** is totally reasonable and commonly done. Here's how it might work:

#### Step-by-step version of your plan:
1. **Undersample "ddos"** to reduce its impact.
2. **Oversample "benigntraffic", "mitm", "recon"**, and maybe even "mirai" to get a more balanced dataset.
3. **Possibly leave "dos" as is**, or slightly adjust it.

---

### 🧠 Tips on Executing It:

#### 1. ⚖️ **Target Balance Strategy**
Decide your final desired sample size per class. Example ideas:
- Match to the size of "mirai" (~50K)
- Or go a bit lower to reduce training size, say ~30K per class
- Avoid exact balance — slight randomness can help generalization

#### 2. 🧹 **Undersampling "ddos"**
You can randomly select ~30K–50K rows from "ddos" (from 687K). Make sure to **shuffle** before picking.

#### 3. 🔁 **Oversampling Minorities**
Use techniques like:
- **Random Oversampling** (e.g., simple duplication)
- **SMOTE/ADASYN** (synthetic data generation, if the features support it — i.e., numeric and continuous)

> Note: SMOTE doesn’t work well with categorical data unless encoded carefully.

#### 4. 🧪 **Be Careful with Oversampling**
If you're evaluating on the same dataset, make sure not to **leak duplicated synthetic samples into your validation set** — only oversample in the training split.

---

### ⚖️ Alternative: Use Class Weights
If you're using tree-based models (like XGBoost, LightGBM) or neural nets, consider using **class weighting** instead. It avoids changing the actual data and simply adjusts the cost of misclassification.

---

### 👀 Example Target Distribution
| Label          | Target Count |
|----------------|--------------|
| ddos           | 30,000       |
| benigntraffic  | 30,000       |
| mitm           | 30,000       |
| mirai          | 30,000       |
| recon          | 30,000       |
| dos            | 30,000       |

You could tune this depending on whether you want full balance or slight realism.

---

### 📦 TL;DR
- Your under+over strategy is solid.
- Make sure not to leak synthetic samples into validation/test.
- Consider class weights as a simpler alternative if supported.
- Don’t overdo oversampling — can lead to overfitting if minority class is too tiny.

Would you like help with writing some code to implement this in Python (e.g., using `pandas`, `sklearn`, `imbalanced-learn`)?"""

In [None]:
# Just a tester: a quick random-forest baseline on the selected features.
# RandomForestClassifier and classification_report are imported in the
# notebook's import cell; `selected_features` comes from the feature-selection cell.
labels = df["label"]

# Hold out 25% of the rows for evaluation.
# stratify=labels keeps the class proportions the same in both splits, which
# matters because the classes are imbalanced: an unstratified split can leave
# a rare class badly under-represented in the test set.
# NOTE(review): stratification requires at least 2 rows per class.
X_train, X_test, y_train, y_test = train_test_split(
    df.select(selected_features),
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)

# class_weight="balanced" helps with imbalance; n_jobs=-1 uses all CPU cores.
rf_model = RandomForestClassifier(random_state=42, class_weight="balanced", n_jobs=-1)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
