In [None]:
import time
from pathlib import Path
import pandas as pd  # ← make sure this is imported
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Pipeline: load up to `max_files` tab-separated files found under `root_dir`,
# tag every row with the name of the folder its file lives in, then train and
# evaluate a Random Forest classifier that predicts that folder name.

# Root directory that is searched recursively for *.csv files
root_dir = Path(r"server")

# Per-file dataframes are collected here and concatenated once at the end
df_list = []
file_count = 0  # files read successfully
skipped_count = 0  # files whose read raised an exception
max_files = 1000  # stop once this many files have been loaded successfully

# perf_counter() is monotonic, so the reported durations cannot be distorted
# by system clock adjustments the way time.time() differences can.
total_initial = time.perf_counter()
batch_start_time = time.perf_counter()

# Walk every CSV file in all subdirectories
for file_path in root_dir.rglob("*.csv"):
    if file_count >= max_files:
        break  # Reached the max_files cap

    # Only the read itself is guarded; the bookkeeping below runs on success.
    try:
        temp_df = pd.read_csv(
            file_path, delimiter="\t", header=None, on_bad_lines="skip"
        )
    except Exception as e:
        # Deliberate best-effort load: report the unreadable file and move on
        # instead of aborting the whole run.
        skipped_count += 1
        print(f"⚠️ Skipped {file_path.name}: {e}")
        continue

    # The immediate parent folder name is used as the class label of each row
    label_folder = file_path.parent.name
    temp_df["label"] = label_folder
    df_list.append(temp_df)
    file_count += 1

    # After every 50 files, print status and time
    if file_count % 50 == 0:
        batch_duration = time.perf_counter() - batch_start_time
        print(
            f"✔ Processed {file_count} files... (last 50 in {batch_duration:.2f} seconds)"
        )
        batch_start_time = time.perf_counter()  # Reset batch timer

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list, so report the actual cause (nothing found / everything skipped).
if not df_list:
    raise ValueError(
        f"No CSV files could be loaded from {root_dir} "
        f"({skipped_count} skipped due to read errors)."
    )

# One-time concat — much faster than growing a dataframe inside the loop
df = pd.concat(df_list, ignore_index=True)

# Print summary
total_duration = time.perf_counter() - total_initial
print(f"\n✅ Loaded data from {file_count} files.")
print(f"⏱ Total processing time: {total_duration:.2f} seconds")
if skipped_count > 0:
    print(f"⚠️ Skipped {skipped_count} files due to read errors.")
print(f"📊 Final dataset shape: {df.shape}")

# Prepare features/labels
# NOTE(review): files with differing column counts leave NaN cells after the
# concat, and non-numeric columns are passed through unchanged — confirm the
# data is numeric and rectangular before relying on the fit below.
X = df.drop("label", axis=1)
y = df["label"]

# Encode the folder-name labels as integers 0..n_classes-1
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print("\nClassification Report:\n")
# The split is not stratified, so a rare class may be absent from both y_test
# and y_pred. Without an explicit `labels`, classification_report would then
# raise because the inferred label set is shorter than `target_names`.
class_ids = list(range(len(label_encoder.classes_)))
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,  # classes with no samples report 0 without warnings
    )
)
print("Accuracy:", accuracy_score(y_test, y_pred))