In [None]:
from google.colab import files
uploaded = files.upload()
import zipfile
import os
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Extrage ZIP-ul
with zipfile.ZipFile('datasets_with_attacks.zip', 'r') as zip_ref:
    zip_ref.extractall('datasets_with_attacks')

print("Conținutul folderului după extragere:")
print(os.listdir('datasets_with_attacks'))

In [1]:


# Funcție pentru găsirea automată a fișierelor CSV relevante
def find_vehicle_csv_files(base_directory):
    """Găsește automat fișierele CSV pentru vehicule"""
    csv_files = {}

    # Caută în toate subdirectoarele
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if file.endswith('.csv') and 'Vehicle' in file:
                # Încearcă să determine numele vehiculului din numele fișierului
                if 'Vehicle A' in file:
                    vehicle_name = 'Vehicle A'
                elif 'Vehicle B' in file:
                    vehicle_name = 'Vehicle B'
                elif 'Vehicle C' in file:
                    vehicle_name = 'Vehicle C'
                else:
                    # Dacă nu găsim un pattern standard, folosim numele fișierului
                    vehicle_name = file.split(' - ')[0] if ' - ' in file else file.replace('.csv', '')

                csv_files[vehicle_name] = os.path.join(root, file)
                print(f"Găsit: {vehicle_name} -> {os.path.join(root, file)}")

    return csv_files

# Găsește fișierele
vehicle_files = find_vehicle_csv_files('datasets_with_attacks')

# Funcție pentru încărcarea dataset-urilor
def load_datasets(vehicle_files):
    """Încarcă toate dataset-urile vehiculelor"""
    dfs = {}

    for vehicle_name, file_path in vehicle_files.items():
        try:
            print(f"\nÎncarc {vehicle_name} din {file_path}")

            # Încearcă diferite delimitatoare
            df = None
            for delimiter in [';', ',', '\t']:
                try:
                    df = pd.read_csv(file_path, delimiter=delimiter)
                    print(f"Succes cu delimitatorul '{delimiter}'")
                    break
                except:
                    continue

            if df is not None:
                # Redenumește coloanele
                df.columns = [f'Column{i}' for i in range(len(df.columns))]
                dfs[vehicle_name] = df
                print(f"Forme dataset: {df.shape}")
                print(f"Coloane: {list(df.columns)}")
            else:
                print(f"Nu s-a putut încărca {vehicle_name}")

        except Exception as e:
            print(f"Eroare la încărcarea {vehicle_name}: {e}")

    return dfs

# Încarcă dataset-urile
dfs = load_datasets(vehicle_files)

print(f"\nÎncărcate cu succes: {len(dfs)} dataset-uri")
for vehicle, df in dfs.items():
    print(f"  {vehicle}: {df.shape}")

# Funcție pentru echilibrarea datelor
def downsample_no_attack(df, label_col='Column13', no_attack_label=0, fraction=0.7):
    if df is None:
        return None
    no_attack_rows = df[df[label_col] == no_attack_label]
    attack_rows = df[df[label_col] != no_attack_label]

    # Randomly sample 60% (i.e., keep 60%) of no-attack rows
    no_attack_downsampled = no_attack_rows.sample(frac=1 - fraction, random_state=42)

    # Combine with all attack rows
    downsampled_df = pd.concat([no_attack_downsampled, attack_rows]).reset_index(drop=True)
    return downsampled_df

# Aplică echilibrarea pentru fiecare dataset
df_a = dfs.get('Vehicle A')
df_b = dfs.get('Vehicle B')
df_c = dfs.get('Vehicle C')

df_a = downsample_no_attack(df_a) if df_a is not None else None
df_b = downsample_no_attack(df_b) if df_b is not None else None
df_c = downsample_no_attack(df_c) if df_c is not None else None

# Funcție pentru afișarea informațiilor despre dataset
def print_info(df, name):
    if df is not None:
        total = len(df)
        counts = df['Column13'].value_counts(normalize=True)
        print(f"\n{name}:")
        print(f"Total entries: {total}")
        print(f"Label proportions:\n{counts}")
    else:
        print(f"{name} dataframe not loaded.")

print_info(df_a, "Vehicle A")
print_info(df_b, "Vehicle B")
print_info(df_c, "Vehicle C")

# Adaugă identificatori pentru vehicule
if df_a is not None:
    df_a['Vehicle'] = 0
if df_b is not None:
    df_b['Vehicle'] = 1
if df_c is not None:
    df_c['Vehicle'] = 2

# Combină toate dataset-urile
df_all_3 = pd.concat([df_a, df_b, df_c], ignore_index=True)

print(f"\nDataset combinat: {df_all_3.shape}")
print(f"Distribuția etichetelor:")
print(df_all_3['Column13'].value_counts(normalize=True))

# Amestecă datele
df_all_3 = df_all_3.sample(frac=1, random_state=42).reset_index(drop=True)

# Definește caracteristicile și ținta
X = df_all_3.drop(columns=['Column13'])
y = df_all_3['Column13']

# Împarte datele 80-20 stratificat
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train distribution: {pd.Series(y_train).value_counts(normalize=True)}")

# Normalizează caracteristicile
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Datele au fost normalizate.")

# Aplică SMOTE pentru echilibrarea claselor în setul de antrenament
print("\nAplicare SMOTE pentru echilibrarea claselor...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"După SMOTE - X_train shape: {X_train_smote.shape}")
print(f"După SMOTE - y_train distribution: {pd.Series(y_train_smote).value_counts(normalize=True)}")

# Dicționar pentru stocarea rezultatelor
results = {}

# Funcție pentru antrenarea și evaluarea modelelor
def train_and_evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Antrenează și evaluează un model"""
    print(f"\n{'='*50}")
    print(f"Antrenare model: {name}")
    print(f"{'='*50}")

    # Antrenare
    model.fit(X_train, y_train)

    # Predicții
    y_pred = model.predict(X_test)

    # Calculează metricile
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # Calculează AUC dacă modelul suportă predict_proba
    try:
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
    except:
        auc = None

    # Afișează rezultatele
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    if auc is not None:
        print(f"AUC:       {auc:.4f}")

    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Stochează rezultatele
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc,
        'y_pred': y_pred,
        'y_proba': y_proba if auc is not None else None
    }

    return model

print("\n" + "="*70)
print("ANTRENAREA MODELELOR DE MACHINE LEARNING")
print("="*70)

# 1. Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=2000)
train_and_evaluate_model("Logistic Regression", lr, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 2. Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_evaluate_model("Random Forest", rf, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 3. Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
train_and_evaluate_model("Gradient Boosting", gb, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 4. Support Vector Machine
svm = SVC(kernel='rbf', probability=True, random_state=42)
train_and_evaluate_model("SVM", svm, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 5. K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
train_and_evaluate_model("KNN", knn, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 6. Decision Tree
dt = DecisionTreeClassifier(random_state=42)
train_and_evaluate_model("Decision Tree", dt, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 7. Naive Bayes
nb = GaussianNB()
train_and_evaluate_model("Naive Bayes", nb, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 8. AdaBoost
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
train_and_evaluate_model("AdaBoost", ada, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 9. Extra Trees
et = ExtraTreesClassifier(n_estimators=100, random_state=42)
train_and_evaluate_model("Extra Trees", et, X_train_smote, y_train_smote, X_test_scaled, y_test)

# 10. Neural Network (MLP)
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
train_and_evaluate_model("Neural Network", mlp, X_train_smote, y_train_smote, X_test_scaled, y_test)

# Comparația finală a tuturor modelelor
print("\n" + "="*70)
print("COMPARAȚIA FINALĂ A MODELELOR")
print("="*70)

# Creează un DataFrame pentru comparație
comparison_data = []
for name, result in results.items():
    comparison_data.append({
        'Model': name,
        'Accuracy': result['accuracy'],
        'Precision': result['precision'],
        'Recall': result['recall'],
        'F1-Score': result['f1_score'],
        'AUC': result['auc'] if result['auc'] is not None else 'N/A'
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('F1-Score', ascending=False)

print("\nRezultatele finale (sortate după F1-Score):")
print(comparison_df.to_string(index=False, float_format='%.4f'))

# Găsește cel mai bun model
best_model_name = comparison_df.iloc[0]['Model']
best_model_result = results[best_model_name]

print(f"\n🏆 CEL MAI BUN MODEL: {best_model_name}")
print(f"F1-Score: {best_model_result['f1_score']:.4f}")
print(f"Accuracy: {best_model_result['accuracy']:.4f}")
print(f"Precision: {best_model_result['precision']:.4f}")
print(f"Recall: {best_model_result['recall']:.4f}")

# Funcție pentru optimizarea hiperparametrilor
def optimize_best_models():
    """Optimizează hiperparametrii pentru cele mai bune 3 modele"""
    print("\n" + "="*70)
    print("OPTIMIZAREA HIPERPARAMETRILOR")
    print("="*70)

    # Selectează top 3 modele
    top_3_models = comparison_df.head(3)['Model'].tolist()

    optimized_results = {}

    for model_name in top_3_models:
        print(f"\nOptimizare pentru {model_name}...")

        if model_name == "Random Forest":
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5]
            }
            model = RandomForestClassifier(random_state=42)

        elif model_name == "Gradient Boosting":
            param_grid = {
                'n_estimators': [50, 100, 150],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
            model = GradientBoostingClassifier(random_state=42)

        elif model_name == "Extra Trees":
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5]
            }
            model = ExtraTreesClassifier(random_state=42)

        else:
            continue

        # Grid Search
        grid_search = GridSearchCV(
            model, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=0
        )

        grid_search.fit(X_train_smote, y_train_smote)

        # Evaluare model optimizat
        best_model = grid_search.best_estimator_
        y_pred_opt = best_model.predict(X_test_scaled)

        f1_opt = f1_score(y_test, y_pred_opt)
        accuracy_opt = accuracy_score(y_test, y_pred_opt)
        precision_opt = precision_score(y_test, y_pred_opt)
        recall_opt = recall_score(y_test, y_pred_opt)

        print(f"Cei mai buni parametri: {grid_search.best_params_}")
        print(f"F1-Score optimizat: {f1_opt:.4f} (îmbunătățire: {f1_opt - results[model_name]['f1_score']:+.4f})")

        optimized_results[model_name] = {
            'model': best_model,
            'f1_score': f1_opt,
            'accuracy': accuracy_opt,
            'precision': precision_opt,
            'recall': recall_opt,
            'best_params': grid_search.best_params_
        }

    return optimized_results

# Rulează optimizarea
optimized_results = optimize_best_models()

# Cross-validation pentru validarea rezultatelor
print("\n" + "="*70)
print("VALIDAREA CROSS-VALIDATION")
print("="*70)

def cross_validate_models():
    """Efectuează cross-validation pentru modelele principale"""
    cv_results = {}

    models_to_cv = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42)
    }

    for name, model in models_to_cv.items():
        print(f"\nCross-validation pentru {name}...")

        # 5-fold stratified cross-validation
        cv_scores = cross_val_score(model, X_train_smote, y_train_smote,
                                  cv=5, scoring='f1', n_jobs=-1)

        print(f"F1-Score CV: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        print(f"Scoruri individuale: {[f'{score:.4f}' for score in cv_scores]}")

        cv_results[name] = {
            'mean_f1': cv_scores.mean(),
            'std_f1': cv_scores.std(),
            'scores': cv_scores
        }

    return cv_results

cv_results = cross_validate_models()

# Analiza importanței caracteristicilor pentru cel mai bun model
def analyze_feature_importance():
    """Analizează importanța caracteristicilor pentru modelele bazate pe arbori"""
    print("\n" + "="*70)
    print("ANALIZA IMPORTANȚEI CARACTERISTICILOR")
    print("="*70)

    tree_models = ['Random Forest', 'Gradient Boosting', 'Extra Trees', 'Decision Tree']

    for model_name in tree_models:
        if model_name in results:
            model = results[model_name]['model']
            if hasattr(model, 'feature_importances_'):
                print(f"\n{model_name} - Top 10 caracteristici importante:")

                feature_names = X.columns
                importances = model.feature_importances_

                # Creează DataFrame pentru importanțe
                importance_df = pd.DataFrame({
                    'Feature': feature_names,
                    'Importance': importances
                }).sort_values('Importance', ascending=False)

                print(importance_df.head(10).to_string(index=False, float_format='%.4f'))

analyze_feature_importance()

print("\n" + "="*70)
print("ANALIZA COMPLETĂ FINALIZATĂ!")
print("="*70)
print(f"✅ Au fost antrenate și evaluate {len(results)} modele de ML")
print(f"🏆 Cel mai bun model: {best_model_name} (F1-Score: {best_model_result['f1_score']:.4f})")
print(f"📊 Cross-validation efectuată pentru modelele principale")
print(f"⚙️  Hiperparametrii optimizați pentru top 3 modele")
print(f"📈 Analiza importanței caracteristicilor completată")
print("\nRezultatele sunt gata pentru lucrarea de master! 🎓")

FileNotFoundError: [Errno 2] No such file or directory: 'datasets_with_attacks.zip'