<a href="https://colab.research.google.com/github/BreachFinder777/Booth-Algorithm/blob/main/Android%20Malware.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Android Malware Analysis using Machine Learning
# Author: Christian Camilo Urcuqui López (Modified and Enhanced)
# Date: June 2025
#
# This script performs Android malware detection using two approaches:
# 1. Static Analysis: Analyzing Android app permissions
# 2. Dynamic Analysis: Analyzing network traffic patterns
#
# Datasets:
# - Permissions: xwolf12/datasetandroidpermissions (train.csv)
# - Network Traffic: xwolf12/network-traffic-android-malware (android_traffic.csv)

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report, cohen_kappa_score
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

def load_datasets():
    """Download and return paths to the Kaggle datasets."""
    permissions_path = kagglehub.dataset_download('xwolf12/datasetandroidpermissions')
    traffic_path = kagglehub.dataset_download('xwolf12/network-traffic-android-malware')

    # Assume the files are named as in the notebook
    static_data_path = os.path.join(permissions_path, 'train.csv')
    dynamic_data_path = os.path.join(traffic_path, 'android_traffic.csv')

    if not os.path.exists(static_data_path):
        raise FileNotFoundError(f"Static dataset not found at {static_data_path}")
    if not os.path.exists(dynamic_data_path):
        raise FileNotFoundError(f"Dynamic dataset not found at {dynamic_data_path}")

    return static_data_path, dynamic_data_path

def perform_static_analysis(data_path):
    """
    Perform static analysis on Android apps by analyzing their permissions.

    Args:
        data_path (str): Path to the permissions dataset CSV file

    Returns:
        tuple: Training and testing data splits, original DataFrame
    """
    print("STATIC ANALYSIS: ANDROID PERMISSIONS")
    print("-" * 50)

    try:
        df = pd.read_csv(data_path, sep=";")
        print(f"Dataset loaded: {df.shape[0]} apps, {df.shape[1]} features")

        # Verify dataset structure
        if 'type' not in df.columns or df.shape[1] < 2:
            raise ValueError("Invalid dataset structure: 'type' column missing or insufficient features")

        # Convert to integer, handling non-numeric values
        for col in df.columns:
            if col != 'type':
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype('int64')

        print("\nDataset Distribution:")
        print(f"Benign apps (0): {df['type'].value_counts().get(0, 0)}")
        print(f"Malware apps (1): {df['type'].value_counts().get(1, 0)}")
        balance_ratio = min(df['type'].value_counts()) / max(df['type'].value_counts())
        print(f"Balance ratio: {balance_ratio:.2f} {'(Balanced)' if balance_ratio > 0.8 else '(Imbalanced)'}\n")

        print("Permission Analysis:")
        benign_permissions = df[df['type'] == 0].sum(axis=0).sort_values(ascending=False)[1:11]
        print("\nTop 10 permissions used by benign apps:")
        for i, (perm, count) in enumerate(benign_permissions.items(), 1):
            print(f"{i:2d}. {perm}: {count} apps")

        malware_permissions = df[df['type'] == 1].sum(axis=0).sort_values(ascending=False)
        malware_permissions = malware_permissions[malware_permissions.index != 'type'][:10]
        print("\nTop 10 permissions used by malware apps:")
        for i, (perm, count) in enumerate(malware_permissions.items(), 1):
            print(f"{i:2d}. {perm}: {count} apps")

        create_permission_visualization(df)

        X = df.drop(columns=['type'])
        y = df['type']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42, stratify=y
        )

        print("\nData Split:")
        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Testing set: {X_test.shape[0]} samples")

        return X_train, X_test, y_train, y_test, df

    except Exception as e:
        print(f"Error in static analysis: {str(e)}")
        return None, None, None, None, None

def create_permission_visualization(df):
    """
    Create visualizations comparing permission usage between malware and benign apps.

    Args:
        df (DataFrame): The permissions dataset
    """
    try:
        plt.style.use('seaborn-v0_8')
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        benign_top = df[df['type'] == 0].sum(axis=0).sort_values(ascending=False)[1:11]
        benign_top.plot.bar(ax=ax1, color='green', alpha=0.7)
        ax1.set_title('Top 10 Permissions - Benign Apps', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Number of Apps')
        ax1.tick_params(axis='x', rotation=45)

        malware_top = df[df['type'] == 1].sum(axis=0).sort_values(ascending=False)
        malware_top = malware_top[malware_top.index != 'type'][:10]
        malware_top.plot.bar(ax=ax2, color='red', alpha=0.7)
        ax2.set_title('Top 10 Permissions - Malware Apps', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Number of Apps')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('permission_analysis.png', dpi=300, bbox_inches='tight')
        print("Visualization saved as 'permission_analysis.png'")
        plt.close()

    except Exception as e:
        print(f"Could not create visualization: {str(e)}")

def perform_dynamic_analysis(data_path):
    """
    Perform dynamic analysis on Android apps by analyzing network traffic.

    Args:
        data_path (str): Path to the network traffic dataset CSV file

    Returns:
        tuple: Processed training and testing data splits, original DataFrame
    """
    print("\nDYNAMIC ANALYSIS: NETWORK TRAFFIC")
    print("-" * 50)

    try:
        data = pd.read_csv(data_path, sep=";", encoding='latin1')
        print(f"Dataset loaded: {data.shape[0]} samples, {data.shape[1]} features")
        print(f"\nColumns: {list(data.columns)}")

        print("\nDataset Distribution:")
        print(f"Benign samples: {data['type'].value_counts().get('benign', 0)}")
        print(f"Malware samples: {data['type'].value_counts().get('malware', 0)}")

        print("\nData Preprocessing:")
        missing_values = data.isna().sum()
        print(f"Missing values: {missing_values.sum()}")
        if missing_values.sum() > 0:
            print(f"Columns with missing values: {missing_values[missing_values > 0].to_dict()}")

        columns_to_drop = []
        for col in ['duracion', 'avg_local_pkt_rate', 'avg_remote_pkt_rate']:
            if col in data.columns:
                columns_to_drop.append(col)
        if 'tcp_urg_packet' in data.columns and data['tcp_urg_packet'].sum() <= 2:
            columns_to_drop.append('tcp_urg_packet')

        if columns_to_drop:
            data = data.drop(columns=columns_to_drop)
            print(f"Dropped columns: {columns_to_drop}")

        duplicates = data.duplicated().sum()
        if duplicates > 0:
            data = data.drop_duplicates()
            print(f"Removed {duplicates} duplicate rows")

        data = remove_outliers(data)
        print(f"Final dataset shape: {data.shape}")

        feature_columns = [col for col in data.columns if col != 'type']
        X = data[feature_columns]
        y = data['type'].astype(str)

        scaler = RobustScaler()
        X_scaled = scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=feature_columns)

        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=0.25, random_state=45, stratify=y
        )

        print("\nData Split:")
        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Testing set: {X_test.shape[0]} samples")

        return X_train, X_test, y_train, y_test, data

    except Exception as e:
        print(f"Error in dynamic analysis: {str(e)}")
        return None, None, None, None, None

def remove_outliers(data):
    """
    Remove outliers from the dataset using statistical thresholds.

    Args:
        data (DataFrame): The dataset to clean

    Returns:
        DataFrame: Cleaned dataset
    """
    original_size = data.shape[0]
    outlier_conditions = [
        ('tcp_packets', 20000),
        ('dist_port_tcp', 1400),
        ('external_ips', 35),
        ('vulume_bytes', 2000000),
        ('udp_packets', 40),
        ('remote_app_packets', 15000)
    ]

    for column, threshold in outlier_conditions:
        if column in data.columns:
            before_count = data.shape[0]
            data = data[data[column] < threshold]
            removed = before_count - data.shape[0]
            if removed > 0:
                print(f"Removed {removed} outliers from {column} (threshold: {threshold})")

    if 'source_app_packets.1' in data.columns:
        data = data.drop('source_app_packets.1', axis=1)
        print("Removed duplicate column: source_app_packets.1")

    print(f"Outlier removal: {original_size} -> {data.shape[0]} samples ({original_size - data.shape[0]} removed)")
    return data

def train_static_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate machine learning models for static analysis.

    Args:
        X_train, X_test, y_train, y_test: Training and testing data splits

    Returns:
        dict: Model results
    """
    print("\nTRAINING MODELS: STATIC ANALYSIS")
    print("-" * 50)

    results = {}
    models = [
        ('Naive Bayes', GaussianNB()),
        ('Decision Tree', DecisionTreeClassifier(random_state=42, max_depth=10)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42, max_depth=15)),
        ('SVM', SVC(kernel='rbf', random_state=42))
    ]

    best_k = 3
    best_knn_score = 0

    for k in range(3, 16, 3):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        pred_knn = knn.predict(X_test)
        accuracy = accuracy_score(y_test, pred_knn)
        if accuracy > best_knn_score:
            best_knn_score = accuracy
            best_k = k

    models.append((f"KNN (k={best_k})", KNeighborsClassifier(n_neighbors=best_k)))

    for name, model in models:
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        results[name] = evaluate_model(y_test, predictions, name)

    return results

def train_dynamic_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate machine learning models for dynamic analysis.

    Args:
        X_train, X_test, y_train, y_test: Training and testing data splits

    Returns:
        dict: Model results
    """
    print("\nTRAINING MODELS: DYNAMIC ANALYSIS")
    print("-" * 50)

    results = {}
    models = [
        ('Naive Bayes', GaussianNB()),
        ('Random Forest', RandomForestClassifier(n_estimators=250, max_depth=50, random_state=45)),
    ]

    best_k = 3
    best_knn_score = 0

    for k in range(3, 16, 3):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        pred_knn = knn.predict(X_test)
        accuracy = accuracy_score(y_test, pred_knn)
        if accuracy > best_knn_score:
            best_knn_score = accuracy
            best_k = k

    models.append((f"KNN (k={best_k})", KNeighborsClassifier(n_neighbors=best_k)))

    for name, model in models:
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        results[name] = evaluate_model(y_test, predictions, name, use_kappa=True)

        if name == 'Random Forest' and hasattr(model, 'feature_importances_'):
            feature_names = X_train.columns
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            print(f"\nTop 5 Features (Random Forest):")
            for i, row in feature_importance.head().iterrows():
                print(f"{row['feature']}: {row['importance']:.4f}")

    return results

def evaluate_model(y_true, y_pred, model_name, use_kappa=False):
    """
    Evaluate a machine learning model's performance.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name (str): Name of the model
        use_kappa (bool): Whether to include Cohen's Kappa score

    Returns:
        dict: Evaluation metrics
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    if use_kappa:
        kappa = cohen_kappa_score(y_true, y_pred)
        print(f"Cohen's Kappa: {kappa:.4f}")

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1, 'cohen_kappa': kappa if use_kappa else None}

def compare_results(static_results, dynamic_results):
    """
    Compare results from static and dynamic analysis.

    Args:
        static_results (dict): Results from static analysis models
        dynamic_results (dict): Results from dynamic analysis models
    """
    print("\nCOMPARATIVE ANALYSIS")
    print("=" * 60)

    if static_results:
        print("\nSTATIC ANALYSIS: PERMISSION-BASED")
        print("-" * 50)
        best_static = max(static_results.items(), key=lambda x: x[1]['accuracy'])
        print(f"Best Model: {best_static[0]}")
        print(f"Accuracy: {best_static[1]['accuracy']:.4f}")
        print("\nAll Models:")
        for model, metrics in static_results.items():
            print(f"{model:20}: Accuracy = {metrics['accuracy']:.4f}, "
                  f"Precision = {metrics['precision']:.4f}, "
                  f"Recall = {metrics['recall']:.4f}, "
                  f"F1-Score = {metrics['f1_score']:.4f}")

    if dynamic_results:
        print("\nDYNAMIC ANALYSIS: NETWORK TRAFFIC-BASED")
        print("-" * 50)
        best_dynamic = max(dynamic_results.items(), key=lambda x: x[1]['accuracy'])
        print(f"Best Model: {best_dynamic[0]}")
        print(f"Accuracy: {best_dynamic[1]['accuracy']:.4f}")
        print("\nAll Models:")
        for model, metrics in dynamic_results.items():
            print(f"{model:20}: Accuracy = {metrics['accuracy']:.4f}, "
                  f"Precision = {metrics['precision']:.4f}, "
                  f"Recall = {metrics['recall']:.4f}, "
                  f"F1-Score = {metrics['f1_score']:.4f}, "
                  f"Cohen's Kappa = {metrics['cohen_kappa']:.4f}")

    print("\nMODEL COMPARISON")
    print("-" * 50)
    if static_results and dynamic_results:
        print(f"Best Static Model: {best_static[0]} (Accuracy: {best_static[1]['accuracy']:.4f})")
        print(f"Best Dynamic Model: {best_dynamic[0]} (Accuracy: {best_dynamic[1]['accuracy']:.4f})")
        print("\nRecommendation:")
        if best_static[1]['accuracy'] > best_dynamic[1]['accuracy']:
            print("Static analysis performs better. Use permission-based features with "
                  f"{best_static[0]} for optimal malware detection.")
        else:
            print("Dynamic analysis performs better. Use network traffic-based features with "
                  f"{best_dynamic[0]} for optimal malware detection.")

def main():
    """
    Main function to execute the Android malware analysis pipeline.
    """
    print("ANDROID MALWARE ANALYSIS PIPELINE")
    print("=" * 60)
    print(f"Started at: {datetime.now()}")

    try:
        static_data_path, dynamic_data_path = load_datasets()

        static_data = perform_static_analysis(static_data_path)
        static_results = None
        if static_data[0] is not None:
            X_train_static, X_test_static, y_train_static, y_test_static, df_static = static_data
            static_results = train_static_models(X_train_static, X_test_static,
                                              y_train_static, y_test_static)
        else:
            print("Static analysis failed - skipping static models")

        dynamic_data = perform_dynamic_analysis(dynamic_data_path)
        dynamic_results = None
        if dynamic_data[0] is not None:
            X_train_dynamic, X_test_dynamic, y_train_dynamic, y_test_dynamic, df_dynamic = dynamic_data
            dynamic_results = train_dynamic_models(X_train_dynamic, X_test_dynamic,
                                                y_train_dynamic, y_test_dynamic)
        else:
            print("Dynamic analysis failed - skipping dynamic models")

        compare_results(static_results, dynamic_results)

        print("\nANALYSIS SUMMARY")
        print("=" * 60)
        print("Completed successfully")
        print("Analysis types: Static (Permissions), Dynamic (Network Traffic)")
        print("Models evaluated: Naive Bayes, KNN, Decision Tree, Random Forest, SVM")
        print(f"Completed at: {datetime.now()}")

    except Exception as e:
        print(f"Error in execution: {str(e)}")
        print("Please verify dataset availability and Kaggle API credentials.")

if __name__ == "__main__":
    try:
        import matplotlib
        matplotlib.use('Agg')
    except:
        pass
    main()

ANDROID MALWARE ANALYSIS PIPELINE
Started at: 2025-06-20 16:42:31.475777
STATIC ANALYSIS: ANDROID PERMISSIONS
--------------------------------------------------
Dataset loaded: 398 apps, 331 features

Dataset Distribution:
Benign apps (0): 199
Malware apps (1): 199
Balance ratio: 1.00 (Balanced)

Permission Analysis:

Top 10 permissions used by benign apps:
 1. android.permission.WRITE_EXTERNAL_STORAGE: 76 apps
 2. android.permission.ACCESS_NETWORK_STATE: 62 apps
 3. android.permission.WAKE_LOCK: 36 apps
 4. android.permission.RECEIVE_BOOT_COMPLETED: 30 apps
 5. android.permission.ACCESS_WIFI_STATE: 29 apps
 6. android.permission.READ_PHONE_STATE: 24 apps
 7. android.permission.VIBRATE: 21 apps
 8. android.permission.ACCESS_FINE_LOCATION: 18 apps
 9. android.permission.READ_EXTERNAL_STORAGE: 15 apps
10. android.permission.ACCESS_COARSE_LOCATION: 13 apps

Top 10 permissions used by malware apps:
 1. android.permission.INTERNET: 195 apps
 2. android.permission.READ_PHONE_STATE: 190 apps


In [1]:
# Android Malware Analysis using Machine Learning
# Author: Christian Camilo Urcuqui López (Modified and Enhanced)
# Date: June 2025
#
# This script performs Android malware detection using two approaches:
# 1. Static Analysis: Analyzing Android app permissions
# 2. Dynamic Analysis: Analyzing network traffic patterns
#
# Datasets:
# - Permissions: xwolf12/datasetandroidpermissions (train.csv)
# - Network Traffic: xwolf12/network-traffic-android-malware (android_traffic.csv)

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, cohen_kappa_score
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

# Visualization Functions
def plot_class_distribution(y, title, save_path):
    counts = y.value_counts()
    labels = counts.index
    if all(isinstance(label, (int, float)) for label in labels):
        labels = ['Benign' if label == 0 else 'Malware' for label in labels]
    else:
        labels = [label.capitalize() for label in labels]
    plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.title(title)
    plt.savefig(save_path)
    plt.close()

def plot_feature_importance(feature_importance, title, save_path, top_n=10):
    feature_importance = feature_importance.sort_values('importance', ascending=False).head(top_n)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

def plot_correlation_heatmap(X, title, save_path):
    corr = X.corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, cmap='coolwarm', center=0, annot=False)
    plt.title(title)
    plt.savefig(save_path)
    plt.close()

def plot_boxplots(data, columns, title, save_path):
    if not columns:
        return
    n_cols = min(len(columns), 4)
    fig, axes = plt.subplots(1, n_cols, figsize=(5 * n_cols, 5))
    if n_cols == 1:
        axes = [axes]
    for ax, col in zip(axes, columns):
        sns.boxplot(y=data[col], ax=ax)
        ax.set_title(col)
    plt.suptitle(title)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

def plot_model_comparison(results, metric, title, save_path):
    models = list(results.keys())
    values = [results[model][metric] for model in models]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=models, y=values)
    plt.title(title)
    plt.ylabel(metric.capitalize())
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

def plot_confusion_matrix(cm, y_true, title, save_path):
    labels = sorted(set(y_true))
    if labels == [0,1]:
        display_labels = ['Benign', 'Malware']
    else:
        display_labels = [label.capitalize() for label in labels]
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=display_labels, yticklabels=display_labels)
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(save_path)
    plt.close()

def load_datasets():
    """Download and return paths to the Kaggle datasets."""
    permissions_path = kagglehub.dataset_download('xwolf12/datasetandroidpermissions')
    traffic_path = kagglehub.dataset_download('xwolf12/network-traffic-android-malware')

    static_data_path = os.path.join(permissions_path, 'train.csv')
    dynamic_data_path = os.path.join(traffic_path, 'android_traffic.csv')

    if not os.path.exists(static_data_path):
        raise FileNotFoundError(f"Static dataset not found at {static_data_path}")
    if not os.path.exists(dynamic_data_path):
        raise FileNotFoundError(f"Dynamic dataset not found at {dynamic_data_path}")

    return static_data_path, dynamic_data_path

def perform_static_analysis(data_path):
    print("STATIC ANALYSIS: ANDROID PERMISSIONS")
    print("-" * 50)

    try:
        df = pd.read_csv(data_path, sep=";")
        print(f"Dataset loaded: {df.shape[0]} apps, {df.shape[1]} features")

        if 'type' not in df.columns or df.shape[1] < 2:
            raise ValueError("Invalid dataset structure: 'type' column missing or insufficient features")

        for col in df.columns:
            if col != 'type':
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype('int64')

        print("\nDataset Distribution:")
        print(f"Benign apps (0): {df['type'].value_counts().get(0, 0)}")
        print(f"Malware apps (1): {df['type'].value_counts().get(1, 0)}")
        balance_ratio = min(df['type'].value_counts()) / max(df['type'].value_counts())
        print(f"Balance ratio: {balance_ratio:.2f} {'(Balanced)' if balance_ratio > 0.8 else '(Imbalanced)'}\n")

        # Plot class distribution
        plot_class_distribution(df['type'], 'Static Analysis Class Distribution', 'static_class_distribution.png')

        print("Permission Analysis:")
        benign_permissions = df[df['type'] == 0].sum(axis=0).sort_values(ascending=False)[1:11]
        print("\nTop 10 permissions used by benign apps:")
        for i, (perm, count) in enumerate(benign_permissions.items(), 1):
            print(f"{i:2d}. {perm}: {count} apps")

        malware_permissions = df[df['type'] == 1].sum(axis=0).sort_values(ascending=False)
        malware_permissions = malware_permissions[malware_permissions.index != 'type'][:10]
        print("\nTop 10 permissions used by malware apps:")
        for i, (perm, count) in enumerate(malware_permissions.items(), 1):
            print(f"{i:2d}. {perm}: {count} apps")

        create_permission_visualization(df)

        X = df.drop(columns=['type'])
        y = df['type']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42, stratify=y
        )

        print("\nData Split:")
        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Testing set: {X_test.shape[0]} samples")

        return X_train, X_test, y_train, y_test, df

    except Exception as e:
        print(f"Error in static analysis: {str(e)}")
        return None, None, None, None, None

def create_permission_visualization(df):
    try:
        plt.style.use('seaborn-v0_8')
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        benign_top = df[df['type'] == 0].sum(axis=0).sort_values(ascending=False)[1:11]
        benign_top.plot.bar(ax=ax1, color='green', alpha=0.7)
        ax1.set_title('Top 10 Permissions - Benign Apps', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Number of Apps')
        ax1.tick_params(axis='x', rotation=45)

        malware_top = df[df['type'] == 1].sum(axis=0).sort_values(ascending=False)
        malware_top = malware_top[malware_top.index != 'type'][:10]
        malware_top.plot.bar(ax=ax2, color='red', alpha=0.7)
        ax2.set_title('Top 10 Permissions - Malware Apps', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Number of Apps')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('permission_analysis.png', dpi=300, bbox_inches='tight')
        print("Visualization saved as 'permission_analysis.png'")
        plt.close()

    except Exception as e:
        print(f"Could not create visualization: {str(e)}")

def perform_dynamic_analysis(data_path):
    print("\nDYNAMIC ANALYSIS: NETWORK TRAFFIC")
    print("-" * 50)

    try:
        data = pd.read_csv(data_path, sep=";", encoding='latin1')
        print(f"Dataset loaded: {data.shape[0]} samples, {data.shape[1]} features")
        print(f"\nColumns: {list(data.columns)}")

        print("\nDataset Distribution:")
        print(f"Benign samples: {data['type'].value_counts().get('benign', 0)}")
        print(f"Malware samples: {data['type'].value_counts().get('malware', 0)}")

        # Plot class distribution
        plot_class_distribution(data['type'], 'Dynamic Analysis Class Distribution', 'dynamic_class_distribution.png')

        print("\nData Preprocessing:")
        missing_values = data.isna().sum()
        print(f"Missing values: {missing_values.sum()}")
        if missing_values.sum() > 0:
            print(f"Columns with missing values: {missing_values[missing_values > 0].to_dict()}")

        columns_to_drop = []
        for col in ['duracion', 'avg_local_pkt_rate', 'avg_remote_pkt_rate']:
            if col in data.columns:
                columns_to_drop.append(col)
        if 'tcp_urg_packet' in data.columns and data['tcp_urg_packet'].sum() <= 2:
            columns_to_drop.append('tcp_urg_packet')

        if columns_to_drop:
            data = data.drop(columns=columns_to_drop)
            print(f"Dropped columns: {columns_to_drop}")

        duplicates = data.duplicated().sum()
        if duplicates > 0:
            data = data.drop_duplicates()
            print(f"Removed {duplicates} duplicate rows")

        # Plot boxplots before outlier removal
        outlier_columns = ['tcp_packets', 'dist_port_tcp', 'external_ips', 'vulume_bytes', 'udp_packets', 'remote_app_packets']
        outlier_columns = [col for col in outlier_columns if col in data.columns]
        if outlier_columns:
            plot_boxplots(data, outlier_columns[:4], 'Dynamic Analysis: Before Outlier Removal', 'dynamic_before_outliers.png')

        data = remove_outliers(data)
        print(f"Final dataset shape: {data.shape}")

        # Plot boxplots after outlier removal
        if outlier_columns:
            plot_boxplots(data, outlier_columns[:4], 'Dynamic Analysis: After Outlier Removal', 'dynamic_after_outliers.png')

        feature_columns = [col for col in data.columns if col != 'type']
        X = data[feature_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

        # Plot correlation heatmap
        plot_correlation_heatmap(X, 'Dynamic Analysis: Feature Correlation Heatmap', 'dynamic_correlation_heatmap.png')

        y = data['type'].astype(str)

        scaler = RobustScaler()
        X_scaled = scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=feature_columns)

        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=0.25, random_state=45, stratify=y
        )

        print("\nData Split:")
        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Testing set: {X_test.shape[0]} samples")

        return X_train, X_test, y_train, y_test, data

    except Exception as e:
        print(f"Error in dynamic analysis: {str(e)}")
        return None, None, None, None, None

def remove_outliers(data):
    original_size = data.shape[0]
    outlier_conditions = [
        ('tcp_packets', 20000),
        ('dist_port_tcp', 1400),
        ('external_ips', 35),
        ('vulume_bytes', 2000000),
        ('udp_packets', 40),
        ('remote_app_packets', 15000)
    ]

    for column, threshold in outlier_conditions:
        if column in data.columns:
            before_count = data.shape[0]
            data = data[data[column] < threshold]
            removed = before_count - data.shape[0]
            if removed > 0:
                print(f"Removed {removed} outliers from {column} (threshold: {threshold})")

    if 'source_app_packets.1' in data.columns:
        data = data.drop('source_app_packets.1', axis=1)
        print("Removed duplicate column: source_app_packets.1")

    print(f"Outlier removal: {original_size} -> {data.shape[0]} samples ({original_size - data.shape[0]} removed)")
    return data

def train_static_models(X_train, X_test, y_train, y_test):
    print("\nTRAINING MODELS: STATIC ANALYSIS")
    print("-" * 50)

    # Find best k for KNN
    best_k = 3
    best_knn_score = 0
    for k in range(3, 16, 3):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        pred_knn = knn.predict(X_test)
        accuracy = accuracy_score(y_test, pred_knn)
        if accuracy > best_knn_score:
            best_knn_score = accuracy
            best_k = k

    models = {
        'Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=15),
        'SVM': SVC(kernel='rbf', random_state=42),
        f'KNN (k={best_k})': KNeighborsClassifier(n_neighbors=best_k)
    }

    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        results[name] = evaluate_model(y_test, predictions, name)

        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'feature': X_train.columns,
                'importance': model.feature_importances_
            })
            plot_feature_importance(feature_importance, 'Static Analysis: Top 10 Important Permissions', 'static_feature_importance.png')

    # Plot model comparison
    plot_model_comparison(results, 'accuracy', 'Static Analysis: Model Accuracy Comparison', 'static_model_accuracy.png')

    # Plot confusion matrix for best model
    best_model_name = max(results, key=lambda k: results[k]['accuracy'])
    best_model = models[best_model_name]
    best_predictions = best_model.predict(X_test)
    cm = confusion_matrix(y_test, best_predictions)
    plot_confusion_matrix(cm, y_test, f'Static Analysis: Best Model ({best_model_name})', 'static_best_cm.png')

    return results

def train_dynamic_models(X_train, X_test, y_train, y_test):
    print("\nTRAINING MODELS: DYNAMIC ANALYSIS")
    print("-" * 50)

    # Find best k for KNN
    best_k = 3
    best_knn_score = 0
    for k in range(3, 16, 3):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        pred_knn = knn.predict(X_test)
        accuracy = accuracy_score(y_test, pred_knn)
        if accuracy > best_knn_score:
            best_knn_score = accuracy
            best_k = k

    models = {
        'Naive Bayes': GaussianNB(),
        'Random Forest': RandomForestClassifier(n_estimators=250, max_depth=50, random_state=45),
        f'KNN (k={best_k})': KNeighborsClassifier(n_neighbors=best_k)
    }

    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        results[name] = evaluate_model(y_test, predictions, name, use_kappa=True)

        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'feature': X_train.columns,
                'importance': model.feature_importances_
            })
            plot_feature_importance(feature_importance, 'Dynamic Analysis: Top 10 Important Features', 'dynamic_feature_importance.png')

    # Plot model comparison
    plot_model_comparison(results, 'accuracy', 'Dynamic Analysis: Model Accuracy Comparison', 'dynamic_model_accuracy.png')

    # Plot confusion matrix for best model
    best_model_name = max(results, key=lambda k: results[k]['accuracy'])
    best_model = models[best_model_name]
    best_predictions = best_model.predict(X_test)
    cm = confusion_matrix(y_test, best_predictions)
    plot_confusion_matrix(cm, y_test, f'Dynamic Analysis: Best Model ({best_model_name})', 'dynamic_best_cm.png')

    return results

def evaluate_model(y_true, y_pred, model_name, use_kappa=False):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")

    kappa = None
    if use_kappa:
        kappa = cohen_kappa_score(y_true, y_pred)
        print(f"Cohen's Kappa: {kappa:.4f}")

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1, 'kappa': kappa}

def compare_results(static_results, dynamic_results):
    print("\nCOMPARATIVE ANALYSIS")
    print("=" * 60)

    if static_results:
        print("\nSTATIC ANALYSIS: PERMISSION-BASED")
        print("-" * 50)
        best_static = max(static_results.items(), key=lambda x: x[1]['accuracy'])
        print(f"Best Model: {best_static[0]}")
        print(f"Accuracy: {best_static[1]['accuracy']:.4f}")
        print("\nAll Models:")
        for model, metrics in static_results.items():
            print(f"{model:20}: Accuracy = {metrics['accuracy']:.4f}, "
                  f"Precision = {metrics['precision']:.4f}, "
                  f"Recall = {metrics['recall']:.4f}, "
                  f"F1-Score = {metrics['f1_score']:.4f}")

    if dynamic_results:
        print("\nDYNAMIC ANALYSIS: NETWORK TRAFFIC-BASED")
        print("-" * 50)
        best_dynamic = max(dynamic_results.items(), key=lambda x: x[1]['accuracy'])
        print(f"Best Model: {best_dynamic[0]}")
        print(f"Accuracy: {best_dynamic[1]['accuracy']:.4f}")
        print("\nAll Models:")
        for model, metrics in dynamic_results.items():
            print(f"{model:20}: Accuracy = {metrics['accuracy']:.4f}, "
                  f"Precision = {metrics['precision']:.4f}, "
                  f"Recall = {metrics['recall']:.4f}, "
                  f"F1-Score = {metrics['f1_score']:.4f}, "
                  f"Cohen's Kappa = {metrics['kappa']:.4f}")

    print("\nMODEL COMPARISON")
    print("-" * 50)
    if static_results and dynamic_results:
        print(f"Best Static Model: {best_static[0]} (Accuracy: {best_static[1]['accuracy']:.4f})")
        print(f"Best Dynamic Model: {best_dynamic[0]} (Accuracy: {best_dynamic[1]['accuracy']:.4f})")
        print("\nRecommendation:")
        if best_static[1]['accuracy'] > best_dynamic[1]['accuracy']:
            print("Static analysis performs better. Use permission-based features with "
                  f"{best_static[0]} for optimal malware detection.")
        else:
            print("Dynamic analysis performs better. Use network traffic-based features with "
                  f"{best_dynamic[0]} for optimal malware detection.")

def main():
    print("ANDROID MALWARE ANALYSIS PIPELINE")
    print("=" * 60)
    print(f"Started at: {datetime.now()}")

    try:
        static_data_path, dynamic_data_path = load_datasets()

        static_data = perform_static_analysis(static_data_path)
        static_results = None
        if static_data[0] is not None:
            X_train_static, X_test_static, y_train_static, y_test_static, df_static = static_data
            static_results = train_static_models(X_train_static, X_test_static,
                                              y_train_static, y_test_static)
        else:
            print("Static analysis failed - skipping static models")

        dynamic_data = perform_dynamic_analysis(dynamic_data_path)
        dynamic_results = None
        if dynamic_data[0] is not None:
            X_train_dynamic, X_test_dynamic, y_train_dynamic, y_test_dynamic, df_dynamic = dynamic_data
            dynamic_results = train_dynamic_models(X_train_dynamic, X_test_dynamic,
                                                y_train_dynamic, y_test_dynamic)
        else:
            print("Dynamic analysis failed - skipping dynamic models")

        compare_results(static_results, dynamic_results)

        print("\nANALYSIS SUMMARY")
        print("=" * 60)
        print("Completed successfully")
        print("Analysis types: Static (Permissions), Dynamic (Network Traffic)")
        print("Models evaluated: Naive Bayes, KNN, Decision Tree, Random Forest, SVM")
        print(f"Completed at: {datetime.now()}")

    except Exception as e:
        print(f"Error in execution: {str(e)}")
        print("Please verify dataset availability and Kaggle API credentials.")

if __name__ == "__main__":
    try:
        import matplotlib
        matplotlib.use('Agg')
    except:
        pass
    main()

ANDROID MALWARE ANALYSIS PIPELINE
Started at: 2025-06-20 16:41:37.082929
Downloading from https://www.kaggle.com/api/v1/datasets/download/xwolf12/datasetandroidpermissions?dataset_version_number=1...


100%|██████████| 9.03k/9.03k [00:00<00:00, 14.1MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/xwolf12/network-traffic-android-malware?dataset_version_number=1...


100%|██████████| 114k/114k [00:00<00:00, 54.8MB/s]

Extracting files...
STATIC ANALYSIS: ANDROID PERMISSIONS
--------------------------------------------------
Dataset loaded: 398 apps, 331 features






Dataset Distribution:
Benign apps (0): 199
Malware apps (1): 199
Balance ratio: 1.00 (Balanced)

Permission Analysis:

Top 10 permissions used by benign apps:
 1. android.permission.WRITE_EXTERNAL_STORAGE: 76 apps
 2. android.permission.ACCESS_NETWORK_STATE: 62 apps
 3. android.permission.WAKE_LOCK: 36 apps
 4. android.permission.RECEIVE_BOOT_COMPLETED: 30 apps
 5. android.permission.ACCESS_WIFI_STATE: 29 apps
 6. android.permission.READ_PHONE_STATE: 24 apps
 7. android.permission.VIBRATE: 21 apps
 8. android.permission.ACCESS_FINE_LOCATION: 18 apps
 9. android.permission.READ_EXTERNAL_STORAGE: 15 apps
10. android.permission.ACCESS_COARSE_LOCATION: 13 apps

Top 10 permissions used by malware apps:
 1. android.permission.INTERNET: 195 apps
 2. android.permission.READ_PHONE_STATE: 190 apps
 3. android.permission.ACCESS_NETWORK_STATE: 167 apps
 4. android.permission.WRITE_EXTERNAL_STORAGE: 136 apps
 5. android.permission.ACCESS_WIFI_STATE: 135 apps
 6. android.permission.READ_SMS: 124 ap