In [None]:

import os
import pandas as pd

from scipy.sparse import hstack, csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import numpy as np
import matplotlib.pyplot as plt

## Data Validation

* Explore the first 10,000 rows of the dataset to determine the data preparation strategy.

In [None]:
# Raw string literal: in a normal string "\U" starts a unicode escape, so the
# Windows path would make this cell fail with a SyntaxError before running.
train_data = pd.read_csv(r"c:\Users\Abhinav sinha\OneDrive\Desktop\OSData_Train .csv", nrows=10000)  # read a few rows to start

In [None]:
# Distinct values of the 'Category' column in the loaded sample
train_data['Category'].unique()

In [None]:
# Distinct values of the 'EvidenceRole' column in the loaded sample
train_data['EvidenceRole'].unique()

In [None]:
# Distinct values of the 'CountryCode' column in the loaded sample
train_data['CountryCode'].unique()

In [None]:
# Preview the leading rows, restricted to the first 15 columns
train_data.head().iloc[:, :15]

In [None]:
# Preview the leading rows of columns 15-29, with columns 0 and 9 kept alongside for reference
train_data.head().iloc[:, [0, 9, *range(15, 30)]]

In [None]:
# Preview the leading rows of columns 30-44, with columns 0 and 9 kept alongside for reference
train_data.head().iloc[:, [0, 9, *range(30, 45)]]

In [None]:
# Count summary of Incident Grade classes (the target variable used for modelling below)
train_data['IncidentGrade'].value_counts()

In [None]:
# Share of each Incident Grade class, expressed as a percentage of all sampled rows.
# NOTE(review): the denominator counts every row, including any with a missing
# grade, so the percentages may not add up to 100 — confirm this is intended.
grade_counts = train_data['IncidentGrade'].value_counts()
grade_counts * 100 / len(train_data)

In [None]:
# Number of missing values per column
train_data.isna().sum()

## Data Preparation

In [None]:
# def prepare_data():
    

In [None]:
# Descriptive categorical attributes that are label encoded
le_cat_columns = [
    'Category', 'EntityType', 'EvidenceRole', 'SuspicionLevel', 'LastVerdict',
    'ResourceType', 'Roles', 'AntispamDirection', 'ThreatFamily', 'CountryCode',
    'OSFamily', 'OSVersion', 'State', 'City', 'RegistryValueName', 'RegistryValueData',
    'ResourceIdName', 'RegistryKey', 'OAuthApplicationId', 'ApplicationId', 'ApplicationName',
]

# Identifier-like columns: they are appended to the label-encoded group
# rather than being treated as numerical features
le_cat_columns.extend([
    'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId',
    'AccountName', 'DeviceName', 'NetworkMessageId', 'EmailClusterId', 'FileName', 'FolderPath',
])

# No column is currently handled as numerical or one-hot encoded
numerical_columns = []
ohe_cat_columns = []

In [None]:
# Number of distinct values per label-encoded column, highest cardinality first
train_data[le_cat_columns].nunique().sort_values(ascending=False)

In [None]:
# Number of distinct values per numerical column, highest cardinality first.
# numerical_columns is reset to an empty list in the column-definition cell,
# so this currently selects no columns.
train_data[numerical_columns].nunique().sort_values(ascending=False)

In [None]:
# Print the distinct values of every column that has fewer than 10 of them
unique_counts = train_data.nunique()
for col in unique_counts.index[unique_counts < 10]:
    print(col, train_data[col].unique())

## Data Exploration

In [None]:
def preprocess_data(df, le_cat_columns):
    """
    Normalise column dtypes ahead of feature encoding.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Timestamp' column and every column named in
        ``le_cat_columns``.
    le_cat_columns : list of str
        Columns that will later be label encoded.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with 'Timestamp' parsed to datetime values and the
        listed columns cast to object dtype.
    """
    # Parse the timestamp strings into datetime values
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # Cast each column destined for label encoding to object dtype, so that
    # numeric-looking columns are handled as categories as well
    for column_name in le_cat_columns:
        df[column_name] = df[column_name].astype(object)

    return df

In [None]:
# Apply the dtype normalisation to the exploration sample (preprocess_data modifies the frame it is given)
train_data = preprocess_data(train_data, le_cat_columns)

In [None]:
# Distinct-value counts for each feature group, printed one group after another
for group in (le_cat_columns, ohe_cat_columns, numerical_columns):
    print(train_data[group].nunique())

### ANOVA F-Statistic

**Goal**: To assess the statistical significance of a feature in predicting the target variable.

After one-hot encoding each categorical feature, ANOVA (Analysis of Variance) is used to measure the significance of each resulting indicator column.

**Method:**

ANOVA compares the means of different groups and determines if the differences between those means are statistically significant.
The larger the F-statistic, the more significant the feature is as a predictor.

**Findings:**

* **Country Code**, **State** and **City**  are the most significant predictors.
* **Resource Type**, **RegistryValueName**, **RegistryValueData**, **Roles** do not appear to have strong significance as predictors.

In short, each categorical feature is first encoded as numerical values, and ANOVA then measures how significant that feature is for the target variable.

In [None]:
from sklearn.feature_selection import f_classif

cat_columns = ohe_cat_columns + le_cat_columns

# Rows without a grade are left out, matching process_data(), which drops
# them before training; otherwise the missing value would be encoded as an
# extra target class and take part in the test.
anova_data = train_data.dropna(subset=['IncidentGrade'])

# The encoded target is identical for every feature, so it is built once
# instead of being refit inside the loop.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(anova_data['IncidentGrade'])

for cat in cat_columns:
    # One-hot encode the feature. The sparse result keeps memory bounded for
    # high-cardinality columns; f_classif accepts sparse input.
    onehot_encoder = OneHotEncoder(sparse_output=True)
    X_encoded = onehot_encoder.fit_transform(anova_data[[cat]])  # double brackets pass a 2D frame

    # ANOVA F-statistic and p-value, one entry per one-hot indicator column
    f_statistic, p_value = f_classif(X_encoded, y)

    print("*" * 20)
    print(f"Feature: {cat}")
    print(f"ANOVA F-Statistic: {f_statistic}")
    print(f"p-Value: {p_value}")

In [None]:
# Convert 'Timestamp' column to datetime
# NOTE(review): preprocess_data() already performs this conversion, so this
# looks redundant when the cells are run in order — confirm before removing.
train_data['Timestamp'] = pd.to_datetime(train_data['Timestamp'])

# Column-by-column summary of the sample after preprocessing
train_data.info()

## Data Preprocessing

In [None]:
def process_data(
    train_path=r"c:\Users\Abhinav sinha\OneDrive\Desktop\OSData_Train .csv",
    test_path=r"C:\Users\Abhinav sinha\OneDrive\Desktop\OSData_Test.csv",
):
    """
    Load the full train and test CSV files and turn them into model inputs.

    Steps: read both files, drop rows without an 'IncidentGrade', normalise
    dtypes with preprocess_data(), drop duplicate training rows, encode the
    feature groups (one-hot, label encoded, numerical) and label encode the
    target.

    The feature groups are read from the notebook-level lists
    ``ohe_cat_columns``, ``le_cat_columns`` and ``numerical_columns``.

    Parameters
    ----------
    train_path, test_path : str or path-like, optional
        Locations of the CSV files. The defaults are raw string literals:
        written as normal strings, the "\\U" in these Windows paths is an
        invalid unicode escape and the cell does not even compile.

    Returns
    -------
    X_train, y_train, X_test, y_test
        ``X_*`` are CSR sparse matrices whose columns follow the order
        one-hot, label encoded, numerical; ``y_*`` are integer-encoded
        target arrays.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    print(train_data.shape)

    # Drop rows with missing target variable 'IncidentGrade'. The test set is
    # filtered as well: the target encoder is fitted on the training labels
    # only, so an unlabeled test row could not be transformed or scored.
    train_data.dropna(subset=['IncidentGrade'], inplace=True)
    test_data.dropna(subset=['IncidentGrade'], inplace=True)

    train_data = preprocess_data(train_data, le_cat_columns)
    test_data = preprocess_data(test_data, le_cat_columns)

    group_columns = ohe_cat_columns + numerical_columns + le_cat_columns

    # Drop duplicates based on the specified columns
    train_data = train_data.drop_duplicates(subset=group_columns)

    # Drop usage column as it is not present in train dataset
    # (errors='ignore' keeps this working for a test file that lacks it)
    test_data.drop(columns=['Usage'], inplace=True, errors='ignore')

    print(train_data.shape)
    print(test_data.shape)

    n_train, n_test = len(train_data), len(test_data)

    # One hot encoding; categories unseen during fit are ignored at transform
    # time. With no one-hot columns configured the encoder is skipped.
    if ohe_cat_columns:
        ohe = OneHotEncoder(handle_unknown='ignore')
        ohe.fit(train_data[ohe_cat_columns])
        train_data_ohe = csr_matrix(ohe.transform(train_data[ohe_cat_columns]))
        test_data_ohe = csr_matrix(ohe.transform(test_data[ohe_cat_columns]))
    else:
        train_data_ohe = csr_matrix((n_train, 0))
        test_data_ohe = csr_matrix((n_test, 0))

    # Fill NaNs for numerical columns
    train_data_numerical = csr_matrix(train_data[numerical_columns].fillna(-1).values)
    test_data_numerical = csr_matrix(test_data[numerical_columns].fillna(-1).values)

    # Feature label encoding: one integer code column per feature
    feature_le = LabelEncoder()
    train_codes = np.empty((n_train, len(le_cat_columns)), dtype=np.int64)
    test_codes = np.empty((n_test, len(le_cat_columns)), dtype=np.int64)

    for position, le_col in enumerate(le_cat_columns):
        # we want to stack train and test for label encoding of some cat
        # variables, so that every value seen in either set has a code
        feature_le.fit(pd.concat([train_data[le_col], test_data[le_col]]))
        train_codes[:, position] = feature_le.transform(train_data[le_col])
        test_codes[:, position] = feature_le.transform(test_data[le_col])

    train_data_le = csr_matrix(train_codes)
    test_data_le = csr_matrix(test_codes)

    train_blocks = [train_data_ohe, train_data_le, train_data_numerical]
    test_blocks = [test_data_ohe, test_data_le, test_data_numerical]

    # Feature groups that are empty contribute no columns; they are left out
    # so that only real blocks are stacked.
    used = [block.shape[1] > 0 for block in train_blocks]

    # format='csr': hstack returns COO by default, which cannot be row-indexed
    X_train = hstack([b for b, keep in zip(train_blocks, used) if keep], format='csr')
    X_test = hstack([b for b, keep in zip(test_blocks, used) if keep], format='csr')

    # Target label encoding, fitted on the training labels only
    target_le = LabelEncoder()
    target_le.fit(train_data['IncidentGrade'])
    y_train = target_le.transform(train_data['IncidentGrade'])
    y_test = target_le.transform(test_data['IncidentGrade'])

    # Print out the label classes of the target variable
    #   0: 'BenignPositive'
    #   1: 'FalsePositive'
    #   2: 'TruePositive'
    print(f"Target Classes: {target_le.classes_}")

    return X_train, y_train, X_test, y_test


# get the data
X_train, y_train, X_test, y_test = process_data()

## Modeling and Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def predict(model, X_test, y_test,
            display_labels=('BenignPositive', 'FalsePositive', 'TruePositive')):
    """
    Predict on the test set, print an evaluation report and plot the confusion matrix.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    X_test : feature matrix accepted by ``model.predict``
    y_test : integer-encoded true labels
    display_labels : sequence of str, optional
        Class names for the confusion matrix axes, ordered by encoded class
        id (0, 1, 2, ...). Defaults to the three incident grades.

    Returns
    -------
    The predictions returned by ``model.predict(X_test)``.
    """
    # Generate predictions. The original also computed class probabilities
    # here but never used them, which cost an extra pass over the test set.
    y_pred = model.predict(X_test)

    # Print accuracy
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Print confusion matrix
    print("\nConfusion Matrix:")

    # Fixing the label set keeps the matrix the same size as display_labels
    # even when a class is absent from both y_test and y_pred.
    class_ids = list(range(len(display_labels)))
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                        display_labels=list(display_labels))

    cm_display.plot()
    plt.show()

    return y_pred

### Random Forest Classifier

In [None]:
def train_random_forest_classifier(X_train, y_train):
    """
    Fit a random forest classifier and plot its feature importances.

    Parameters
    ----------
    X_train : training feature matrix
    y_train : integer-encoded training labels

    Returns
    -------
    RandomForestClassifier
        The fitted model (100 trees, max depth 5, fixed random state).
    """
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)

    model.fit(X_train, y_train)

    # Feature importance
    importances = model.feature_importances_
    n_features = X_train.shape[1]

    feature_columns = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)
    if len(feature_columns) != n_features:
        # One-hot encoding turns a single column into several features, so the
        # column lists no longer line up with the matrix; use generic names
        # instead of failing with an IndexError on the tick labels.
        feature_columns = np.array([f"feature_{i}" for i in range(n_features)])

    # Plot feature importance, most important first
    order = np.argsort(importances)[::-1]
    positions = range(n_features)

    plt.figure(figsize=(12, 6))
    plt.title("Feature Importances (Random Forest Classifier)")
    plt.bar(positions, importances[order], align="center")
    plt.xticks(positions, feature_columns[order], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

    return model

In [None]:
# train a random forest classifier model (also plots its feature importances)
rfc_model = train_random_forest_classifier(X_train, y_train)

In [None]:
# Predict with the random forest; predict() prints its own report and plots the confusion matrix
y_pred = predict(rfc_model, X_test, y_test)

# Test-set metrics; macro averaging gives every class the same weight
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Macro-Precision', precision),
    ('Macro-Recall', recall),
    ('Macro-F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

def train_xgboost_classifier(X_train, y_train):
    """
    Fit an XGBoost classifier and plot its feature importances.

    Parameters
    ----------
    X_train : training feature matrix
    y_train : integer-encoded training labels

    Returns
    -------
    XGBClassifier
        The fitted model (100 estimators, max depth 5, fixed random state).
    """
    # NOTE(review): use_label_encoder is presumably only meaningful on older
    # xgboost releases — confirm against the installed version before removing.
    model = XGBClassifier(n_estimators=100, max_depth=5, random_state=0, use_label_encoder=False, eval_metric='mlogloss')

    model.fit(X_train, y_train)

    # Feature importance
    importances = model.feature_importances_
    n_features = X_train.shape[1]

    feature_columns = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)
    if len(feature_columns) != n_features:
        # One-hot encoding turns a single column into several features, so the
        # column lists no longer line up with the matrix; use generic names
        # instead of failing with an IndexError on the tick labels.
        feature_columns = np.array([f"feature_{i}" for i in range(n_features)])

    # Plot feature importance, most important first
    order = np.argsort(importances)[::-1]
    positions = range(n_features)

    plt.figure(figsize=(12, 6))
    plt.title("Feature Importances (XGBoost Classifier)")
    plt.bar(positions, importances[order], align="center")
    plt.xticks(positions, feature_columns[order], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

    return model

In [None]:
# train an XGBoost classifier model
xgb_model = train_xgboost_classifier(X_train, y_train)

In [None]:
# Predict with the XGBoost model; predict() prints its own report and plots the confusion matrix
y_pred = predict(xgb_model, X_test, y_test)

# Test-set metrics; macro averaging gives every class the same weight
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Macro-Precision', precision),
    ('Macro-Recall', recall),
    ('Macro-F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

### CatBoost

In [None]:
from catboost import CatBoostClassifier

def train_catboost_classifier(X_train, y_train):
    """
    Fit a CatBoost classifier and plot its feature importances.

    Parameters
    ----------
    X_train : training feature matrix
    y_train : integer-encoded training labels

    Returns
    -------
    CatBoostClassifier
        The fitted model (100 iterations, depth 5, fixed random seed,
        training log silenced).
    """
    model = CatBoostClassifier(iterations=100, depth=5, random_seed=0, verbose=0)

    model.fit(X_train, y_train)

    # Feature importance
    importances = model.get_feature_importance()
    n_features = X_train.shape[1]

    feature_columns = np.array(ohe_cat_columns + le_cat_columns + numerical_columns)
    if len(feature_columns) != n_features:
        # One-hot encoding turns a single column into several features, so the
        # column lists no longer line up with the matrix; use generic names
        # instead of failing with an IndexError on the tick labels.
        feature_columns = np.array([f"feature_{i}" for i in range(n_features)])

    # Plot feature importance, most important first
    order = np.argsort(importances)[::-1]
    positions = range(n_features)

    plt.figure(figsize=(12, 6))
    plt.title("Feature Importances (CatBoost Classifier)")
    plt.bar(positions, importances[order], align="center")
    plt.xticks(positions, feature_columns[order], rotation=90)
    plt.xlim([-1, n_features])
    plt.show()

    return model

In [None]:
# train a CatBoost classifier model
cat_model = train_catboost_classifier(X_train, y_train)

In [None]:
# Predict with the CatBoost model; predict() prints its own report and plots the confusion matrix
y_pred = predict(cat_model, X_test, y_test)

# Test-set metrics; macro averaging gives every class the same weight
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Macro-Precision', precision),
    ('Macro-Recall', recall),
    ('Macro-F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')