In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticates via kagglehub.login() before the competition download in the
# next cell, which may need access to private data sources.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'System-Threat-Forecaster' competition data and keep the value
# returned by kagglehub (presumably the local download location - confirm
# against the kagglehub documentation).
system_threat_forecaster_path = kagglehub.competition_download('System-Threat-Forecaster')

# Confirmation message once the download call has returned.
print('Data source import complete.')


# Importing Necessary Packages

In [None]:
# ================================
# 📌 Essential Libraries for ML Workflow
# ================================

# 📊 Data Manipulation & Numerical Computation
import pandas as pd  # Efficient data handling & processing
import numpy as np  # Numerical operations & array manipulation

# 📈 Data Visualization
import matplotlib.pyplot as plt  # Basic plotting functions
import seaborn as sns  # Advanced & aesthetic statistical visualizations

# 🔍 Model Selection & Evaluation
from sklearn.model_selection import (
    train_test_split,  # Splitting dataset into training & validation sets
    cross_val_score,  # Evaluating model performance via cross-validation
    StratifiedKFold  # Ensuring class balance in K-Fold cross-validation
)

# ⚙️ Data Preprocessing
from sklearn.impute import SimpleImputer  # Handling missing values
from sklearn.preprocessing import StandardScaler  # Standardizing numerical features
from sklearn.preprocessing import OneHotEncoder  # Encoding categorical variables

# 🚀 Feature Engineering & Pipelines
from sklearn.pipeline import Pipeline  # Creating streamlined ML workflows
from sklearn.decomposition import PCA  # Dimensionality reduction
from sklearn.feature_selection import SelectKBest, chi2  # Selecting best features
from sklearn.compose import ColumnTransformer  # Handling multiple transformations efficiently

# 🏆 Machine Learning Models
from sklearn.linear_model import (
    LogisticRegression,  # Logistic Regression for classification
    SGDClassifier  # Stochastic Gradient Descent (SGD) classifier
)
from sklearn.ensemble import RandomForestClassifier  # Random Forest ensemble model

# 🚀 Gradient Boosting Models (Boosted Trees)
from xgboost import XGBClassifier  # XGBoost: High-performance gradient boosting
from lightgbm import LGBMClassifier  # LightGBM: Fast & efficient boosting

# 🔧 Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV  # Fine-tuning models for best performance


# Data Loading

In [None]:
# Locate the competition files.
# On Kaggle they are mounted under /kaggle/input; in other environments that
# directory does not exist, so fall back to the location returned by
# kagglehub.competition_download() in the download cell above.
from pathlib import Path

data_dir = Path('/kaggle/input/System-Threat-Forecaster')
if not data_dir.is_dir():
    # Only evaluated off-Kaggle, so the download cell stays optional on Kaggle.
    data_dir = Path(system_threat_forecaster_path)

# Labelled training data and the unlabelled test data used for the submission.
train_data = pd.read_csv(data_dir / 'train.csv')
test_data = pd.read_csv(data_dir / 'test.csv')

In [None]:
# Lift pandas' display truncation: passing None as the limit makes pandas
# show every row and every column instead of eliding them.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)


In [None]:
# Preview the first rows of the training data (shown as the cell's output).
train_data.head()

In [None]:
# Print the (rows, columns) shape of the training data; the \033[1m ... \033[0m
# escape codes wrap the label so that it is rendered in bold.
print(f"\033[1mShape of the training data is:\033[0m {train_data.shape}")

# Data Exploration

In [None]:
# Examine data types
print("\033[1mData Types:\033[0m")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_data.info()

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
missing_values = train_data.isna().sum()
missing_percentage = missing_values / len(train_data) * 100

# Report only the columns that actually contain gaps, worst first.
has_missing = missing_percentage > 0
print("\033[1m\nMissing Values (Percentage):\033[0m")
print(missing_percentage[has_missing].sort_values(ascending=False))


In [None]:
# Number of distinct (non-missing) values in every column.
print("\033[1m\nUnique Value Counts:\033[0m")
unique_counts = train_data.nunique()
for col, n_unique in unique_counts.items():
    print(f"{col}: {n_unique}")

In [None]:
# Descriptive statistics (describe() defaults to the numerical columns).
numerical_summary = train_data.describe()
print("\033[1m\nSummary Statistics for Numerical Features:\033[0m")
print(numerical_summary)

In [None]:
# Descriptive statistics restricted to the object-typed (categorical) columns.
categorical_summary = train_data.describe(include=['object'])
print("\033[1m\nSummary Statistics for Categorical Features:\033[0m")
print(categorical_summary)

# Data visualization

## Numerical Features

In [None]:
import warnings
warnings.filterwarnings("ignore")
# warnings.filterwarnings("ignore", category=UserWarning)  # Ignores only UserWarnings


In [None]:
# Histogram (with KDE overlay) for every numerical column, five panels per row.
numerical_features = train_data.select_dtypes(include=['number']).columns
num_numerical = len(numerical_features)
rows = -(-num_numerical // 5)  # ceiling division: rows needed for 5 columns

fig = plt.figure(figsize=(20, rows * 4))
for i, col in enumerate(numerical_features):
    ax = fig.add_subplot(rows, 5, i + 1)
    sns.histplot(train_data[col], kde=True, ax=ax)
    ax.set_title(col)
    # Slant the x tick labels so long values do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# One standalone box plot per numerical column.
for col in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(y=train_data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numerical columns.
numeric_corr = train_data[numerical_features].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(numeric_corr, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Correlation of every numerical feature with the target column.
correlation_matrix = train_data.select_dtypes(include=['number']).corr()
correlations = correlation_matrix['target'].drop(labels='target')

# Features at the two extremes of the correlation range.
most_positive_feature = correlations.idxmax()
most_negative_feature = correlations.idxmin()
highest_correlation = correlations.max()
lowest_correlation = correlations.min()

print(f"📈 Most Positively Correlated Feature: {most_positive_feature} (Correlation: {highest_correlation:.4f})")
print(f"📉 Most Negatively Correlated Feature: {most_negative_feature} (Correlation: {lowest_correlation:.4f})")


## Categorical Features

In [None]:
# The header is wrapped in \033[1m (bold on) ... \033[0m (reset). The reset
# code matters: ending with another \033[1m would leave bold switched on for
# everything printed afterwards.
print("\033[1mDistribution of  Categorical variable:\033[0m")

# Column -> panel title, listed in row-major order of the 4x4 grid below.
categorical_panels = {
    'ProductName': "Product Name Distribution",
    'PlatformType': "Platform Type Distribution",
    'Processor': "Processor Distribution",
    'PowerPlatformRole': "Power Platform Role Distribution",
    'OSVersion': "OS Version Distribution",
    'OsPlatformSubRelease': "Os Platform Sub Release Distribution",
    'SKUEditionName': "SKU Edition Name Distribution",
    'OSArchitecture': "OS Architecture Distribution",
    'MDC2FormFactor': "MDC 2 Form Factor Distribution",
    'DeviceFamily': "Device Family Distribution",
    'PrimaryDiskType': "Primary Disk Type Distribution",
    'OSBranch': "OS Branch Distribution",
    'OSInstallType': "OS Install Type Distribution",
    'OSGenuineState': "OS Genuine State Distribution",
    'LicenseActivationChannel': "License Activation Channel Distribution",
    'FlightRing': "Flight Ring Distribution",
}

# Visualizing key categorical variables: one horizontal count plot per column,
# with the categories ordered from most to least frequent.
fig, axes = plt.subplots(4, 4, figsize=(12, 10))
for ax, (column, title) in zip(axes.flat, categorical_panels.items()):
    sns.countplot(
        y=train_data[column],
        order=train_data[column].value_counts().index,
        ax=ax,
        palette='viridis',
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Count plots for the remaining categorical variables on a 3x2 grid.
# Each entry is (column, panel title, palette), in row-major panel order.
panel_specs = [
    ('EngineVersion', "Engine Version Distribution", 'viridis'),
    ('ChassisType', "Chassis Type Distribution", 'plasma'),
    ('OSEdition', "OS Edition Distribution", 'coolwarm'),
    ('OSSkuFriendlyName', "OSSku Friendly Name Distribution", 'magma'),
    ('AutoUpdateOptionsName', "Auto Update Options Name Distribution", 'magma'),
    ('AppVersion', "App Version Distribution", 'magma'),
]

fig, axes = plt.subplots(3, 2, figsize=(17, 17))
for panel, (column, title, palette) in zip(axes.flat, panel_specs):
    # Categories are ordered from most to least frequent.
    frequency_order = train_data[column].value_counts().index
    sns.countplot(y=train_data[column], order=frequency_order, ax=panel, palette=palette)
    panel.set_title(title)

plt.tight_layout()
plt.show()


# Data Cleaning

In [None]:
# Keep a single copy of any fully duplicated row so that every training
# example is unique (repeated rows would otherwise be over-weighted).
train_data = train_data.drop_duplicates()

In [None]:
# Columns judged irrelevant or redundant; removing them reduces noise.
dropped_columns = [
    'IsBetaUser',
    'AutoSampleSubmissionEnabled',
    'IsFlightsDisabled',
    'GeoRegionID',
    'ChassisType',
    'MachineID',
    'ProcessorManufacturerID',
    'ProcessorModelID',
    'EnableLUA',
]
train_data_cleaned = train_data.drop(columns=dropped_columns)

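The columns above are dropped for the following reasons:

1. These columns provide no useful information because they have **constant values** (always 0):
    * IsBetaUser
    * AutoSampleSubmissionEnabled
    * IsFlightsDisabled
2. When features are highly correlated (multicollinearity), they can **confuse models** and increase training time, so only one feature of each correlated pair is kept:
    * GeoRegionID **(keep CountryID)**
    * ChassisType **(keep InternalBatteryNumberOfCharges)**

*Note: `MachineID`, `ProcessorManufacturerID`, `ProcessorModelID` and `EnableLUA` are also dropped in the cell above; the reason for removing them is not recorded here.*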
    


In [None]:
# Split the cleaned frame into the label vector and the feature matrix.
y = train_data_cleaned['target']
X = train_data_cleaned.drop(columns='target')

In [None]:
# Column names grouped by dtype; the two lists drive the ColumnTransformer.
numerical_features = list(X.select_dtypes(include=['float64', 'int64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Data Preprocessing

In [None]:
# Numerical columns: fill gaps with the column mean, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown='ignore' is passed for categories unseen during fit).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own pipeline; columns that belong to
# neither list are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_processing', categorical_pipeline, categorical_features),
        ('numerical_processing', numerical_pipeline, numerical_features),
    ],
    remainder='drop',
)

preprocessor

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

(X_train.shape, X_val.shape)

## Feature Scaling with PCA and Feature Selection

In [None]:
# Extended numerical branch: same imputation and scaling as
# numerical_pipeline, followed by a PCA projection onto 25 components.
numerical_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=25)),
])

# Extended categorical branch: same imputation and one-hot encoding as
# categorical_pipeline, followed by chi-squared selection of the 15 best
# encoded features.
categorical_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('select_best', SelectKBest(chi2, k=15)),
])

In [None]:
# Same column routing as `preprocessor`, but built from the extended
# pipelines (the ones that add PCA and SelectKBest).
Preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_processing', categorical_Pipeline, categorical_features),
        ('numerical_processing', numerical_Pipeline, numerical_features),
    ],
    remainder='drop',
)

Preprocessor

# Model Building

## LogisticRegression

In [None]:
# Logistic regression on top of the base preprocessing.
logistic_estimator = LogisticRegression(max_iter=600, random_state=0)
logistic_model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("logistic", logistic_estimator),
])

# Fit on the training split (the fitted pipeline is the cell's output).
logistic_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted logistic-regression pipeline on the validation split and
# report the result (labelled as accuracy) with four decimals.
logistic_accuracy=logistic_model_pipeline.score(X_val, y_val)
print(f"Logistic Regression Accuracy: {logistic_accuracy:.4f}")

### LogisticRegression with feature scaling pipeline

In [None]:
# Logistic regression on top of the extended preprocessing (`Preprocessor`).
extended_logistic_estimator = LogisticRegression(max_iter=600, random_state=0)
Logistic_model_pipeline = Pipeline([
    ("Preprocessor", Preprocessor),
    ("logistic", extended_logistic_estimator),
])

# Fit on the training split (the fitted pipeline is the cell's output).
Logistic_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the logistic-regression pipeline built on `Preprocessor` (the variant
# whose branches add PCA and SelectKBest) on the validation split.
Logistic_accuracy=Logistic_model_pipeline.score(X_val, y_val)
print(f"Logistic Regression Accuracy after feature scaling: {Logistic_accuracy:.4f}")

**Observation**
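* Switching to the extended preprocessing pipeline — which keeps the same imputation and scaling but adds PCA (25 components) on the numerical features and chi-squared selection of the 15 best one-hot encoded features — changed the *Logistic Regression model's* accuracy only slightly, from *0.6094* to *0.6115*. Both pipelines standardize the numerical features, so this small gain comes from the added dimensionality reduction and feature selection rather than from scaling itself. 🚀📊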

## SGDClassifier

In [None]:
# Define the pipeline
sgd_model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ('sgd_classifier', SGDClassifier(random_state=0))
])

# Train the model
sgd_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the untuned SGD pipeline on the validation split and report the
# result (labelled as accuracy) with four decimals.
sgd_accuracy = sgd_model_pipeline.score(X_val, y_val)
print(f"SGD Classifier Accuracy: {sgd_accuracy:.4f}")

### Hyperparameter Tuning

In [None]:
# Search space for the SGD classifier step of the pipeline.
# The 'constant' and 'adaptive' learning-rate schedules require a strictly
# positive initial learning rate (eta0). In scikit-learn versions where eta0
# defaults to 0.0, those candidates fail to fit and are scored as NaN, so only
# the 'optimal' schedule would really be compared. eta0 is therefore set
# explicitly; it is not used by the 'optimal' schedule.
param_grid = {
    'sgd_classifier__alpha': [0.0001, 0.001, 0.01, 0.1],
    'sgd_classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'sgd_classifier__learning_rate': ['constant', 'optimal', 'adaptive'],
    'sgd_classifier__eta0': [0.01],
}

# Exhaustive 3-fold cross-validated search on the training split.
grid_search = GridSearchCV(sgd_model_pipeline, param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Pipeline refitted with the best parameter combination (shown as the output).
best_pipeline = grid_search.best_estimator_
best_pipeline

In [None]:
# Score the best pipeline found by the grid search on the validation split.
SGD_accuracy = best_pipeline.score(X_val, y_val)
print(f"SGD Classifier Accuracy after hyperparameter tuning: {SGD_accuracy:.4f}")

**Observation**
* With hyperparameter tuning, the *SGD Classifier* saw a significant improvement, elevating its accuracy from *0.6050* to *0.6137*. This highlights the impact of fine-tuning in unlocking better model performance and precision. 🔥📈

## RandomForestClassifier

In [None]:
# Define the pipeline
random_model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=0))
])

# Train the model
random_model_pipeline.fit(X_train, y_train)


In [None]:
# Score the random-forest pipeline on the validation split. The value is
# printed with four decimals, like the other model scores in this notebook.
rf_accuracy = random_model_pipeline.score(X_val, y_val)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

## XGBClassifier

In [None]:
# Define the pipeline
xgb_model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(random_state=0))
])

# Train the model
xgb_model_pipeline.fit(X_train, y_train)


In [None]:
# Score the XGBoost pipeline on the validation split. The value is printed
# with four decimals, like the other model scores in this notebook.
xgb_accuracy = xgb_model_pipeline.score(X_val, y_val)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")

### XGBClassifier with feature scaling pipeline

In [None]:
# Define the pipeline
XGB_model_pipeline = Pipeline(steps=[
    ("Preprocessor", Preprocessor),
    ("classifier", XGBClassifier(random_state=0))
])

# Train the model
XGB_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the XGBoost pipeline built on `Preprocessor` on the validation split.
# The value is printed with four decimals, like the other model scores.
XGB_accuracy = XGB_model_pipeline.score(X_val, y_val)
print(f"XGBoost Accuracy after feature scaling: {XGB_accuracy:.4f}")

📊 **Observation on XGBoost Model Performance**
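* Baseline preprocessing (imputation + scaling + one-hot encoding): ✅ Accuracy = 62.32%
* Extended preprocessing (baseline + PCA + SelectKBest): ❌ Accuracy = 60.01%

🔍 **Key Insight:**
* The extended preprocessing resulted in a *2.31 percentage-point drop in accuracy* (62.32% → 60.01%). Both pipelines standardize the numerical features, so the drop is not caused by scaling itself: *XGBoost does not benefit from scaling* since it is a *tree-based model* that makes decisions based on *feature splits*, not numerical magnitudes, which makes scaling unnecessary. The only difference between the two runs is the added PCA (25 components) and the selection of the 15 best one-hot encoded features, so the loss is attributable to the information those two steps discard.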

## LGBMClassifier

In [None]:
# Define the pipeline
lgbm_model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LGBMClassifier(n_estimators=400, learning_rate=0.05, max_depth=30, random_state=0, n_jobs=-1))
])

# Train the model
lgbm_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the LightGBM pipeline on the validation split. The value is printed
# with four decimals, like the other model scores in this notebook.
lgbm_accuracy = lgbm_model_pipeline.score(X_val, y_val)
print(f"LightGBM Accuracy: {lgbm_accuracy:.4f}")

In [None]:
# y_pred=lgbm_model_pipeline.predict(test_data)

# submission=pd.DataFrame({"id":range(0,test_data.shape[0]),
#                          "target":y_pred
# })

# submission.to_csv('submission.csv',index=False)

### LGBMClassifier with feature scaling pipeline

In [None]:
# Define the pipeline
LGBM_model_pipeline = Pipeline(steps=[
    ("Preprocessor", Preprocessor),
    ("classifier", LGBMClassifier(random_state=0, n_jobs=-1))
])

# Train the model
LGBM_model_pipeline.fit(X_train, y_train)

In [None]:
# Score the LightGBM pipeline built on `Preprocessor` on the validation split.
# The value is printed with four decimals, like the other model scores.
LGBM_accuracy = LGBM_model_pipeline.score(X_val, y_val)
print(f"LightGBM Accuracy after feature scaling: {LGBM_accuracy:.4f}")

📊 **Observation on LightGBM Model Performance**
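* Baseline preprocessing (imputation + scaling + one-hot encoding): ✅ Accuracy = 63.25%
* Extended preprocessing (baseline + PCA + SelectKBest): ❌ Accuracy = 61.31%

🔍 **Key Insight:**
* The second run *reduced the model's accuracy* by approximately *2 percentage points*. Both pipelines standardize the numerical features, and *LightGBM does not require feature scaling*, so scaling is not what changed. The two runs differ in the added PCA (25 components) and the selection of the 15 best one-hot encoded features, which discard information the trees could otherwise split on. Note that the comparison is also confounded by the hyperparameters: the baseline model uses `n_estimators=400, learning_rate=0.05, max_depth=30`, whereas the second model uses LightGBM's defaults.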

# Comparing Model Performance

In [None]:
# Validation accuracy of each model family, in plotting order.
models = ["Logistic Regression", "SGD Classifier", "Random Forest", "XGBoost", "LightGBM"]
accuracies = [logistic_accuracy, SGD_accuracy, rf_accuracy, xgb_accuracy, lgbm_accuracy]

# Create the figure, switch seaborn to the white-grid style, then add the axes.
fig = plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
ax = fig.add_subplot()

# One bar per model.
sns.barplot(x=models, y=accuracies, palette="viridis", ax=ax)

# Write each accuracy just above its bar.
for index, value in enumerate(accuracies):
    ax.text(index, value + 0.005, f"{value:.4f}", ha='center', fontsize=12, fontweight='bold')

# Zoom the y-axis onto the observed range so small differences stay visible.
ax.set_ylim(min(accuracies) - 0.02, max(accuracies) + 0.02)
ax.set_xlabel("Models", fontsize=14, fontweight="bold")
ax.set_ylabel("Accuracy", fontsize=14, fontweight="bold")
ax.set_title("Model Performance Comparison", fontsize=16, fontweight="bold", color='darkblue')

# Tilt the model names slightly for readability.
plt.setp(ax.get_xticklabels(), rotation=15)

plt.show()


**Observations:**
* *LightGBM* achieved the highest accuracy (0.6326), making it the best-performing model in this comparison.
* *XGBoost* followed closely (0.6232), showing strong performance among ensemble methods.
* *Random Forest* performed moderately (0.6191), slightly lower than XGBoost.
* *Logistic Regression* (0.6094) and *SGD Classifier* (0.6137) had the lowest accuracy, indicating that linear models may not be the best fit for this dataset.
  
**Key Takeaways:**
* *Ensemble models (LightGBM & XGBoost) outperformed traditional classifiers*, highlighting their effectiveness in handling complex patterns.
* *Linear models (Logistic Regression & SGD) underperformed*, suggesting that feature relationships may be non-linear.
* *Further improvements could involve hyperparameter tuning*, feature engineering, or alternative preprocessing methods.

# Final Submission

In [None]:
# Define the pipeline
final_model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LGBMClassifier(n_estimators=400, learning_rate=0.05, max_depth=30, random_state=0, n_jobs=-1))
])

# Train the model
final_model_pipeline.fit(X_train, y_train)

In [None]:

# Predict the target for every test row with the final pipeline.
y_pred = final_model_pipeline.predict(test_data)

# Submission file: a running id (0 .. n-1) paired with the predicted target.
submission = pd.DataFrame({
    "id": range(len(test_data)),
    "target": y_pred,
})
submission.to_csv('submission.csv', index=False)