In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Required Imports

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Lift the pandas display limits so wide frames and long Series print in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Exploratory Data Analysis


In [None]:
#data loading
# Read the competition train and test tables into DataFrames.
train, test = map(pd.read_csv, (
    '/kaggle/input/System-Threat-Forecaster/train.csv',
    '/kaggle/input/System-Threat-Forecaster/test.csv',
))

In [None]:
# Preview the first five rows (the default for head()).
train.head()

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts.
train.info()

In [None]:
# Number of distinct values in each column of the training frame.
train.nunique()

In [None]:
#Finding binary and single value columns 
# Count distinct values once, then keep the columns with at most two of them.
unique_counts = train.nunique()
low_cardinality = unique_counts[unique_counts <= 2]
print(low_cardinality)

**Observations**

Columns with a single value, i.e. **[IsFlightsDisabled, AutoSampleSubmissionEnabled, IsBetaUser]**, and **[MachineID]**, which has no use as a categorical feature, can be dropped during preprocessing to reduce the number of columns.

In [None]:
# Descriptive statistics for the training frame.
train.describe()

In [None]:
#finding target distribution
# value_counts() orders classes by frequency, which need not match the order in
# which the bars are drawn. Sort the counts by class label and pass that same
# order to countplot so each tick label always sits under its own bar.
plt.figure(figsize=(6, 4))
dist = train['target'].value_counts().sort_index()
sns.countplot(x='target', data=train, order=dist.index, palette=['blue', 'red'])
plt.xticks(ticks=range(len(dist)), labels=dist.index)
plt.title("Target Distribution")
plt.show()

In [None]:
# Row count per target class.
train['target'].value_counts()

**Observations** 

The distribution of the target variable is more or less **balanced** therefore bias correction will not be required

In [None]:
#If Firewall Enabled has any effect on the target
# Bar counts per FirewallEnabled value, split by target class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train, x='FirewallEnabled', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("Firewall Enabled vs. System Threats")
ax.set_xlabel("Firewall Enabled (0: No, 1: Yes)")
ax.set_ylabel("Count")
ax.legend(["Secure", "Compromised"])
plt.show()

In [None]:
# Target counts within each FirewallEnabled group, pivoted so classes become columns.
firewall_target_counts = train.groupby('FirewallEnabled')['target'].value_counts()
firewall_target_counts.unstack(fill_value=0)

**Observation** 

There is not much difference in the distribution of the target variable with respect to FirewallEnabled.

In [None]:
#If RealTimeProtectionState has any effect on the target
# Bar counts per RealTimeProtectionState value, split by target class.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train, x='RealTimeProtectionState', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("Real-Time Protection vs. System Threats")
ax.set_xlabel("Real-Time Protection State")
ax.set_ylabel("Count")
ax.legend(["Secure", "Compromised"])
plt.show()

**Observation** 
* Real-Time Protection State 0 has provided more security than state 7 in percentage terms; however, more machines have protection state 7, where the occurrence of system compromises is relatively higher.
* Very few machines use states 1.0, 3.0, 5.0 and 8.0.

In [None]:
#PowerPlatformRole's effect on target distribution
# Bar counts per PowerPlatformRole value, split by target class.
fig, ax = plt.subplots(figsize=(15, 4))
sns.countplot(data=train, x='PowerPlatformRole', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("PowerPlatformRole vs. System Threats")
ax.set_xlabel("PowerPlatformRole")
ax.set_ylabel("Count")
ax.legend(["Secure", "Compromised"])
plt.show()

**Observations**

* Mobile devices have the highest counts, both in secure and compromised categories, with compromised devices slightly outnumbering secure ones. This suggests mobile platforms are more vulnerable to threats.
* Desktop systems occur relatively less often but lean towards compromised.
* Slate shows skewness towards Secure systems
* Categories such as UNKNOWN, AppliancePC, and EnterpriseServer etc have negligible data points, which might not contribute significantly to model training.

In [None]:
#OSVersion's effect on target distribution
# Horizontal bar counts per OSVersion value, split by target class.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=train, y='OSVersion', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("OS Version vs. System Threats")
ax.set_xlabel("Count")
ax.set_ylabel("OS Version")
ax.legend(["Secure", "Compromised"])
plt.show()

**Observations**

* The majority of systems, both secure and compromised, use OS version **10.0.0.0**. Other versions **(6.3.0.0, 6.1.1.0, etc.)** have significantly fewer samples.

* The proportion of secure vs. compromised systems across OS versions remains consistent, indicating no strong relationship between OS version and security status.

In [None]:
#OSInstallType's effect on target distribution
# Bar counts per OSInstallType value, split by target class.
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=train, x='OSInstallType', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("OS Install Type vs. System Threats")
ax.set_xlabel("OS Install Type")
ax.set_ylabel("Count")
ax.legend(["Secure", "Compromised"])
plt.show()

**Observation**

* Systems installed using UUPUpgrade and IBSClean have higher counts of compromised systems compared to secure ones.

* Other install types like Update, Upgrade, and Reset show more 'secure' distribution

* Install types such as Refresh, CleanPCRefresh, and Clean have relatively low counts overall.

* OS install type appears to be a strong feature for predicting system security, as certain types are associated with higher compromise rates. 

In [None]:
#ProcessorCoreCount's effect on target distribution
# One box per target class showing the spread of ProcessorCoreCount.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=train, x='target', y='ProcessorCoreCount', palette=['blue', 'red'], ax=ax)
ax.set_title("Processor Core Count vs. System Threats")
ax.set_xlabel("System Threat (0: Secure, 1: Compromised)")
ax.set_ylabel("Processor Core Count")
plt.show()

In [None]:
# Per-class summary of ProcessorCoreCount: median, mean, quartiles and IQR.
core_counts_by_target = train.groupby('target')['ProcessorCoreCount']

proc = pd.DataFrame({
    'Median': core_counts_by_target.median(),
    'Mean': core_counts_by_target.mean(),
    'Q1': core_counts_by_target.quantile(0.25),
    'Q3': core_counts_by_target.quantile(0.75),
})

# Interquartile range = spread of the middle half of each class.
proc['IQR'] = proc['Q3'] - proc['Q1']

summary_columns = ['Median', 'Mean', 'IQR']
print(proc[summary_columns])

**Observations**

* Median core count is identical for both secure **(4)** and compromised **(4)** systems.
* Outliers with very high core counts (e.g., >50 cores) appear more frequently in compromised systems.
* The narrower IQR for compromised systems indicates that these tend to have more consistent hardware configurations, which could be leveraged in model training.

In [None]:
#IsGamer's effect on target distribution
# Bar counts per IsGamer value, split by target class.
fig, ax = plt.subplots(figsize=(15, 4))
sns.countplot(data=train, x='IsGamer', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("IsGamer vs. System Threats")
ax.set_xlabel("IsGamer")
ax.set_ylabel("Count")
ax.legend(["Secure", "Compromised"])
plt.show()

**Observations**

* Systems where IsGamer = 0 have higher counts overall, with a leaning towards secure state

* Systems where IsGamer = 1 (gaming systems) show fewer total counts but a higher proportion of compromised states relative to secure ones.

* This might be attributed to the fact that gamers tend to install more software and games, which are sometimes pirated and pose threats to system security.



In [None]:
#RegionIdentifier's effect on target distribution
# Bar counts per RegionIdentifier value, split by target class.
fig, ax = plt.subplots(figsize=(15, 4))
sns.countplot(data=train, x='RegionIdentifier', hue='target', palette=['blue', 'red'], ax=ax)
ax.set_title("RegionIdentifier vs. System Threats")
ax.set_xlabel("RegionIdentifier")
ax.set_ylabel("Count")
ax.legend(["Secure", "Compromised"])
plt.show()

**Observations**

* Regions like 1, 3, 10, 11 and 15 have high counts
* Regions 1 and 10 show high instances of compromised systems compared to secure ones.
* Other regions (e.g., 4, 5, 6, etc.) have lower counts but maintain a balanced distribution between secure and compromised states with slight skewness towards secure instances.
* Region-specific trends suggest geographical factors or regional configurations might influence system security.

In [None]:
#Processor Type Distribution
# Share of each Processor value; the counts are computed once and reused for
# both the wedge sizes and their labels.
processor_counts = train['Processor'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    processor_counts,
    labels=processor_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['skyblue', 'lightcoral', 'lightgreen'],
)
ax.set_title('Processor Type Distribution')
plt.show()

**Observations**

* The majority of systems (**91.3%**) use x64 processors, while **8.7%** of machines have an x86 processor.
* There is a negligible number of machines with arm64 processors.

# Data Preparation 

In [None]:
#data loading
# Re-read both tables so the preparation steps below start from the raw files.
raw_paths = (
    '/kaggle/input/System-Threat-Forecaster/train.csv',
    '/kaggle/input/System-Threat-Forecaster/test.csv',
)
train, test = (pd.read_csv(path) for path in raw_paths)

In [None]:
#duplicate removal
# Drops fully duplicated rows from the training frame in place; the test frame
# is left untouched.
train.drop_duplicates(inplace=True)

In [None]:
#dropping single value columns and MachineID(irrelevant)
# The same columns are removed, in place, from both frames so they stay aligned.
uninformative_cols = ['MachineID', 'IsBetaUser', 'AutoSampleSubmissionEnabled', 'IsFlightsDisabled']
for frame in (train, test):
    frame.drop(columns=uninformative_cols, inplace=True)

In [None]:
#Creating OS_Update_Days from DateOS and DateAS
# For each frame: parse both date columns, store their difference in whole days
# (DateOS minus DateAS) as OS_Update_Days, then remove the two source columns.
for frame in (train, test):
    os_date = pd.to_datetime(frame['DateOS'])
    as_date = pd.to_datetime(frame['DateAS'])
    frame['OS_Update_Days'] = (os_date - as_date).dt.days
    frame.drop(columns=['DateOS', 'DateAS'], inplace=True)

In [None]:
#Preprocessing Pipeline
# Separate the label from the predictors.
y = train['target']
x = train.drop(columns=['target'])

# Columns are routed by dtype: int64/float64 go to the numeric branch, object
# columns to the categorical branch; anything else is passed through unchanged.
numerical_cols = x.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = x.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the column mean.
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode with max_categories=5; categories unseen at fit time are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=5)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# NOTE(review): the preprocessor is fitted on every training row here, before
# the hold-out split is taken, so the imputation statistics also see the rows
# later used for evaluation.
X_processed = pipeline.fit_transform(x)
print("Processed data shape:", X_processed.shape)

In [None]:
#Test Train Split
# Hold out 20% of the processed rows for evaluation; the fixed seed keeps the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training 

# Logistic Regression

In [None]:
# Disabled grid search for logistic regression, kept for reference only.
# NOTE(review): the fit call originally referenced X_train_pca, which is not
# defined in the visible notebook; it is written as x_train here to match the
# other grid-search cells - confirm against the local version.
#param_grid_lr = {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
#grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5, scoring='roc_auc')
#grid_lr.fit(x_train, y_train)
#lr = grid_lr.best_estimator_

#Not added in kaggle code due to excessive train time (tested on local system)

In [None]:
# Logistic regression with fixed hyper-parameters, fitted on the training split.
lr = LogisticRegression(
    solver='liblinear',
    C=0.1,
    max_iter=1000,
    random_state=42,
)
lr.fit(x_train, y_train)

In [None]:
# Hold-out predictions and per-class report for logistic regression.
y_pred1 = lr.predict(x_test)
lr_report = classification_report(y_test, y_pred1)
print(lr_report)

# RandomForestClassifier

In [None]:
#param_grid_rf = {'n_estimators': [100, 200, 300, 500], 'max_depth': [15, 20, 25], 'min_samples_split': [10, 15, 20]}
#grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='roc_auc')
#grid_rf.fit(x_train, y_train)
#rfc = grid_rf.best_estimator_

#Not added in kaggle code due to excessive train time (tested on local system)

In [None]:
# Random forest with fixed hyper-parameters, fitted on the training split.
rfc = RandomForestClassifier(
    n_estimators=500,
    max_depth=25,
    min_samples_split=20,
    random_state=42,
)
rfc.fit(x_train, y_train)

In [None]:
# Hold-out predictions and per-class report for the random forest.
y_pred2 = rfc.predict(x_test)
rfc_report = classification_report(y_test, y_pred2)
print(rfc_report)

# AdaBoostClassifier

In [None]:
#param_grid_ab = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}
#grid_ab = GridSearchCV(AdaBoostClassifier(), param_grid_ab, cv=5, scoring='roc_auc')
#grid_ab.fit(x_train, y_train)
#abc = grid_ab.best_estimator_

#Not added in kaggle code due to excessive train time (tested on local system)

In [None]:
# AdaBoost with fixed hyper-parameters, fitted on the training split.
abc = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=1,
    random_state=42,
)
abc.fit(x_train, y_train)

In [None]:
# Hold-out predictions and per-class report for AdaBoost.
y_pred3 = abc.predict(x_test)
abc_report = classification_report(y_test, y_pred3)
print(abc_report)

# Comparison

In [None]:
#Performance Metrics
# Grouped bar chart comparing the three classifiers on the hold-out split.
# Precision, recall and F1 are weighted averages over both classes.

models = ['Logistic Regression', 'Random Forest', 'AdaBoost']
y_preds = [y_pred1, y_pred2, y_pred3]

metrics = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

for preds in y_preds:
    metrics['Accuracy'].append(accuracy_score(y_test, preds))
    metrics['Precision'].append(precision_score(y_test, preds, average='weighted'))
    metrics['Recall'].append(recall_score(y_test, preds, average='weighted'))
    metrics['F1 Score'].append(f1_score(y_test, preds, average='weighted'))

bar_width = 0.2
# Deliberately not named `x`: that name holds the feature frame built in the
# preprocessing cell and must not be overwritten by plotting code.
bar_positions = np.arange(len(models))
plt.figure(figsize=(10, 6))

for i, (metric_name, values) in enumerate(metrics.items()):
    # Each metric is shifted right by one bar width within every model group.
    bars = plt.bar(bar_positions + i * bar_width, values, width=bar_width, label=metric_name)
    for bar in bars:
        # Print the score just above the bar, centred horizontally.
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                 f"{bar.get_height():.2f}", ha='center', fontsize=10, fontweight='bold')

# Centre each model label under its group of bars, whatever the metric count.
group_centre_offset = (len(metrics) - 1) / 2 * bar_width
plt.xticks(bar_positions + group_centre_offset, models)
plt.ylabel("Score")
plt.ylim(0, 1)
plt.title("Model Comparison")
plt.legend()
plt.show()
    

In [None]:
#Confusion Matrix
# One heatmap per model, laid out side by side in a single row.
plt.figure(figsize=(14, 4))
for panel, (model_name, predictions) in enumerate(zip(models, y_preds), start=1):
    plt.subplot(1, 3, panel)
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt="d", cmap="Blues")
    plt.title(f"{model_name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")

plt.show()

In [None]:
def _report_frame(model_name, predictions):
    """Return the classification report for one model as a labelled DataFrame."""
    report_dict = classification_report(y_test, predictions, output_dict=True)
    frame = pd.DataFrame(report_dict).transpose()
    frame['Model'] = model_name
    frame.loc['accuracy', 'precision'] = accuracy_score(y_test, predictions)
    return frame


# Stack the per-model reports into a single table and print it.
results = [_report_frame(name, preds) for name, preds in zip(models, y_preds)]
final_report = pd.concat(results)
print(final_report)

**Observations**

* Both RandomForestClassifier and AdaBoost outperform LogisticRegression in terms of precision.
* LogisticRegression has lower precision overall **(0.53)**, which reflects a less confident classifier.
* AdaBoost performs better in recall for class 1 **(0.68)**, showing it can detect class 1 instances more effectively than RandomForestClassifier **(0.65)**.
* RandomForestClassifier has a higher recall for class 0 **(0.58)** than the other 2 models **(0.545 and 0.55, respectively)**, which helps improve the detection of class 0 instances.
* RandomForestClassifier and AdaBoost both show better F1-scores **(0.62 and 0.61, respectively)** compared to LR **(0.53)**.
* RandomForestClassifier has a more balanced F1-score across both classes **(0.60 and 0.63)**, while AdaBoost has a slightly higher F1-score for class 1 **(0.64)** but a marginally lower one for class 0 **(0.59)**.
* RandomForestClassifier achieves the highest accuracy **(0.62)**, outperforming both AdaBoost **(0.61)** and LogisticRegression **(0.53)**.


In [None]:
#ROC Curve
# Plot one ROC curve per fitted model, using the predicted probability of
# class 1 on the hold-out split, with the AUC shown in the legend.
model=[lr, rfc, abc]
plt.figure(figsize=(8, 6))
for label, estimator in zip(models, model):
    positive_scores = estimator.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"{label} (AUC: {roc_auc:.2f})")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()

**Observations**
* RandomForestClassifier has the highest ROC-AUC score **(0.67)**, demonstrating its ability to separate the two classes more effectively.
* AdaBoost follows closely with a ROC-AUC score of **0.66**, which also indicates strong class separation but slightly less confidence compared to RandomForestClassifier.
* LogisticRegression has the lowest ROC-AUC **(0.54)**, suggesting it has poor discriminatory power between the two classes.


# Result
Random Forest Classifier stands out as the best model among the three based on the following criteria:

* **Higher Accuracy**: RandomForestClassifier achieves the highest accuracy of 0.62, which is a significant improvement over LogisticRegression's 0.53 and slightly better than AdaBoost's 0.61.
* **Better ROC-AUC**: RandomForestClassifier's ROC-AUC score of 0.67 indicates superior class separation, showing it can make more confident predictions.
* **Balanced Performance**: RandomForestClassifier performs well across precision, recall, and F1-score for both classes, with class 1 benefiting from higher recall (0.65) and class 0 being better detected than by the other 2 models.

# Submission.csv

In [None]:
# Run the test frame through the already-fitted preprocessing pipeline and
# predict with the random forest.
test_processed = pipeline.transform(test)
y_pred = rfc.predict(test_processed)

# The submission id is the row index of the test frame.
output_df = pd.DataFrame({'id': test.index}).assign(target=y_pred)

output_df.to_csv('/kaggle/working/submission.csv', index=False)