In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd #data preprocessing,CSV files I/O(e.g pd.read_csv)
import numpy as np#linear algebra
import matplotlib.pyplot as plt# data visualisation
import seaborn as sns#statistical data visualisation

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test splits from the Kaggle input directory.
train_path = '/kaggle/input/System-Threat-Forecaster/train.csv'
test_path = '/kaggle/input/System-Threat-Forecaster/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

**GETTING BASIC INFORMATION ABOUT DATASET**

In [None]:
# (rows, columns) of the training frame.
train_df.shape

The training dataset consists of 100000 rows and 76 columns.

In [None]:
# Preview the first five rows of the training frame.
train_df.head(5)

**EXPLORATORY DATA ANALYSIS**

In [None]:
# Column labels of the training frame.
train_df.columns

> The training dataset contains 31 float features, 17 integer features and 28 object features.

In [None]:
# Count how many columns there are of each dtype (float / integer / object).
# This replaces a second, identical `train_df.head(5)` preview that added no new information.
train_df.dtypes.value_counts()

In [None]:
# Total number of missing cells across every column of the training frame.
missing_values = train_df.isna().sum().sum()
missing_values

> There are a total of 6533 missing values in the training dataset.

**SUMMARY STATISTICS**

In [None]:
# Summary statistics for the training frame (pandas' default describe()).
train_df.describe()

In [None]:
# Sub-frame holding only the int64 / float64 columns of the training data.
numerical = train_df.select_dtypes(
    include=['int64', 'float64'],
)
numerical


> There are a total of 48 numerical features.
> 1. IsBetaUser can be dropped as it is "0" everywhere.
> 2. RealTimeProtectionState can also be ignored as it is highly skewed and lacks variability, so it is not recommended for analysis.
> 3. IsPassiveModeEnabled can be ignored because its distribution is highly skewed and uninformative, with most values concentrated at 0.
> 4. NumAntivirusProductsEnabled and HasTpm can be ignored as the majority of the data is concentrated at 1.
> 5. IsTouchEnabled, IsAlwaysOnAlwaysConnectedCapable, and IsPenCapable can be ignored as they are highly skewed: the majority of rows contain '0'.
> 6. PlatformType can be ignored as it lacks variability and is highly skewed.
> 7. AutoSamplesubmission can be dropped as it is '0' everywhere.
> 8. SMode can be dropped as it is highly skewed; similarly EnableLUA, FirewallEnabled, ProcessorManufacturerID, HasOpticalDiskDrive, IsPortableOS, and ISflightEnabled can be ignored.


In [None]:
# Descriptive statistics (count / unique / top / freq) for the object-typed columns.
categorical = train_df.describe(
    include=['object'],
)
categorical

> 1. MachineID has 99,835 unique values, which is not helpful for modeling.
> 2. ProductName is dominated by win8defender (99.7%), hence it can be ignored.
> 3. PlatformType is dominated by 'windows10' and hence can be ignored for model building as the distribution is skewed.
> 4. Processor is dominated by 'x64' and has outliers, so it can be ignored for model building as the distribution is skewed.
> 5. OSVersion is dominated by '10.0.0.0', so it can also be ignored for model building as the distribution is skewed.
> 6. OSGenuineState is dominated by IS_GENUINE, so it can be ignored for model building as the distribution is skewed.
> 7. FlightRing can also be ignored as it lacks variability and is skewed.
> 8. DeviceFamily can also be dropped: it has 2 unique values, but the frequency of one of them is very high. For the same reason OSArchitecture can also be dropped.


In [None]:
# Pie chart of how many rows fall into each target class.
target_counts = train_df['target'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    target_counts,
    labels=target_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['orange', 'lightgreen'],
)
ax.set_title('Distribution of Target Classes')
plt.show()

> This pie chart indicates that the target variable is evenly distributed across its classes.

In [None]:
# Column labels of the training frame.
train_df.columns

In [None]:

# Parse DateAS in both splits (values that fail to parse become NaT) and
# store the month number in a new `month_as` column.
for frame in (train_df, test_df):
    frame['DateAS'] = pd.to_datetime(frame['DateAS'], errors='coerce')
    frame['month_as'] = frame['DateAS'].dt.month

# Only the last expression of a cell is rendered, so show the test-side column.
test_df['month_as']

In [None]:
# Column labels after adding `month_as`.
train_df.columns

**Dropping the redundant columns**

In [None]:
# Columns judged uninformative in the summary-statistics review
# (near-constant, heavily skewed, or a per-row identifier).
# "PlatformType" used to appear twice in this list; each label is now listed once.
columns_to_drop = [
    "IsBetaUser", "RealTimeProtectionState", "IsPassiveModeEnabled",
    "NumAntivirusProductsEnabled", "HasTpm", "IsTouchEnabled",
    "IsAlwaysOnAlwaysConnectedCapable", "IsPenCapable", "PlatformType",
    "SMode", "EnableLUA", "FirewallEnabled",
    "ProcessorManufacturerID", "HasOpticalDiskDrive", "IsPortableOS",
    "MachineID", "ProductName",
    "Processor", "OSVersion", "OSGenuineState", "FlightRing",
    "DeviceFamily", "OSArchitecture",
]

# `columns=` already selects the axis, so no separate `axis=1` is needed.
train_df_cleaned = train_df.drop(columns=columns_to_drop)
test_df_cleaned = test_df.drop(columns=columns_to_drop)

# Shapes before and after, labelled per split so the four lines can be told apart.
print("Original train shape:", train_df.shape)
print("Cleaned train shape:", train_df_cleaned.shape)
print("Original test shape:", test_df.shape)
print("Cleaned test shape:", test_df_cleaned.shape)

# Persist the cleaned training data to the working directory.
train_df_cleaned.to_csv("cleaned_train.csv", index=False)

# NOTE(review): rebinding train_df/test_df makes this cell non-idempotent;
# running it a second time raises KeyError because the columns are already gone.
train_df = train_df_cleaned
test_df = test_df_cleaned

# Last expression of the cell, so the preview is actually rendered.
train_df.head()



**Calculating missing values percentage and plotting missing values via matplotlib library**

In [None]:
# Total missing cells that remain after the uninformative columns were dropped.
missing_values_train = train_df.isnull().sum().sum()
# Display the value just computed (previously the stale pre-drop `missing_values` was shown).
missing_values_train


In [None]:

# Percentage of missing values per column; keep only columns that have any,
# ordered from most to least affected.
missing_values = train_df.isnull().sum() / len(train_df) * 100
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

# Print AFTER computing the per-column Series. Previously the print ran first and
# therefore operated on the scalar total left over from an earlier cell.
print(missing_values)

# Bar chart of the missing-value percentage for each affected column.
plt.figure(figsize=(12, 6))
sns.barplot(x=missing_values.index, y=missing_values.values, palette="coolwarm")
plt.xticks(rotation=90)
plt.ylabel("Missing Value Percentage")
plt.title("Missing Values in Training Data")
plt.show()


In [None]:
# Build the frame used for the correlation heatmap by leaving out these four columns.
drop_col = [
    'CityID',
    'IsGamer',
    'RegionIdentifier',
    'InternalBatteryNumberOfCharges',
]
train_df_hm = train_df.drop(columns=drop_col)
train_df_hm.shape

In [None]:
# Pairwise correlations between the float64 / int64 columns, drawn as a heatmap.
corr_matrix = train_df_hm.select_dtypes(include=['float64', 'int64']).corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
plt.show()

> From the heatmap it can be said that:
> 1. OSBuildNumber and OSBuildNumberOnly are highly correlated, hence one of them can be dropped.
> 2. PrimaryDisplayResolutionHorizontal and PrimaryDisplayResolutionVertical show the same property, hence one of them can be dropped.
> 3. PrimaryDiskCapacityMB and SystemVolumeCapacityMB are highly correlated, so one of them can be dropped.

In [None]:
# One column from each highly correlated pair identified on the heatmap.
col_to_drop = [
    "OSBuildNumberOnly",                   # correlated OS build feature
    "PrimaryDisplayResolutionHorizontal",  # correlated display-resolution feature
    "PrimaryDiskCapacityMB",               # correlated disk-capacity feature
]

# `columns=` already selects the axis, so no separate `axis=1` is needed.
df_cleaned = train_df.drop(columns=col_to_drop)
df_cleaned_test = test_df.drop(columns=col_to_drop)

# Shapes before and after, labelled per split so the four lines can be told apart.
print("Original train shape:", train_df.shape)
print("Cleaned train shape:", df_cleaned.shape)
print("Original test shape:", test_df.shape)
print("Cleaned test shape:", df_cleaned_test.shape)

# NOTE(review): rebinding train_df/test_df makes this cell non-idempotent;
# running it a second time raises KeyError because the columns are already gone.
train_df = df_cleaned
test_df = df_cleaned_test

# Last expression of the cell, so the remaining columns are actually rendered.
train_df.columns



Date Time

**Univariate Analysis**

In [None]:
# Features to show, one count plot per panel.
columns = ['NumAntivirusProductsInstalled', 'IsSystemProtected', 'OSInstallLanguageID',
           'OSBuildNumber', 'IsSecureBootEnabled', 'IsGamer']

# 2 x 3 grid, flattened so panels can be paired with features in order.
fig, axes = plt.subplots(2, 3, figsize=(15, 12))
axes = axes.flatten()

for ax, col in zip(axes, columns):
    sns.countplot(data=train_df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    # Re-apply the existing tick labels with a 45-degree rotation for readability.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Categorical features to show, one count plot per panel.
columns_cat = ['EngineVersion', 'AppVersion', 'SignatureVersion',
               'OsPlatformSubRelease', 'OSBuildLab', 'SKUEditionName']

# 2 x 3 grid, flattened so panels can be paired with features in order.
fig, axes = plt.subplots(2, 3, figsize=(15, 12))
axes = axes.flatten()

for ax, col in zip(axes, columns_cat):
    sns.countplot(data=train_df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    # Re-apply the existing tick labels with a 45-degree rotation for readability.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

**Multivariate analysis**

In [None]:
# Count of rows per IsGamer value, split by target class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='IsGamer', hue='target', ax=ax)
ax.set_title('Target Vs IsGamer Distribution')
ax.set_xlabel('IsGamer')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
ax.legend(title='Target')
plt.tight_layout()
plt.show()

In [None]:
# Count of rows per OSBuildLab value, split by target class.
plt.figure(figsize=(6,4))
sns.countplot(data=train_df,x='OSBuildLab',hue='target')
plt.title('Target Vs OSBuildLab Distribution')
# The x-axis shows OSBuildLab; the label previously said 'IsGamer' (copy-paste slip).
plt.xlabel('OSBuildLab')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Target')
plt.tight_layout()
plt.show()

**Feature Extraction**

In [None]:
# Derive month-number features from the DateAS and DateOS columns of both splits.
# Values that cannot be parsed as dates become NaT (errors='coerce').
date_to_month = (('DateAS', 'month_as'), ('DateOS', 'month_os'))

for frame in (train_df, test_df):
    for date_col, month_col in date_to_month:
        frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')
        frame[month_col] = frame[date_col].dt.month

# Only the last expression of a cell is rendered.
test_df['month_os']


In [None]:
# The raw date columns are no longer needed once month_as / month_os exist;
# remove them from both the training and the test frame.
drop = ['DateAS', 'DateOS']

train_df = train_df.drop(columns=drop)
test_df = test_df.drop(columns=drop)

print(train_df.shape)
print(test_df.shape)


**Splitting the data into train and validation set**

In [None]:
# Separate the target column (labels) from every other column (features).
labels = train_df['target']
features = train_df.drop(columns=['target'])


In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


**Splitting X into numerical and categorical features**

In [None]:
# Column labels routed to the categorical pipeline (object dtype).
categorical = features.select_dtypes(include=['object']).columns

# Column labels routed to the numerical pipeline.
# 'number' matches every integer and float width. The previous ['int', 'float64']
# matched only the platform-default integer, so a numeric column of another width
# would be silently left out of the ColumnTransformer.
# NOTE(review): `.dt.month` may come back as int32 on recent pandas - confirm that
# month_as / month_os were being selected before this change.
numerical = features.select_dtypes(include='number').columns

*Finding the number of unique values in each categorical feature*

In [None]:
# Print each categorical column followed by its number of distinct values.
for col, n_unique in features[categorical].nunique().items():
    print(col, n_unique)

**Preprocessing the data**

In [None]:
# Categorical preprocessing: fill gaps with the most frequent category, then one-hot encode.
# `sparse_output=False` replaces the `sparse=False` keyword, which was deprecated in
# scikit-learn 1.2 and raises TypeError once removed. A dense array is still returned,
# which the later pd.DataFrame(...) wrapping relies on.
cat_pipe=Pipeline(steps=[
    ('imputation',SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros instead of erroring.
    ('encoding',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])
cat_pipe

> 

> This categorical pipeline (cat_pipe) first fills missing values with the most frequent category using SimpleImputer, then converts categorical variables into one-hot encoded vectors using OneHotEncoder. The handle_unknown='ignore' ensures unseen categories in test data don’t cause errors. 🚀

In [None]:
# Numerical preprocessing: mean-impute missing values, then standardize.
num_pipe = Pipeline(
    steps=[
        ('imputation', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler()),
    ]
)
num_pipe

> This numerical pipeline (num_pipe) first fills missing values with the mean using SimpleImputer, then standardizes numerical features to have zero mean and unit variance using StandardScaler. This helps improve model performance by ensuring features are on the same scale. 

In [None]:
# Route categorical columns through cat_pipe and numerical columns through num_pipe.
transformer = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, categorical),
        ('num', num_pipe, numerical),
    ]
)
transformer

>This ColumnTransformer (transformer) applies cat_pipe to categorical features and num_pipe to numerical features, ensuring separate preprocessing for each type. It efficiently transforms the dataset by handling categorical encoding and numerical scaling in a single step.  

In [None]:
# Fit the preprocessing on the training split only ...
X_train_processed = transformer.fit_transform(X_train)

# ... then reuse the fitted transformer on the validation and test data.
X_val_processed = transformer.transform(X_val)
test_processed = transformer.transform(test_df)


> These transformations ensure consistent preprocessing across datasets, with fit_transform(X_train) learning imputation, encoding, and scaling, while transform(X_val) and transform(test_df) apply the same transformations without data leakage. This keeps the model's input features standardized. 

In [None]:
# Wrap the processed arrays in DataFrames (default integer row and column labels).
# NOTE(review): this rebinds X_train / X_val to the processed data, so re-running the
# transform cell afterwards would feed already-processed frames to the transformer.
X_train = pd.DataFrame(X_train_processed)
X_val = pd.DataFrame(X_val_processed)
test = pd.DataFrame(test_processed)

> These lines convert the transformed NumPy arrays back into pandas DataFrames, making it easier to analyze and use them in modeling.

**Model Building**

**Logistic Regression**

In [None]:
# Fit a logistic-regression baseline and report its accuracy on the validation split.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_val)
score = accuracy_score(y_val, y_pred)
score

> This code trains a Logistic Regression model on the processed training data and evaluates it on the validation set using accuracy score.

**XGBOOST Classifier**

In [None]:
from xgboost import XGBClassifier

# Fit XGBoost with its default settings and report validation accuracy.
model_xgb = XGBClassifier().fit(X_train, y_train)

y_pred = model_xgb.predict(X_val)
score = accuracy_score(y_val, y_pred)
score



**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest and report validation accuracy.
# random_state is fixed (same seed as the split and the logistic regression) so the
# score is reproducible across runs; an unseeded forest gives a different score each time.
model_rfc= RandomForestClassifier(random_state=42)
model_rfc.fit(X_train,y_train)
y_pred=model_rfc.predict(X_val)
score=accuracy_score(y_val,y_pred)
score

In [None]:
# from sklearn.svm import SVC
# svm_model=SVC(kernel='rbf',C=1.0,gamma='scale')
# svm_model.fit(X_train,y_train)
# y_pred=svm_model.predict(X_val)
# score=accuracy_score(y_val,y_pred)
# score



In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# model_knn = KNeighborsClassifier(n_neighbors=5)
# model_knn.fit(X_train,y_train)
# y_pred=model_knn.predict(X_val)
# score=accuracy_score(y_val,y_pred)
# score

**Feature Selection**

In [None]:
# Class name is `SelectKBest` (capital B); `f_classif` must be imported as well.
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 100 features with the highest ANOVA F-score against the target.
# The selector is fitted on the training split only and then reused as-is.
selector=SelectKBest(score_func=f_classif,k=100)
X_train_selectkbest=selector.fit_transform(X_train,y_train)
X_val_selectkbest=selector.transform(X_val)
test_selectkbest=selector.transform(test)  # was `slector` (NameError)

# Report the resulting shapes rather than dumping the arrays themselves.
print(X_train_selectkbest.shape)
print(X_val_selectkbest.shape)
print(test_selectkbest.shape)


In [None]:
from sklearn.feature_selection import SelectFromModel

# Model-based feature selection: keep the features whose importance under a logistic
# regression is at least the mean importance (threshold='mean').
# - SelectFromModel needs an estimator *instance*, not a string.
# - The seed belongs to the estimator, not to SelectFromModel.
# - NOTE(review): the original also passed n_estimator=100, which is not a
#   LogisticRegression parameter and was dropped; if a tree ensemble was intended,
#   swap the estimator for e.g. RandomForestClassifier(n_estimators=100, random_state=42).
selector_sfm=SelectFromModel(
    estimator=LogisticRegression(random_state=42),
    threshold='mean',
)

# The selector must be fitted before it can transform: fit on train, reuse elsewhere.
X_train_SFM=selector_sfm.fit_transform(X_train,y_train)
X_val_SFM=selector_sfm.transform(X_val)
test_SFM=selector_sfm.transform(test)  # was `slector_sfm` (NameError)