# Machine Learning Algorithms

#### Part 1: supervised learning

Import your data and perform a basic data exploration phase

    Display general information about the dataset

    Create a pandas profiling report to gain insights into the dataset

    Handle Missing and corrupted values

    Remove duplicates, if they exist

    Handle outliers, if they exist

    Encode categorical features

Prepare your dataset for the modelling phase

Apply Decision tree, and plot its ROC curve

Try to improve your model performance by changing the model hyperparameters

### Part 2: unsupervised learning

Drop out the target variable

Apply K-means clustering and plot the clusters

Find the optimal K parameter

Interpret the results

In [1]:
# Import Libraries and Load Data

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_curve, auc, classification_report

from sklearn.preprocessing import LabelEncoder

from pandas_profiling import ProfileReport


# Load the dataset from the local CSV export.
# NOTE: the path is machine-specific; adjust it when running elsewhere.
url = r"C:\Users\User\Desktop\gomycode\Machine Learning\Microsoft_malware_dataset_min.csv"

data = pd.read_csv(url)


In [2]:
# Show the distinct values stored in the Census_IsVirtualDevice column
data['Census_IsVirtualDevice'].unique()

array([ 0., nan,  1.])

In [3]:
# Display general information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())

# Create a pandas profiling report and save it as a standalone HTML file
profile = ProfileReport(data, title="Pandas Profiling Report", explorative=True)
profile.to_file("microsoft_malware_report.html")



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   HasDetections               100000 non-null  int64  
 1   Wdft_IsGamer                96585 non-null   float64
 2   Census_IsVirtualDevice      99815 non-null   float64
 3   Census_OSEdition            100000 non-null  object 
 4   Census_HasOpticalDiskDrive  100000 non-null  int64  
 5   Firewall                    98924 non-null   float64
 6   SMode                       94068 non-null   float64
 7   IsProtected                 99609 non-null   float64
 8   OsPlatformSubRelease        100000 non-null  object 
 9   CountryIdentifier           100000 non-null  int64  
dtypes: float64(5), int64(3), object(2)
memory usage: 7.6+ MB
None
       HasDetections  Wdft_IsGamer  Census_IsVirtualDevice  \
count  100000.000000  96585.000000            99815.000000   
mea

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Drop fully duplicated rows in place, keeping the first occurrence of each
data.drop_duplicates(subset=None, keep='first', inplace=True)

### Handle Outliers

In [10]:
# NOTE(review): no plotting call is visible before this cell, so this
# presumably writes an empty figure to my_plot.png — confirm whether the cell
# is still needed (a later cell draws a boxplot and saves to the same file).
plt.savefig('my_plot.png')  # Save the plot as a PNG file

In [12]:
# First and third quartiles of the Census_IsVirtualDevice column
virtual_device = data['Census_IsVirtualDevice']
Q1, Q3 = (virtual_device.quantile(q) for q in (0.25, 0.75))

In [14]:
import pandas as pd

import matplotlib.pyplot as plt


# Inspect the distribution of Census_IsVirtualDevice for outliers.
# 'data' is the DataFrame loaded earlier in the notebook.
column = 'Census_IsVirtualDevice'

# The column contains missing values. Drop them before plotting: matplotlib's
# boxplot statistics become NaN (an empty box) when NaNs are present.
values = data[column].dropna()

# Quartiles and inter-quartile range of the non-missing values
Q1 = values.quantile(0.25)
Q3 = values.quantile(0.75)
IQR = Q3 - Q1

# Draw the boxplot
plt.figure()
plt.boxplot(values)
plt.title('Boxplot of Census_IsVirtualDevice')
plt.ylabel('Values')
plt.grid()

# Save the figure to disk so the cell also works in a non-interactive environment
plt.savefig('my_plot.png')
# plt.show()  # Uncomment this if you're in an interactive environment

### Encode Categorical Features

In [None]:
# Label-encode every categorical (object-dtype) column, e.g. Census_OSEdition
# and OsPlatformSubRelease. A fresh LabelEncoder is fitted per column and kept
# in `label_encoders`, so the integer codes can be mapped back to the original
# categories later with inverse_transform.
label_encoders = {}
for column in data.select_dtypes(include='object').columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

### Prepare Dataset for Modeling

In [None]:
# Select features and target variable.
# HasDetections is used as the label: data.info() shows it has no missing
# values, which the classifier needs for y.
# NOTE(review): assumed to be the binary malware label of this dataset — confirm.
target_column = 'HasDetections'
X = data.drop(columns=target_column)
y = data[target_column]

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Several feature columns contain missing values. Fill them with each column's
# most frequent value, learned from the training split only so that nothing
# from the test set leaks into training.
fill_values = X_train.mode().iloc[0]
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

#### Apply Decision Tree and Plot ROC Curve

In [None]:
# Train a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Hard class predictions on the test set, summarised per class
y_pred = dt_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC curve from the predicted probability of the second class in
# dt_classifier.classes_ (column index 1 of predict_proba)
y_score = dt_classifier.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal for reference
plt.figure()
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()



### Improve Model Performance by Changing Hyperparameters

In [None]:
# Hyperparameter tuning: exhaustive grid search with 5-fold cross-validation
from sklearn.model_selection import GridSearchCV

# Candidate values for each decision-tree hyperparameter
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

base_tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=base_tree, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the best combination found by the search
print("Best parameters found: ", grid_search.best_params_)

#### Drop the Target Variable

* undefined