# Cyber Security Project

 **Objective:** Train algorithms to automate employee protection.

## Malicious URL Detection

In [None]:
#@title Packages
!pip install -qq itables

In [None]:
#@title Libraries
from urllib.parse import urlsplit
import warnings  # was `import warn`, which is not a standard-library module

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from itables import init_notebook_mode  # installed by the Packages cell
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# google.colab is only importable inside Colab; elsewhere the DataTable
# setting below is skipped instead of aborting the whole cell.
try:
    from google.colab.data_table import DataTable
except ImportError:
    DataTable = None

#@title Settings

if DataTable is not None:
    DataTable.max_columns = 32
# Render every DataFrame output as an interactive itables table.
init_notebook_mode(all_interactive=True)
sns.set_theme()

In [None]:
#@title Read DataFrame
import os

# Path of the labelled URL dataset. Set the MALICIOUS_URLS_CSV environment
# variable to point at the file on another machine; the fallback is the
# original author's local path.
DATA_PATH = os.environ.get(
    "MALICIOUS_URLS_CSV",
    r"C:\Users\abhil\Downloads\malicious_phish.csv\malicious_phish.csv",
)

df = pd.read_csv(DATA_PATH)
df.head(50)

In [None]:
#@title General Info
# Show pandas' default summary statistics for `df`.
df.describe()

In [None]:
# Print pandas' concise summary of `df` (columns, dtypes, non-null counts).
df.info()

In [None]:
#@title Distribution
# Pie chart of how many rows carry each value of the `type` column.
label_counts = df['type'].value_counts()

fig, ax = plt.subplots()
label_counts.plot.pie(ax=ax)
ax.set_title('Labels')
ax.set_ylabel('')  # clear the y-axis label
plt.show()

# Preprocessing

In [None]:
#@title Extract Features

def _path_extension(url):
    """Return the text after the last '.' in the URL's path, or 'None' if the path has no dot."""
    path = urlsplit(url).path  # parse once per URL
    return path.rsplit('.', 1)[-1] if '.' in path else 'None'


df['ext_type'] = df['url'].apply(_path_extension)

# Keep only the 8 most frequent extension values; every other value is folded
# into the 'None' bucket. Despite its name, `values_to_replace` holds the
# values that are KEPT.
value_counts = df['ext_type'].value_counts()
values_to_replace = value_counts.nlargest(8).index

df.loc[~df['ext_type'].isin(values_to_replace), 'ext_type'] = 'None'

# Literal substring flags. regex=False makes the literal intent explicit and
# removes the invalid '\?' escape sequence the regex form needed.
df['has_http'] = df['url'].str.contains('http://', regex=False).astype(int)

df['has_org'] = df['url'].str.contains('org', regex=False).astype(int)

df['has_query'] = df['url'].str.contains('?', regex=False).astype(int)

# Count the '/' characters left once every 'http://' occurrence is removed.
df['count_routes'] = df['url'].str.replace('http://', '', regex=False).str.count('/')

# Cap the count at 4.
df['count_routes'] = df['count_routes'].where(df['count_routes'] <= 4, 4)

In [None]:
# Preview the first rows of every column except the first one (column 0 is skipped by the slice).
df.iloc[:,1:].head()

In [None]:
# Correlation heatmap over the one-hot encoded frame (all columns but the first).
encoded = pd.get_dummies(df.iloc[:, 1:])
corr_matrix = encoded.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Integer class id -> label name. The inverse mapping encodes the target.
y_mapping = {
    0:'benign',
    1:'defacement',
    2:'malware',
    3:'phishing'
}
label_to_id = {label: class_id for class_id, label in y_mapping.items()}

# Stringify so get_dummies one-hot encodes the route count instead of
# passing it through as a number.
df['count_routes'] = df['count_routes'].astype(str)

# Feature matrix: one-hot encoding of every column from position 2 onwards.
X = pd.get_dummies(df.iloc[:, 2:])

# .map produces the integer codes directly; .replace with a str->int dict
# relied on implicit object-dtype downcasting, which pandas has deprecated.
y = df['type'].map(label_to_id)
if y.isna().any():
    unknown = sorted(df.loc[y.isna(), 'type'].astype(str).unique())
    raise ValueError(f"Labels missing from y_mapping: {unknown}")
y = y.astype('int64')

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Perform random undersampling to balance the classes.
# NOTE(review): resampling runs before the split, so the test set is drawn
# from the balanced data as well - confirm this is intended.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Split samples: 80% train / 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=1337
)

for split_name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name} shape is', split.shape)

# Class balance of the training target. The top-level pd.value_counts helper
# is deprecated, so the Series method is used instead.
print(pd.Series(y_train).value_counts())

# !pip install -qq tensorflowjs

In [None]:
#import tensorflowjs as tfjs
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score

num_classes = 4

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across runs.
tf.keras.utils.set_random_seed(42)

# Keras needs one homogeneous numeric array. A frame that mixes bool dummy
# columns with integer columns would otherwise convert to an object array.
X_train_nn = np.asarray(X_train, dtype='float32')
X_test_nn = np.asarray(X_test, dtype='float32')

# Define the neural network model: three 64-unit ReLU layers and a softmax head.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')  # one output per target class
])
# `model` is re-bound by later cells; `nn_model` keeps a stable handle to the network.
nn_model = model

# Compile the model (integer targets -> sparse categorical cross-entropy).
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'],)

# Train the model.
# NOTE(review): the test split doubles as validation data here - confirm intended.
history = model.fit(X_train_nn, y_train, batch_size=32, epochs=10, validation_data=(X_test_nn, y_test))

# Make predictions on the testing data: most probable class per row.
y_pred_prob = model.predict(X_test_nn)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from xgboost import XGBClassifier

# Instantiate the XGBoost classifier for multi-class classification.
# Only this instance is built: a 'binary:logistic' classifier assigned to the
# same name just before it was never used.
model = XGBClassifier(objective='multi:softmax')

# Train the model
model.fit(X_train, y_train, verbose=True)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Create a Random Forest classifier (seeded so the fitted forest is reproducible).
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set with the forest fitted above. The name `best_rf`
# used here previously is never defined in this notebook.
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set: ", accuracy)

In [None]:
# Display the predictions currently held in `y_pred`.
y_pred

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

class_ids = list(y_mapping)            # 0..3, fixes the row/column order
class_names = list(y_mapping.values())

# confusion_matrix puts the true labels on the rows and the predictions on the
# columns; heatmap draws rows along the y-axis and columns along the x-axis.
sns.heatmap(confusion_matrix(y_test, y_pred, labels=class_ids), xticklabels=class_names,
            yticklabels=class_names, annot=True, fmt=".0f")

plt.title('Confusion Matrix')
plt.ylabel('Real')
plt.xlabel('Predicted')

In [None]:
# Display the test-split feature matrix.
X_test

In [None]:
# Predicted class for every test record, next to its human-readable label.
# NOTE(review): the previous version referenced names that do not exist in this
# notebook (`classifier`, `text`, `END`, `index`); the random forest is assumed
# to be the intended model - confirm.
y_pred = rf_classifier.predict(X_test)

predictions = pd.DataFrame({
    'record_no': np.arange(len(y_pred)),
    'predicted_id': y_pred,
    'predicted_label': [y_mapping[int(class_id)] for class_id in y_pred],
})

# Show a sample rather than printing one line per test record.
predictions.head(20)

In [None]:
import pickle

# Serialise whatever estimator is currently bound to `model` and write the
# bytes to model.pkl in the working directory.
serialized_model = pickle.dumps(model)
with open('model.pkl', 'wb') as model_file:
    model_file.write(serialized_model)

In [None]:
import tensorflow as tf
import tensorflowjs as tfjs

# `model` is re-bound by several cells (Keras network first, XGBoost later).
# The TF.js converter is called with a Keras model, so fail with a clear
# message when something else is bound to the name.
if not isinstance(model, tf.keras.Model):
    raise TypeError(
        "TF.js export needs the Keras network, but `model` is currently a "
        f"{type(model).__name__}; re-run the neural-network cell before this one."
    )

# Write the converted model into the 'content/' directory.
tfjs.converters.save_keras_model(model, 'content/')

In [None]:
import tensorflow as tf

# Assumes `model` is the trained Keras network. Other cells re-bind the name
# to an XGBoost classifier, so check before calling Keras-only methods.
if not isinstance(model, tf.keras.Model):
    raise TypeError(
        "JSON/weights export needs the Keras network, but `model` is currently a "
        f"{type(model).__name__}; re-run the neural-network cell before this one."
    )

# Save the model architecture as JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

# Save the model weights in HDF5 format
# NOTE(review): Keras 3 appears to require weight filenames ending in
# ".weights.h5" - confirm against the installed version before renaming.
model.save_weights("model_weights.h5")

print("Modelo salvo em JSON com sucesso.")