# تحليل الثغرات الأمنية باستخدام الذكاء الاصطناعي

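يحلّل هذا الدفتر بيانات الروابط (URLs) في الملف malicious_phish.csv، ثم يبني نموذج تصنيف (Random Forest) للكشف عن الروابط الخبيثة وتصنيفها إلى: benign وdefacement وmalware وphishing.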

In [1]:
# استيراد المكتبات اللازمة
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import plotly.express as px
import plotly.graph_objects as go
import re
import pickle

# Set the display style for charts.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# old 'seaborn' alias was later removed, so passing 'seaborn' unconditionally
# raises OSError on current releases. Use whichever name this install provides.
_plot_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_plot_style)
sns.set_palette('Set2')
#%matplotlib inline

In [2]:
# Read the data: load the URL dataset from a CSV file in the working directory
df = pd.read_csv('malicious_phish.csv')
# The printed label is Arabic for "Data shape:"; the value is (rows, columns)
print('شكل البيانات:', df.shape)
# Last expression of the cell, so the notebook renders the first rows
df.head()

شكل البيانات: (651191, 2)


Unnamed: 0,url,id
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
# Initial exploration of the data
# Heading is Arabic for "Information about the data:"
print('\nمعلومات عن البيانات:')
# Prints the index range, column names, non-null counts and dtypes
df.info()

# Heading is Arabic for "Descriptive statistics:"
print('\nإحصائيات وصفية:')
# Last expression of the cell, so the summary table is rendered
df.describe()


معلومات عن البيانات:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   id      651191 non-null  object
dtypes: object(2)
memory usage: 9.9+ MB

إحصائيات وصفية:


Unnamed: 0,url,id
count,651191,651191
unique,641119,4
top,http://style.org.hc360.com/css/detail/mysite/s...,benign
freq,180,428103


In [4]:
# Missing-value analysis: count the null entries in every column
missing_values = df.isnull().sum()
# Heading is Arabic for "Missing values in each column:"
print('\nالقيم المفقودة في كل عمود:')
# Show only the columns that actually contain nulls (empty Series if none do)
print(missing_values[missing_values > 0])


القيم المفقودة في كل عمود:
Series([], dtype: int64)


In [5]:
# Keywords whose presence in a URL is counted by the 'suspicious_words' feature.
_SUSPICIOUS_WORDS = ('login', 'bank', 'account', 'secure', 'update')

# Compiled once: extract_features is called for every row of the dataset.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')
_DIGIT_RE = re.compile(r'\d')


def extract_features(url: str) -> dict:
    """Return simple lexical features computed from a URL string.

    Args:
        url: The URL text to analyse.

    Returns:
        A dict with four integer entries, in this order:
        ``url_length`` (number of characters), ``special_chars`` (characters
        outside ``a-z``, ``A-Z``, ``0-9``), ``digits`` (characters matched by
        ``\\d``) and ``suspicious_words`` (how many of the keywords in
        ``_SUSPICIOUS_WORDS`` occur in the URL, case-insensitively; each
        keyword counts at most once).
    """
    # Lowercase once instead of once per keyword.
    lowered = url.lower()
    return {
        'url_length': len(url),
        'special_chars': len(_NON_ALNUM_RE.findall(url)),
        'digits': len(_DIGIT_RE.findall(url)),
        'suspicious_words': sum(word in lowered for word in _SUSPICIOUS_WORDS),
    }

# Build one feature dict per URL, preserving the dataset's row order
print("Extracting features...")
features_list = [extract_features(address) for address in df['url']]


Extracting features...


In [6]:
# Load data
# NOTE(review): this re-reads the same CSV file that an earlier cell already
# loaded into `df`; the features extracted above were computed from that
# earlier load.
print("Loading data...")
df = pd.read_csv('malicious_phish.csv')

Loading data...


In [7]:
# Convert features to DataFrame: one row per URL, one column per feature key
X = pd.DataFrame(features_list)
# Target labels are taken from the 'id' column.
# NOTE(review): rows of X and y are matched by position, which assumes `df`
# still has the same rows in the same order as when `features_list` was
# built — confirm if the data is ever filtered or re-sorted in between.
y = df['id']

In [8]:
# Hold out 20% of the rows as a test set; random_state=42 makes the shuffle
# repeatable between runs.
# NOTE(review): no `stratify=` argument is passed, so the class proportions in
# the two splits are not forced to match those of the full dataset.
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Splitting data...


In [9]:
# Train model: a random forest of 100 trees with a fixed seed for repeatability
print("Training model...")
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Last expression of the cell, so the notebook also renders the fitted estimator
model.fit(X_train, y_train)


Training model...


RandomForestClassifier(random_state=42)

In [10]:
# Evaluate model on the held-out test split
print("\nModel Evaluation:")
y_pred = model.predict(X_test)
# Per-class precision, recall, F1 and support, plus overall averages
print(classification_report(y_test, y_pred))


Model Evaluation:
              precision    recall  f1-score   support

      benign       0.83      0.93      0.87     85778
  defacement       0.69      0.73      0.71     19104
     malware       0.88      0.78      0.83      6521
    phishing       0.72      0.31      0.43     18836

    accuracy                           0.80    130239
   macro avg       0.78      0.69      0.71    130239
weighted avg       0.79      0.80      0.78    130239



In [11]:
# Save model: serialize the trained classifier to the file 'security_model'
# in the working directory (opened in binary mode because pickle writes bytes).
# NOTE: unpickling can execute arbitrary code, so this file should only ever
# be loaded from a trusted location.
print("\nSaving model...")
with open('security_model', 'wb') as f:
    pickle.dump(model, f)

print("Model saved as 'security_model'")


Saving model...
Model saved as 'security_model'
