READ
Dataset source (UCI Machine Learning Repository): https://archive.ics.uci.edu/dataset/327/phishing+websites



In [3]:
import arff
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# ========== Paths ==========
# The ARFF file is looked up in the current working directory.
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, "phishing.arff")

# ========== Read the data from the .arff file ==========
with open(DATA_PATH, mode="r") as f:
    data = arff.load(f)

# The first element of every attribute entry is used as the column name.
columns = [attribute[0] for attribute in data["attributes"]]

# Build the frame from the parsed rows, then cast every column to int.
df = pd.DataFrame(data["data"], columns=columns)
df = df.astype(int)

# ========== Overview ==========
n_samples, n_columns = df.shape
print("üßæ T·ªïng quan v·ªÅ Dataset:")
print(f"- S·ªë d√≤ng (samples): {n_samples}")
# The last column is excluded from the feature count and from the listing below.
print(f"- S·ªë thu·ªôc t√≠nh (features): {n_columns - 1}")
print("\nüìå Danh s√°ch c√°c thu·ªôc t√≠nh:")
for position, feature_name in enumerate(df.columns[:-1], start=1):
    print(f"{position:2d}. {feature_name}")

# ========== Feature analysis ==========
# "Result" is the target; every other column is an input feature.
y = df["Result"]
X = df.drop(columns="Result")

# Fit a seeded random forest on the whole dataset and read its feature importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_

# Pair each column with its importance and rank from most to least important.
importance_df = pd.DataFrame({"Feature": X.columns, "Importance": importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False)

print("\nüî• 10 ƒë·∫∑c tr∆∞ng quan tr·ªçng nh·∫•t (theo RandomForest):")
print(importance_df.head(10).to_string(index=False))

# ========== Top features by Mutual Information ==========
def _discrete_mutual_info(features, target):
    """Score features against the target with mutual information, treating them as discrete.

    The frame was cast to int when it was loaded, so the features are integer
    codes. With its defaults, mutual_info_classif treats the columns of a dense
    matrix as continuous and adds random noise to them, which makes the scores
    change from run to run. Declaring the features discrete and fixing the seed
    gives a repeatable ranking.
    """
    return mutual_info_classif(features, target, discrete_features=True, random_state=42)


# Only selector.scores_ is read below; k matters only if the selector is later
# used to transform X.
selector = SelectKBest(score_func=_discrete_mutual_info, k=10)
selector.fit(X, y)

# Pair each column with its MI score and rank from highest to lowest.
mi_scores = pd.DataFrame({
    "Feature": X.columns,
    "MI Score": selector.scores_
}).sort_values(by="MI Score", ascending=False)

print("\nüí° 10 ƒë·∫∑c tr∆∞ng quan tr·ªçng nh·∫•t (theo Mutual Information):")
print(mi_scores.head(10).to_string(index=False))


üßæ T·ªïng quan v·ªÅ Dataset:
- S·ªë d√≤ng (samples): 11055
- S·ªë thu·ªôc t√≠nh (features): 30

üìå Danh s√°ch c√°c thu·ªôc t√≠nh:
 1. having_IP_Address
 2. URL_Length
 3. Shortining_Service
 4. having_At_Symbol
 5. double_slash_redirecting
 6. Prefix_Suffix
 7. having_Sub_Domain
 8. SSLfinal_State
 9. Domain_registeration_length
10. Favicon
11. port
12. HTTPS_token
13. Request_URL
14. URL_of_Anchor
15. Links_in_tags
16. SFH
17. Submitting_to_email
18. Abnormal_URL
19. Redirect
20. on_mouseover
21. RightClick
22. popUpWidnow
23. Iframe
24. age_of_domain
25. DNSRecord
26. web_traffic
27. Page_Rank
28. Google_Index
29. Links_pointing_to_page
30. Statistical_report

üî• 10 ƒë·∫∑c tr∆∞ng quan tr·ªçng nh·∫•t (theo RandomForest):
                    Feature  Importance
             SSLfinal_State    0.318529
              URL_of_Anchor    0.262463
                web_traffic    0.070082
          having_Sub_Domain    0.060848
              Links_in_tags    0.041492
              Prefix_S

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [5]:
# Class balance: number of rows for each value of "Result".
label_counts = df['Result'].value_counts()
print(label_counts)

# Preview the first rows of each class: label 1 first, then label -1.
for label in (1, -1):
    print(f"üîé C√°c trang Result = {label}")
    print(df.loc[df['Result'] == label].head())

Result
 1    6157
-1    4898
Name: count, dtype: int64
üîé C√°c trang Result = 1
    having_IP_Address  URL_Length  Shortining_Service  having_At_Symbol  \
4                   1           0                  -1                 1   
5                  -1           0                  -1                 1   
8                   1           0                  -1                 1   
10                  1           1                   1                 1   
14                  1           1                  -1                 1   

    double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  \
4                          1             -1                  1   
5                         -1             -1                  1   
8                          1             -1                  1   
10                         1             -1                  0   
14                         1              1                 -1   

    SSLfinal_State  Domain_registeration_length  Favicon  ...  popUpWi

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the frame into inputs (every column except "Result") and the target.
# This cell relies on `df` having been created by an earlier cell.
X = df.drop('Result', axis=1)
y = df['Result']

# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest too, so that re-running the notebook reproduces the same
# model and predictions (the split alone being seeded is not enough).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Print up to 10 actual/predicted label pairs; slicing plus zip avoids an
# IndexError when the test split holds fewer than 10 rows.
for actual, predicted in zip(y_test.iloc[:10], y_pred[:10]):
    print(f"Th·∫≠t: {actual}, D·ª± ƒëo√°n: {predicted}")


Th·∫≠t: -1, D·ª± ƒëo√°n: -1
Th·∫≠t: -1, D·ª± ƒëo√°n: -1
Th·∫≠t: -1, D·ª± ƒëo√°n: -1
Th·∫≠t: 1, D·ª± ƒëo√°n: 1
Th·∫≠t: 1, D·ª± ƒëo√°n: 1
Th·∫≠t: 1, D·ª± ƒëo√°n: 1
Th·∫≠t: 1, D·ª± ƒëo√°n: 1
Th·∫≠t: -1, D·ª± ƒëo√°n: -1
Th·∫≠t: -1, D·ª± ƒëo√°n: -1
Th·∫≠t: -1, D·ª± ƒëo√°n: 1


In [9]:
# Re-check the class balance of the "Result" column.
result_counts = df["Result"].value_counts()
print(result_counts)



Result
 1    6157
-1    4898
Name: count, dtype: int64
