In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", 200)
plt.rcParams["figure.figsize"] = (8, 4)

# =========================
# 1. Detectar BASE_DIR
# =========================
cwd = Path.cwd()

def is_project_root(path: Path) -> bool:
    markers = ["requirements.txt", "app", "notebooks"]
    return any((path / m).exists() for m in markers)

if is_project_root(cwd):
    BASE_DIR = cwd
elif cwd.name in {"notebooks", "src"} and is_project_root(cwd.parent):
    BASE_DIR = cwd.parent
else:
    BASE_DIR = cwd

NOTEBOOKS_DIR = BASE_DIR / "notebooks"
NB_DATA_DIR = NOTEBOOKS_DIR / "data"
OUTPUT_DIR = NB_DATA_DIR / "data_processed"

PATH_KAGGLE_CLEAN = OUTPUT_DIR / "xss_kaggle_clean.csv"
PATH_GITHUB_CLEAN = OUTPUT_DIR / "xss_github_clean.csv"

print("CWD             :", cwd)
print("BASE_DIR        :", BASE_DIR)
print("OUTPUT_DIR      :", OUTPUT_DIR)
print("KAGGLE_CLEAN    :", PATH_KAGGLE_CLEAN)
print("GITHUB_CLEAN    :", PATH_GITHUB_CLEAN)

# =========================
# 2. Cargar datasets limpios
# =========================
df_kaggle = pd.read_csv(PATH_KAGGLE_CLEAN)
df_github = pd.read_csv(PATH_GITHUB_CLEAN)

print("\n=== Shapes ===")
print("Kaggle clean :", df_kaggle.shape)
print("GitHub clean :", df_github.shape)

print("\n=== Columnas Kaggle ===")
print(df_kaggle.columns.tolist())

print("\n=== Columnas GitHub ===")
print(df_github.columns.tolist())

# Distribución de Label si existe
if "Label" in df_kaggle.columns:
    print("\n=== Label en Kaggle ===")
    print(df_kaggle["Label"].value_counts(dropna=False))

if "Label" in df_github.columns:
    print("\n=== Label en GitHub ===")
    print(df_github["Label"].value_counts(dropna=False))

# =========================
# 3. Ver primeras filas clave
# =========================

def safe_preview(df, name):
    print(f"\n=== Preview {name} ===")
    cols_prefer = [c for c in ["Sentence", "Sentence_clean", "Payloads", "Sentence_decoded",
                               "Label", "families", "families_str", "len_after_clean", "source", "Class"]
                   if c in df.columns]
    display(df[cols_prefer].head())

safe_preview(df_kaggle, "Kaggle clean")
safe_preview(df_github, "GitHub clean")

print("\n=== Tipos de datos Kaggle ===")
print(df_kaggle.dtypes)

print("\n=== Tipos de datos GitHub ===")
print(df_github.dtypes)


CWD             : d:\Archivos de Usuario\Documents\xss-cookie\notebooks
BASE_DIR        : d:\Archivos de Usuario\Documents\xss-cookie
OUTPUT_DIR      : d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed
KAGGLE_CLEAN    : d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_kaggle_clean.csv
GITHUB_CLEAN    : d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_github_clean.csv

=== Shapes ===
Kaggle clean : (13438, 7)
GitHub clean : (28297, 14)

=== Columnas Kaggle ===
['Sentence', 'Label', 'families', 'len_chars', 'Sentence_clean', 'len_after_clean', 'families_str']

=== Columnas GitHub ===
['Payloads', 'Class', 'len_chars', 'IS_URL', 'HAS_XSS', 'IS_CODE', 'Sentence_decoded', 'len_decoded', 'payload_extracted', 'families', 'Label', 'len_after_clean', 'Sentence_clean', 'source']

=== Label en Kaggle ===
1    7373
0    6065
Name: Label, dtype: int64

=== Label en GitHub ===
0    14743
1    13554
Name: Label, dtype: 

Unnamed: 0,Sentence,Sentence_clean,Label,families,families_str,len_after_clean
0,"<li><a href=""/wiki/File:Socrates.png"" class=""image""><img alt=""Socrates.png"" src=""//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png"" decoding=""async"" width=""18"" hei...","<li><a href=""/wiki/File:Socrates.png"" class=""image""><img alt=""Socrates.png"" src=""//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png"" decoding=""async"" width=""18"" hei...",0,['image_tag'],image_tag,557
1,"<tt onmouseover=""alert(1)"">test</tt>","<tt onmouseover=""alert(1)"">test</tt>",1,['event_handler'],event_handler,36
2,"\t </span> <span class=""reference-text"">Steering for the 1995 ""<a href=""/wiki/History_of_autonomous_cars#1990s"" class=""mw-redirect"" title=""History of autonomous cars"">No Hands Across America </a>""...","</span> <span class=""reference-text"">Steering for the 1995 ""<a href=""/wiki/History_of_autonomous_cars#1990s"" class=""mw-redirect"" title=""History of autonomous cars"">No Hands Across America </a>"" re...",0,['other'],other,230
3,"\t </span> <span class=""reference-text""><cite class=""citation web""><a rel=""nofollow"" class=""external text"" href=""https://www.mileseducation.com/finance/artificial_intelligence"">""Miles Education | ...","</span> <span class=""reference-text""><cite class=""citation web""><a rel=""nofollow"" class=""external text"" href=""https://www.mileseducation.com/finance/artificial_intelligence"">""Miles Education | Fut...",0,['maybe_polyglot'],maybe_polyglot,392
4,"\t </span>. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text"" href=""https://doi.org/10.1016%2FS0921-8890%2805%2980025-9"">...","</span>. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text"" href=""https://doi.org/10.1016%2FS0921-8890%2805%2980025-9"">10....",0,['other'],other,419



=== Preview GitHub clean ===


Unnamed: 0,Sentence_clean,Payloads,Sentence_decoded,Label,families,len_after_clean,source,Class
0,<script>alert(document.cookie);</script>,http://www.nwce.gov.uk/search_process.php?keyword=%22%3e%3cscript%3ealert%28document.cookie%29%3b%3c<br>%2fscript%3e,"http://www.nwce.gov.uk/search_process.php?keyword=""><script>alert(document.cookie);</script>",1,['script_tag'],40,github,Malicious
1,<script>alert(document.cookie);</script>&btng=search&ie=&site=&output=xml&client=&lr=&oe=&filter=0,http://www.manchester.gov.uk/site/scripts/google_results.php?q=%22%3e%253cscript%3ealert%28document.<br>cookie%29%3b%253c%2fscript%3e&amp;btng=search&amp;ie=&amp;site=&amp;output=xml&amp;client=&a...,"http://www.manchester.gov.uk/site/scripts/google_results.php?q=""><script>alert(document.cookie);</script>&btng=search&ie=&site=&output=xml&client=&lr=&oe=&filter=0",1,['script_tag'],98,github,Malicious
2,<marquee>pappy</marquee>&missionary_id=69,http://www.ldsmissions.com/us/index.php?action=missionary.info%3cmarquee%3epappy%3c/marquee%3e&amp;missi<br>onary_id=69,http://www.ldsmissions.com/us/index.php?action=missionary.info<marquee>pappy</marquee>&missionary_id=69,1,['marquee_tag'],41,github,Malicious
3,<script>alert(document.cookie);</script>&subdwell=&dwelling=&streetnm=&locality=&hometown=&postcode=&datebrth=&learngen=&ethnicor=&tel_numb=&tel_mob=&email_add=&email_add2=&agree_info=&username=&p...,http://education.powys.gov.uk/english/adult_ed/register.php?lforenam=\\%22%3e%3cscript%3ealert(docume<br>nt.cookie);%3c/script%3e&amp;subdwell=&amp;dwelling=&amp;streetnm=&amp;locality=&amp;hometo...,"http://education.powys.gov.uk/english/adult_ed/register.php?lforenam=\\""><script>alert(document.cookie);</script>&subdwell=&dwelling=&streetnm=&locality=&hometown=&postcode=&datebrth=&learngen=&et...",1,['script_tag'],212,github,Malicious
4,<script>alert(document.cookie);</script>&btng=search&ie=&site=&output=xml&client=&lr=&oe=&filter=0,http://www.northwarks.gov.uk/site/scripts/google_results.php?q=%22%3e%253cscript%3ealert%28document.<br>cookie%29%3b%253c%2fscript%3e&amp;btng=search&amp;ie=&amp;site=&amp;output=xml&amp;client=&a...,"http://www.northwarks.gov.uk/site/scripts/google_results.php?q=""><script>alert(document.cookie);</script>&btng=search&ie=&site=&output=xml&client=&lr=&oe=&filter=0",1,['script_tag'],98,github,Malicious



=== Tipos de datos Kaggle ===
Sentence           object
Label               int64
families           object
len_chars           int64
Sentence_clean     object
len_after_clean     int64
families_str       object
dtype: object

=== Tipos de datos GitHub ===
Payloads             object
Class                object
len_chars             int64
IS_URL                 bool
HAS_XSS                bool
IS_CODE                bool
Sentence_decoded     object
len_decoded           int64
payload_extracted    object
families             object
Label                 int64
len_after_clean       int64
Sentence_clean       object
source               object
dtype: object


In [2]:
import ast
from collections import Counter

# =====================================================
# 1. Asegurar 'source' en Kaggle (GitHub ya la trae)
# =====================================================

if "source" not in df_kaggle.columns:
    df_kaggle["source"] = "kaggle"
else:
    df_kaggle["source"] = df_kaggle["source"].fillna("kaggle")

print("Valores únicos de 'source' en Kaggle:", df_kaggle["source"].unique())
print("Valores únicos de 'source' en GitHub:", df_github["source"].unique())


# =====================================================
# 2. Crear / normalizar families_str en GitHub
# =====================================================

def families_to_str(value):
    """
    Normaliza la columna 'families' que viene del CSV.
    - Si era lista → la convierte a 'a|b|c'
    - Si era string tipo "['script_tag']" → limpia y deja 'script_tag'
    - Si está vacía / NaN → 'other'
    """
    if isinstance(value, list):
        if not value:
            return "other"
        return "|".join(str(v) for v in value)

    s = str(value).strip()
    if s == "" or s.lower() == "nan":
        return "other"

    # Intentar interpretar como literal de lista: "['script_tag', 'img_tag']"
    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = ast.literal_eval(s)
            if isinstance(parsed, list):
                if not parsed:
                    return "other"
                return "|".join(str(v) for v in parsed)
        except Exception:
            pass

        # Fallback: limpieza manual
        s_inner = s[1:-1]
        s_inner = s_inner.replace("'", "").replace('"', "").replace(" ", "")
        return s_inner if s_inner else "other"

    # Otro caso: ya es algo tipo 'script_tag' o 'a|b'
    return s

# Kaggle ya trae families_str
print("\n=== Preview families_str Kaggle (ya existente) ===")
display(df_kaggle[["families", "families_str"]].head(5))

# Crear / sobrescribir families_str en GitHub a partir de 'families'
df_github["families_str"] = df_github["families"].apply(families_to_str)

print("\n=== Preview families_str GitHub (recién construido) ===")
display(df_github[["families", "families_str"]].head(5))


# =====================================================
# 3. Distribución de familias por dataset
# =====================================================

print("\n=== Kaggle: Top 20 families_str ===")
print(df_kaggle["families_str"].value_counts().head(20))

print("\n=== GitHub: Top 20 families_str ===")
print(df_github["families_str"].value_counts().head(20))


# =====================================================
# 4. Familias vs Label (0/1)
# =====================================================

print("\n=== Kaggle: familias_str vs Label ===")
kaggle_fam_label = df_kaggle.groupby(["families_str", "Label"]).size().unstack(fill_value=0)
display(kaggle_fam_label.sort_values(by=1, ascending=False).head(20))

print("\n=== GitHub: familias_str vs Label ===")
github_fam_label = df_github.groupby(["families_str", "Label"]).size().unstack(fill_value=0)
display(github_fam_label.sort_values(by=1, ascending=False).head(20))


# =====================================================
# 5. ¿Sentence_clean tiene '<'? (HTML-ish vs URL / texto plano)
# =====================================================

df_github["has_angle"] = df_github["Sentence_clean"].astype(str).str.contains("<")

print("\n=== GitHub: conteo has_angle (Sentence_clean contiene '<') ===")
print(df_github["has_angle"].value_counts())

print("\n=== GitHub: has_angle vs Label ===")
display(df_github.groupby(["has_angle", "Label"]).size().unstack(fill_value=0))


# =====================================================
# 6. Ejemplos concretos de GitHub
# =====================================================

def show_examples(mask, title, n=5):
    print(f"\n=== Ejemplos: {title} (n={n}) ===")
    subset = df_github[mask].copy()
    if subset.empty:
        print("  (sin filas)")
        return
    cols = ["Sentence_clean", "Label", "families_str", "len_after_clean", "Class"]
    display(subset[cols].head(n))

# a) Candidatos fuertes a ruido: other + Label=0
mask_other_benign = (df_github["families_str"] == "other") & (df_github["Label"] == 0)
show_examples(mask_other_benign, "GitHub: families_str='other' & Label=0 (candidatos a URLs basura)")

# b) Ataques claros con script_tag
mask_script_mal = df_github["families_str"].str.contains("script_tag") & (df_github["Label"] == 1)
show_examples(mask_script_mal, "GitHub: families_str contiene 'script_tag' & Label=1")

# c) Casos con marquee_tag (defacement / variantes)
mask_marquee = df_github["families_str"].str.contains("marquee_tag")
show_examples(mask_marquee, "GitHub: families_str contiene 'marquee_tag'", n=5)

# d) Benignos sin '<' → típicamente URLs/texto plano
mask_no_angle_benign = (df_github["has_angle"] == False) & (df_github["Label"] == 0)
show_examples(mask_no_angle_benign, "GitHub: Label=0 & sin '<' (URLs / texto plano tipo joomla)", n=5)


Valores únicos de 'source' en Kaggle: ['kaggle']
Valores únicos de 'source' en GitHub: ['github']

=== Preview families_str Kaggle (ya existente) ===


Unnamed: 0,families,families_str
0,['image_tag'],image_tag
1,['event_handler'],event_handler
2,['other'],other
3,['maybe_polyglot'],maybe_polyglot
4,['other'],other



=== Preview families_str GitHub (recién construido) ===


Unnamed: 0,families,families_str
0,['script_tag'],script_tag
1,['script_tag'],script_tag
2,['marquee_tag'],marquee_tag
3,['script_tag'],script_tag
4,['script_tag'],script_tag



=== Kaggle: Top 20 families_str ===
event_handler                              6098
other                                      5354
maybe_polyglot                              729
event_handler|maybe_polyglot                699
event_handler|svg_tag                       191
event_handler|image_tag                      76
image_tag                                    63
script_tag                                   60
script_tag|event_handler                     54
event_handler|iframe_tag                     47
javascript_uri                               31
javascript_uri|image_tag                     10
event_handler|iframe_tag|maybe_polyglot       5
event_handler|svg_tag|maybe_polyglot          5
event_handler|javascript_uri                  3
iframe_tag                                    2
script_tag|event_handler|image_tag            1
script_tag|image_tag                          1
javascript_uri|maybe_polyglot                 1
event_handler|iframe_tag|svg_tag              1
Nam

Label,0,1
families_str,Unnamed: 1_level_1,Unnamed: 2_level_1
event_handler,7,6091
event_handler|maybe_polyglot,2,697
event_handler|svg_tag,0,191
other,5249,105
event_handler|image_tag,0,76
script_tag|event_handler,0,54
event_handler|iframe_tag,0,47
javascript_uri,0,31
script_tag,32,28
image_tag,51,12



=== GitHub: familias_str vs Label ===


Label,0,1
families_str,Unnamed: 1_level_1,Unnamed: 2_level_1
script_tag,0,9397
script_tag|marquee_tag|header_tag,0,1731
iframe_tag,0,432
script_tag|header_tag,0,266
img_tag|event_onerror,0,178
body_tag|event_onload,0,176
marquee_tag|header_tag,0,170
event_onmouseover,0,129
script_tag|text_container_tag,0,90
img_tag,0,73



=== GitHub: conteo has_angle (Sentence_clean contiene '<') ===
False    14735
True     13562
Name: has_angle, dtype: int64

=== GitHub: has_angle vs Label ===


Label,0,1
has_angle,Unnamed: 1_level_1,Unnamed: 2_level_1
False,14585,150
True,158,13404



=== Ejemplos: GitHub: families_str='other' & Label=0 (candidatos a URLs basura) (n=5) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,Class
38,"http://www.knoxcounty.org/search/sresults.php?search="");alert(document.cookie);//",0,other,81,Malicious
50,"<frame src=""javascript:alert('pappy was here');""></frameset>&subseq=y",0,other,69,Malicious
84,"http://bigcharts.marketwatch.com/symbollookup/symbollookupresults.asp?symb=""');alert(document.cookie);//&country=all&type=all",0,other,125,Malicious
98,"<>folder_id=2534374302031664&bmuid=1213393925797&adid=dsetdir&originalhostname="";alert(document.cookie);//&bmuid=1213393931143",0,other,126,Malicious
100,"<frameset><frame src=""javascript:alert('xss');""></frameset>",0,other,59,Malicious



=== Ejemplos: GitHub: families_str contiene 'script_tag' & Label=1 (n=5) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,Class
0,<script>alert(document.cookie);</script>,1,script_tag,40,Malicious
1,<script>alert(document.cookie);</script>&btng=search&ie=&site=&output=xml&client=&lr=&oe=&filter=0,1,script_tag,98,Malicious
3,<script>alert(document.cookie);</script>&subdwell=&dwelling=&streetnm=&locality=&hometown=&postcode=&datebrth=&learngen=&ethnicor=&tel_numb=&tel_mob=&email_add=&email_add2=&agree_info=&username=&p...,1,script_tag,212,Malicious
4,<script>alert(document.cookie);</script>&btng=search&ie=&site=&output=xml&client=&lr=&oe=&filter=0,1,script_tag,98,Malicious
6,<script>alert(document.cookie);</script>,1,script_tag,40,Malicious



=== Ejemplos: GitHub: families_str contiene 'marquee_tag' (n=5) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,Class
2,<marquee>pappy</marquee>&missionary_id=69,1,marquee_tag,41,Malicious
10,<marquee>pappy washere</marquee>,1,marquee_tag,32,Malicious
32,<marquee>pappy was here</marquee><!--<script>alert(document.cookie);</script>,1,script_tag|marquee_tag,77,Malicious
167,<marquee>testicles</marquee>,1,marquee_tag,28,Malicious
171,<marquee>testicles</marquee>,1,marquee_tag,28,Malicious



=== Ejemplos: GitHub: Label=0 & sin '<' (URLs / texto plano tipo joomla) (n=5) ===


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,Class
38,"http://www.knoxcounty.org/search/sresults.php?search="");alert(document.cookie);//",0,other,81,Malicious
84,"http://bigcharts.marketwatch.com/symbollookup/symbollookupresults.asp?symb=""');alert(document.cookie);//&country=all&type=all",0,other,125,Malicious
110,http://www.thegreene.com/tenant.cfm?tenantid=50&tenantspace=e9&tenantx=-1180&tenanty=-');alert('xss');//,0,other,104,Malicious
153,http://search.lifespan.org/search?p=q&ts=custom&s_2=1&w=');\nalert(document.cookie);//&x=0&y=0,0,other,93,Malicious
157,http://qantas.resultspage.com.au/search?p=q&ts=custom&w=');\nalert(document.cookie);//&x=0&y=0,0,other,93,Malicious


In [3]:
# =====================================
# 1. Asegurar columna has_angle
# =====================================

df_github["has_angle"] = df_github["Sentence_clean"].astype(str).str.contains("<")

print("=== has_angle counts ===")
print(df_github["has_angle"].value_counts())

# =====================================
# 2. Revisar distribución Label vs has_angle
# (confirmación adicional antes de filtrar)
# =====================================

print("\n=== Label vs has_angle ===")
display(df_github.groupby(["has_angle", "Label"]).size().unstack(fill_value=0))

# =====================================
# 3. Ver cuántos benignos con '<' (se conservarán)
# =====================================

mask_benign_html = (df_github["Label"] == 0) & (df_github["has_angle"] == True)

print("\n=== Benignos HTML (candidatos a conservar) ===")
print(mask_benign_html.sum())

display(df_github[mask_benign_html][["Sentence_clean","families_str","len_after_clean","Class"]].head(10))

# =====================================
# 4. Ver cuántos Label=0 sin '<' (candidatos a eliminar)
# =====================================

mask_benign_nohtml = (df_github["Label"] == 0) & (df_github["has_angle"] == False)

print("\n=== Benignos sin HTML (candidatos a eliminar) ===")
print(mask_benign_nohtml.sum())

display(df_github[mask_benign_nohtml][["Sentence_clean","families_str","len_after_clean","Class"]].head(10))


=== has_angle counts ===
False    14735
True     13562
Name: has_angle, dtype: int64

=== Label vs has_angle ===


Label,0,1
has_angle,Unnamed: 1_level_1,Unnamed: 2_level_1
False,14585,150
True,158,13404



=== Benignos HTML (candidatos a conservar) ===
158


Unnamed: 0,Sentence_clean,families_str,len_after_clean,Class
50,"<frame src=""javascript:alert('pappy was here');""></frameset>&subseq=y",other,69,Malicious
98,"<>folder_id=2534374302031664&bmuid=1213393925797&adid=dsetdir&originalhostname="";alert(document.cookie);//&bmuid=1213393931143",other,126,Malicious
100,"<frameset><frame src=""javascript:alert('xss');""></frameset>",other,59,Malicious
407,<br>www.raptr.us.tc</h1>,other,24,Malicious
494,<frame name=droite src=http://vuln.xssed.net/thirdparty/scripts/ckers.org.html>,other,79,Malicious
926,<a+href=javascript:alert('fuzz')>backslash+security+</a>&go.x=21&go.y=10&go=go,other,78,Malicious
1100,"<metahttp-equiv=""refresh"" content=""0;url=http://www.google.com/""> """"",other,68,Malicious
1122,"<metahttp-equiv=""refresh"" content=""0;url=http://www.cyber-warrior.org/israil/"">",other,79,Malicious
1854,</xss/*-*/style=xss:e/**/xpression(alert('=xssbydt='))>&targetrow=&noticeid=,other,76,Malicious
1855,</xss/*-*/style=xss:e/**/xpression(alert('=xssbydt='))>,other,55,Malicious



=== Benignos sin HTML (candidatos a eliminar) ===
14585


Unnamed: 0,Sentence_clean,families_str,len_after_clean,Class
38,"http://www.knoxcounty.org/search/sresults.php?search="");alert(document.cookie);//",other,81,Malicious
84,"http://bigcharts.marketwatch.com/symbollookup/symbollookupresults.asp?symb=""');alert(document.cookie);//&country=all&type=all",other,125,Malicious
110,http://www.thegreene.com/tenant.cfm?tenantid=50&tenantspace=e9&tenantx=-1180&tenanty=-');alert('xss');//,other,104,Malicious
153,http://search.lifespan.org/search?p=q&ts=custom&s_2=1&w=');\nalert(document.cookie);//&x=0&y=0,other,93,Malicious
157,http://qantas.resultspage.com.au/search?p=q&ts=custom&w=');\nalert(document.cookie);//&x=0&y=0,other,93,Malicious
159,http://ccc.resultspage.com/search?s_2=1&s_3=1&ts=custom&p=q&w=');\nalert(document.cookie);//&x=0&y=0,other,99,Malicious
199,"http://www.eetimes.com/techsearch/not_found.jhtml;jsessionid=fpyiswcussjkqqsndlpckh0cjunn2jvn?nftype=empty&querytext="";+alert(document.cookie);//&site_id=ee+times&_requestid=212632",other,180,Malicious
204,http://api.msappspace.com/proxy/relay.proxy?opensocial_authtype=signed&opensocial_token=at4ijkeddpnlkau5f/gsmgfqlodup0edptea45xouklxsvujcvgcams+blrbo8m3617chfi29vffwitjqsbfr3rlryrivuzjlk2v2b+fcua=...,other,228,Malicious
219,"http://fido.ca/web/content/planpromo/promo_q308_morebark&lang=""; alert(1); //",other,77,Malicious
235,"http://nrsweb.resultspage.com/display.php?p=q&ts=custom&w="");alert(document.cookie);//",other,86,Malicious


In [4]:
# =====================================
# 1. Definir filtro "usable por el sistema"
#    - nos quedamos con:
#      * todos los Label=1 (ataques)
#      * Label=0 que SÍ tienen '<' (HTML benigno)
# =====================================

mask_keep = (df_github["Label"] == 1) | (
    (df_github["Label"] == 0) & (df_github["has_angle"] == True)
)

df_github_filtered = df_github[mask_keep].copy()

print("=== SHAPES GitHub ===")
print("Original      :", df_github.shape)
print("Filtrado      :", df_github_filtered.shape)
print("Filas removidas (ruido no HTML):", df_github.shape[0] - df_github_filtered.shape[0])

# =====================================
# 2. Distribución de Label tras filtrado
# =====================================

print("\n=== Label en GitHub filtrado ===")
print(df_github_filtered["Label"].value_counts())

print("\n=== Top 15 families_str en GitHub filtrado ===")
print(df_github_filtered["families_str"].value_counts().head(15))

# =====================================
# 3. Ejemplos para verificar que lo que queda
#    sí tiene sentido para tu sistema
# =====================================

print("\n=== Ejemplos de ataques (Label=1) en GitHub filtrado ===")
display(
    df_github_filtered[df_github_filtered["Label"] == 1]
    .sample(5, random_state=1)[["Sentence_clean", "families_str", "len_after_clean", "Class"]]
)

print("\n=== Ejemplos de benignos HTML (Label=0) en GitHub filtrado ===")
benign_html = df_github_filtered[df_github_filtered["Label"] == 0]
print("Total benignos HTML:", benign_html.shape[0])
if benign_html.shape[0] > 0:
    display(
        benign_html
        .sample(min(5, benign_html.shape[0]), random_state=2)[
            ["Sentence_clean", "families_str", "len_after_clean", "Class"]
        ]
    )


=== SHAPES GitHub ===
Original      : (28297, 16)
Filtrado      : (13712, 16)
Filas removidas (ruido no HTML): 14585

=== Label en GitHub filtrado ===
1    13554
0      158
Name: Label, dtype: int64

=== Top 15 families_str en GitHub filtrado ===
script_tag                                      9397
script_tag|marquee_tag|header_tag               1731
iframe_tag                                       432
script_tag|header_tag                            266
img_tag|event_onerror                            178
body_tag|event_onload                            176
marquee_tag|header_tag                           170
other                                            158
event_onmouseover                                129
script_tag|text_container_tag                     90
img_tag                                           73
script_tag|iframe_tag                             70
marquee_tag|body_tag|header_tag|event_onload      64
header_tag                                        52
text_contai

Unnamed: 0,Sentence_clean,families_str,len_after_clean,Class
12703,"<script>alert(""maxwel"")</script>",script_tag,32,Malicious
10007,<script>alert(1)</script>&go=suchen,script_tag,35,Malicious
7266,<script \n\r>alert(document.cookie);</script>,script_tag,43,Malicious
3660,<script>alert(document.cookie)</script>><marquee><h1>backdoor</h1></marquee>,script_tag|marquee_tag|header_tag,76,Malicious
35,<script>alert(document.cookie);</script>,script_tag,40,Malicious



=== Ejemplos de benignos HTML (Label=0) en GitHub filtrado ===
Total benignos HTML: 158


Unnamed: 0,Sentence_clean,families_str,len_after_clean,Class
2265,<link href=http://starext.by.ru/css.css type=text/css rel=stylesheet>,other,69,Malicious
407,<br>www.raptr.us.tc</h1>,other,24,Malicious
28097,< sal; i++) { tmp = strarr[i].split(\\'=\\'); key = fixstr(tmp[0]); value = (tmp.length < 2) ? \\'\\' : fixstr(tmp[1]); while (key.charat(0) === \\' \\') { key = key.slice(1); } if (key.indexof(\\...,other,1452,Benign
1100,"<metahttp-equiv=""refresh"" content=""0;url=http://www.google.com/""> """"",other,68,Malicious
27961,"<meta http-equiv=""refresh"" content=""0; url=http://;url=javascript:alert('xss');"">",other,81,Malicious


In [6]:
# =====================================
# 1. Seleccionar columnas core (SIN 'families')
# =====================================

core_cols = [
    "Sentence_clean",
    "Label",
    "families_str",   # representación normalizada de la familia
    "len_after_clean",
    "source",
]

df_kaggle_core = df_kaggle[core_cols].copy()
df_github_core = df_github_filtered[core_cols].copy()

print("Shapes:")
print(" Kaggle core :", df_kaggle_core.shape)
print(" GitHub core :", df_github_core.shape)

# =====================================
# 2. Unir datasets limpios
# =====================================

df_combined = pd.concat(
    [df_kaggle_core, df_github_core],
    ignore_index=True
)

print("\nShape combinado final:", df_combined.shape)
print("Distribución final Label:")
print(df_combined["Label"].value_counts())

print("\nDistribución por source:")
print(df_combined["source"].value_counts())

# =====================================
# 3. Revisar duplicados exactos (por si hay 1:1 repetidos)
#    Usamos Sentence_clean + Label como llave de unicidad
# =====================================

dup_count = df_combined.duplicated(subset=["Sentence_clean", "Label"]).sum()
print("\nDuplicados exactos:", dup_count)

df_combined_nodup = df_combined.drop_duplicates(subset=["Sentence_clean", "Label"])

print("Shape sin duplicados:", df_combined_nodup.shape)

# =====================================
# 4. Guardar el dataset combinado final (solo con columnas útiles)
# =====================================

final_path = OUTPUT_DIR / "xss_combined_clean_final.csv"

df_combined_nodup.to_csv(
    final_path,
    index=False,
    quoting=1,   # csv.QUOTE_ALL
    escapechar="\\",
    encoding="utf-8"
)

print("\n=== Dataset final guardado en ===")
print(final_path)

print("\nPreview final:")
display(df_combined_nodup.head(10))


Shapes:
 Kaggle core : (13438, 5)
 GitHub core : (13712, 5)

Shape combinado final: (27150, 5)
Distribución final Label:
1    20927
0     6223
Name: Label, dtype: int64

Distribución por source:
github    13712
kaggle    13438
Name: source, dtype: int64

Duplicados exactos: 11799
Shape sin duplicados: (15351, 5)

=== Dataset final guardado en ===
d:\Archivos de Usuario\Documents\xss-cookie\notebooks\data\data_processed\xss_combined_clean_final.csv

Preview final:


Unnamed: 0,Sentence_clean,Label,families_str,len_after_clean,source
0,"<li><a href=""/wiki/File:Socrates.png"" class=""image""><img alt=""Socrates.png"" src=""//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png"" decoding=""async"" width=""18"" hei...",0,image_tag,557,kaggle
1,"<tt onmouseover=""alert(1)"">test</tt>",1,event_handler,36,kaggle
2,"</span> <span class=""reference-text"">Steering for the 1995 ""<a href=""/wiki/History_of_autonomous_cars#1990s"" class=""mw-redirect"" title=""History of autonomous cars"">No Hands Across America </a>"" re...",0,other,230,kaggle
3,"</span> <span class=""reference-text""><cite class=""citation web""><a rel=""nofollow"" class=""external text"" href=""https://www.mileseducation.com/finance/artificial_intelligence"">""Miles Education | Fut...",0,maybe_polyglot,392,kaggle
4,"</span>. <a href=""/wiki/Digital_object_identifier"" title=""Digital object identifier"">doi </a>:<a rel=""nofollow"" class=""external text"" href=""https://doi.org/10.1016%2FS0921-8890%2805%2980025-9"">10....",0,other,419,kaggle
5,"<li id=""cite_note-118""><span class=""mw-cite-backlink""><b><a href=""#cite_ref-118"">^ </a> </b>",0,other,92,kaggle
6,"<li><a href=""/wiki/Contextualism"" title=""Contextualism"">Contextualism </a> </li>",0,other,80,kaggle
7,"<li id=""cite_note-Representing_causation-95""><span class=""mw-cite-backlink"">^ <a href=""#cite_ref-Representing_causation_95-0""><sup><i><b>a </b> </i> </sup> </a> <a href=""#cite_ref-Representing_cau...",0,other,243,kaggle
8,"<tr><td class=""plainlist"" style=""padding:0 0.1em 0.4em"">",0,other,56,kaggle
9,</span>,0,other,7,kaggle
