In [1]:
import pandas as pd
df = pd.read_csv('phishtank_es.csv', encoding='latin-1')
print(df.head())


   phish_id                                                url  \
0   9152757  https://interiakastliveresvbrokers.com/?token=...   
1   9152752                    http://accessrevoke1.vercel.app   
2   9152753                  https://accessrevoke1.vercel.app/   
3   9152737       https://interiakastliverescbrokers.com/login   
4   9152736  https://interiakastliverescbrokers.com/?token=...   

                                    phish_detail_url  \
0  http://www.phishtank.com/phish_detail.php?phis...   
1  http://www.phishtank.com/phish_detail.php?phis...   
2  http://www.phishtank.com/phish_detail.php?phis...   
3  http://www.phishtank.com/phish_detail.php?phis...   
4  http://www.phishtank.com/phish_detail.php?phis...   

             submission_time verified          verification_time online target  
0  2025-07-11T09:46:33+00:00      yes  2025-07-11T09:52:02+00:00    yes  Other  
1  2025-07-11T09:09:45+00:00      yes  2025-07-11T09:21:34+00:00    yes  Other  
2  2025-07-11T09:09:45+

In [2]:
es_urls = df[df['url'].str.contains('.es', case = False, na = False)]
print(f"Número de URLs .es: {len(es_urls)}")
print(es_urls[['url', 'target']].head())

Número de URLs .es: 12666
                                                 url target
0  https://interiakastliveresvbrokers.com/?token=...  Other
1                    http://accessrevoke1.vercel.app  Other
2                  https://accessrevoke1.vercel.app/  Other
3       https://interiakastliverescbrokers.com/login  Other
4  https://interiakastliverescbrokers.com/?token=...  Other


In [3]:
import re

In [4]:
def es_dominio_es(url):
    m = re.match(r'https?://([^/]+)/?', url)
    if m:
        dominio = m.group(1)
        return dominio.lower().endswith('.es')
    return False

In [8]:
df['es_dominio_es'] = df['url'].apply(es_dominio_es)
solo_es = df[df['es_dominio_es']]
print(f"Dominios .es reales: {len(solo_es)}")
print(solo_es[['url', 'target']])


Dominios .es reales: 41
                                                     url  \
369               https://lr4.qkipikpp.es/@R7IdcUm25R12/   
395         https://muqdns84x.hrcbods.es/6U7CY6ih!5ulyi/   
396       https://muqd22ns84x.hrcbods.es/6U7CY6ih!5ulyi/   
397      https://muqd223ns84x.hrcbods.es/6U7CY6ih!5ulyi/   
407    https://w7l3d.cclaccmg.es/no9DGJ!tq84Oi0HKQ8/*...   
451    https://gtwky.caniqqm.es/f8qrRCRAr6P!lhN5Qx/*c...   
507    https://consultapsicologo.es/Protect/Sites/ind...   
713     http://enhd.es/smart/?&amp;id=phishing@d3lab.net   
773        https://asaderorondasur.es/?validtrue=a@b.com   
928                          https://helpsecureme.com.es   
947            https://ait.ptootlj.es/Y!xvw0MY9SQvFFgE8/   
1026                     https://infotecnikas.es/re.html   
1215            https://9vbeh.tujaeiq.es/4z@OXY7igtOd8V/   
1299   https://towsl.msyoxvxe.es/p5EtIv0OSrv!7/$YWRya...   
1321   https://beneficiobrad-220525080321.braprograma...   
1477   https://v

In [10]:
empresas_es = [
    'caixabank', 'bbva', 'santander', 'cajamar', 'sabadel', 'unicaja',
    'bankia', 'liberbank', 'ibercaja', 'openbank', 'abanca', 'banco popular',
    'correos', 'aeat', 'sepe', 'seg-social', 'dgt', 'mapfre', 'renfe', 'movistar', 
    'orange', 'jazztel', 'endesa', 'iberdrola', 'sum', 'adif', 'osakidetza', 'cita previa'
]


In [12]:
mask_empresas = df['url'].str.contains('|'.join(empresas_es), case = False, na = False)
empresas_urls = df[mask_empresas]
print(f"Número de URLs con empresas españolas: {len(empresas_urls)}")
print(empresas_urls[['url', 'target']])

Número de URLs con empresas españolas: 218
                                                     url  target
287    https://docs.google.com/presentation/d/e/2PACX...   Other
632                https://orangeinfos.godaddysites.com/   Other
663    https://sumup.ricas.eu/public/pages/?d=en&p=index   Other
664    https://sumup.ricas.eu/public/pages/?d=en&amp;...   Other
811    https://u53661358.ct.sendgrid.net/ls/click?upn...   Other
...                                                  ...     ...
12670                 https://orange65.godaddysites.com/  Orange
12671        https://messagerieorange5.godaddysites.com/   Other
12672                 https://orange69.godaddysites.com/   Other
12673                 https://orange14.godaddysites.com/   Other
12674            https://accueilorange.godaddysites.com/   Other

[218 rows x 2 columns]


In [13]:
for entidad in empresas_es:
    n = df['url'].str.contains(entidad, case = False, na = False).sum()
    if n > 0:
        print(f'{entidad}: {n} URLs')

caixabank: 2 URLs
bbva: 6 URLs
santander: 2 URLs
correos: 18 URLs
sepe: 3 URLs
dgt: 40 URLs
orange: 76 URLs
sum: 70 URLs
adif: 1 URLs


In [15]:
for entidad in ['orange', 'sum', 'dgt', 'correos']:
    print(f'\n{entidad.upper()}:')
    subset = df[df['url'].str.contains(entidad, case=False, na=False)]
    print(subset[['url', 'target']].head(10))



ORANGE:
                                                    url  target
632               https://orangeinfos.godaddysites.com/   Other
833     https://messagerie-orange-pro.mystrikingly.com/  Orange
951           https://service-orange1.godaddysites.com/  Orange
1229        https://vocalorangeau0707.godaddysites.com/  Orange
1235       https://orange-connexion38.godaddysites.com/  Orange
1289        https://orange-connexion7.godaddysites.com/  Orange
1545  https://votrelignetelephoniqueorange.godaddysi...  Orange
1768             https://orange-mail9.godaddysites.com/   Other
1770             https://orange_mail1.godaddysites.com/   Other
1776                https://smsorange.godaddysites.com/  Orange

SUM:
                                                    url target
287   https://docs.google.com/presentation/d/e/2PACX...  Other
663   https://sumup.ricas.eu/public/pages/?d=en&p=index  Other
664   https://sumup.ricas.eu/public/pages/?d=en&amp;...  Other
811   https://u53661358.ct.se