In [17]:
# Importar librerías necesarias
import pandas as pd
import numpy as np

In [18]:
# Load the original transactions dataset (CSV expected in the working directory)
# and print its first rows as a quick sanity check.
df = pd.read_csv('credit_dataset_2_transaccion.csv')
print("Dataset cargado con éxito. Primeras filas:")
print(df.head())

Dataset cargado con éxito. Primeras filas:
   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0        2987000        0          86400            68.5         W  13926   
1        2987001        0          86401            29.0         W   2755   
2        2987002        0          86469            59.0         W   4663   
3        2987003        0          86499            50.0         W  18132   
4        2987004        0          86506            50.0         H   4497   

   card2  card3       card4  card5  ... V330  V331  V332  V333  V334 V335  \
0    NaN  150.0    discover  142.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
1  404.0  150.0  mastercard  102.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
2  490.0  150.0        visa  166.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
3  567.0  150.0  mastercard  117.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
4  514.0  150.0  mastercard  102.0  ...  0.0   0.0   0.0   0.0   0.0  0.0   

  V336  V337  V338  V339  
0  N

In [19]:
# Day index of each transaction: TransactionDT divided by the number of seconds
# in a day, floored.
# NOTE(review): despite its name, 'uid_td_D1' does not subtract the D1 column
# in this cell — confirm that grouping by plain transaction day is intended.
df['uid_td_D1'] = np.floor(df['TransactionDT'] / (24 * 60 * 60))

# uid = smallest TransactionID inside each (card1, uid_td_D1) group.
# A single assignment is enough: no NaN placeholder is needed beforehand and
# the groupby does not have to be evaluated twice.
df['uid'] = df.groupby(['card1', 'uid_td_D1'])['TransactionID'].transform('min')

print("Columna 'uid' generada con éxito. Primeras filas:")
print(df[['TransactionID', 'uid']].head())

Columna 'uid' generada con éxito. Primeras filas:
   TransactionID      uid
0        2987000  2987000
1        2987001  2987001
2        2987002  2987002
3        2987003  2987003
4        2987004  2987004


In [20]:
# Export the whole DataFrame (every column currently in `df`, including the
# new 'uid' column) to CSV without writing the index.
df.to_csv('credit_dataset_2_transaccion_with_uid.csv', index=False)
print("Archivo exportado: credit_dataset_2_transaccion_with_uid.csv")

Archivo exportado: credit_dataset_2_transaccion_with_uid.csv


In [10]:
# VERSIÓN MÁS COMPLETA
# Código adaptado para generar UID a partir de:
# credit_dataset_2_transaccion.csv (debe estar en la misma carpeta)
# Basado en el script original (script generar uid.txt).

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# -------------------------
# 1) Lectura / carga
# -------------------------
def load_csv(path='credit_dataset_2_transaccion.csv', nrows=None, local_test=False):
    """
    Load the transactions CSV into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV, forwarded to ``pd.read_csv``.
    nrows : int or None
        Maximum number of rows to read (``None`` reads everything).
    local_test : bool
        When True, at most 10000 rows are loaded for quick experiments.

    Returns
    -------
    pandas.DataFrame

    Notes
    -----
    In local-test mode the row cap is handed to ``read_csv`` itself, so the
    rest of the file is never parsed. Column dtypes are therefore inferred
    from the rows actually read.
    """
    local_test_rows = 10000
    if local_test:
        # Combine the caller's limit with the local-test cap *before* reading.
        nrows = local_test_rows if nrows is None else min(nrows, local_test_rows)

    print("Leyendo:", path)
    df = pd.read_csv(path, nrows=nrows)
    if local_test:
        print("Modo local_test: tomando primeras", len(df), "filas")
    return df

# -------------------------
# 2) Preprocesamiento base (crea campos uid_td_*, DT_day, fixes)
# -------------------------
def preprocess_for_uid(df):
    """
    Add the helper columns used by the uid-assignment functions.

    The input is copied; the caller's frame is not modified. Columns added:

    - ``full_addr``: ``addr1`` and ``addr2`` joined with ``_`` as strings
      (NaN when either column is absent).
    - ``uid_td_<D>`` for D in D1, D2, D3, D5, D10, D11, D15:
      ``floor(TransactionDT / 86400 - D) + 1000``; NaN where either operand is
      missing, or for the whole column when ``D`` or ``TransactionDT`` is absent.
    - ``DT_day``: ``floor(TransactionDT / 86400) + 1000`` (NaN without
      ``TransactionDT``).
    - ``TransactionAmt_fix`` / ``V313_fix``: source column rounded to 2
      decimals (0.0 when the source column is absent).
    - ``uid``: placeholder, reset to NaN.
    - For every ``V1``..``V339`` column (except ``V313``) that holds at least
      one value with a fractional part: ``<V>_fix_ground`` (rounded to 2
      decimals) and ``<V>_fix`` (``<V>_fix_ground + TransactionAmt_fix``).

    Returns
    -------
    (pandas.DataFrame, list[str])
        The augmented copy and the names of the fractional V columns found.
    """
    seconds_per_day = 24 * 60 * 60
    day_offset = 1000

    df = df.copy()

    # full_addr
    if 'addr1' in df.columns and 'addr2' in df.columns:
        df['full_addr'] = df['addr1'].astype(str) + '_' + df['addr2'].astype(str)
    else:
        df['full_addr'] = np.nan

    # TransactionDT expressed in (fractional) days. Computed once and guarded,
    # so a frame without TransactionDT yields NaN columns instead of a KeyError.
    has_dt = 'TransactionDT' in df.columns
    dt_in_days = df['TransactionDT'] / seconds_per_day if has_dt else None

    # D* based day offsets; NaN in either operand propagates to the result.
    for col in ['D1', 'D2', 'D3', 'D5', 'D10', 'D11', 'D15']:
        new_col = 'uid_td_' + col
        if has_dt and col in df.columns:
            df[new_col] = np.floor(dt_in_days - df[col]) + day_offset
        else:
            df[new_col] = np.nan

    # DT_day
    if has_dt:
        df['DT_day'] = np.floor(dt_in_days) + day_offset
    else:
        df['DT_day'] = np.nan

    # Rounded copies
    if 'TransactionAmt' in df.columns:
        df['TransactionAmt_fix'] = df['TransactionAmt'].round(2)
    else:
        df['TransactionAmt_fix'] = 0.0

    if 'V313' in df.columns:
        df['V313_fix'] = df['V313'].round(2)
    else:
        df['V313_fix'] = 0.0

    # Placeholder uid
    df['uid'] = np.nan

    # Detect V columns carrying fractional values (V313 is handled above).
    v_cols = []
    for i in range(1, 340):
        col = f'V{i}'
        if col == 'V313' or col not in df.columns:
            continue
        col_vals = df[col].fillna(0)
        # Truncation instead of an int cast: an int cast raises on +/-inf.
        # inf - inf is NaN, which compares False and is not counted.
        has_fraction = ((col_vals - np.trunc(col_vals)).abs() > 0).any()
        if has_fraction:
            v_cols.append(col)
            df[col + '_fix_ground'] = df[col].round(2)
            df[col + '_fix'] = df[col + '_fix_ground'] + df['TransactionAmt_fix']

    return df, v_cols

# -------------------------
# 3) Algoritmos principales para asignar uid (versión simplificada)
# -------------------------
def assign_single_items(df):
    """
    Split off the transactions that are alone in their (card1, uid_td_D1) group.

    Such "single" rows receive ``uid = TransactionID``. The input frame is
    not modified.

    Returns
    -------
    (single_items, all_items)
        ``single_items``: the single rows with their uid set.
        ``all_items``: every row whose TransactionID is not among the singles.

    Raises
    ------
    ValueError
        If card1, uid_td_D1 or TransactionID is missing.
    """
    needed = ('card1', 'uid_td_D1', 'TransactionID')
    if any(name not in df.columns for name in needed):
        raise ValueError("Faltan columnas necesarias (card1, uid_td_D1, TransactionID)")

    frame = df.copy()
    # Size of each row's group, stored in a temporary column.
    frame['count_tmp'] = (
        frame.groupby(['card1', 'uid_td_D1'])['TransactionID'].transform('count')
    )

    alone = frame['count_tmp'] == 1
    single_items = frame.loc[alone].copy()
    if not single_items.empty:
        single_items['uid'] = single_items['TransactionID']

    not_single = ~frame['TransactionID'].isin(single_items['TransactionID'])
    all_items = frame.loc[not_single].copy()

    # The temporary counter must not leak into either result.
    return (
        single_items.drop(columns=['count_tmp'], errors='ignore'),
        all_items.drop(columns=['count_tmp'], errors='ignore'),
    )

def assign_first_appearances(all_items):
    """
    Pick the first row of every (card1, uid_td_D1) group, in frame order,
    and give it ``uid = TransactionID``.

    The input frame is not modified; only the selected rows are returned.

    Raises
    ------
    ValueError
        If card1, uid_td_D1 or TransactionID is missing.
    """
    needed = ('card1', 'uid_td_D1', 'TransactionID')
    if any(name not in all_items.columns for name in needed):
        raise ValueError("Faltan columnas necesarias (card1, uid_td_D1, TransactionID)")

    frame = all_items.copy()
    # Position of each row inside its group; 0 marks the first appearance.
    frame['cumc'] = frame.groupby(['card1', 'uid_td_D1']).cumcount()

    first_df = frame.loc[frame['cumc'] == 0].copy()
    if not first_df.empty:
        first_df['uid'] = first_df['TransactionID']

    return first_df.drop(columns=['cumc'], errors='ignore')

def _item_matches_any_v_col(item, candidates, v_cols):
    """Return True when `item` equals some candidate row on at least one V column.

    For each name in `v_cols` the rounded ``<col>_fix_ground`` column is used
    when it is present on both sides and non-null on the item; otherwise the
    raw column is tried under the same conditions.
    """
    for col in v_cols:
        ground = col + '_fix_ground'
        if ground in item.index and ground in candidates.columns and pd.notna(item[ground]):
            if (candidates[ground] == item[ground]).any():
                return True
        elif col in item.index and col in candidates.columns and pd.notna(item[col]):
            if (candidates[col] == item[col]).any():
                return True
    return False


def append_item_to_uid(nan_df, full_df, v_cols=None):
    """
    Try to attach each unassigned transaction to an existing uid.

    For every row of `nan_df`, candidate rows of `full_df` are those with
    the same ``card1`` and ``uid_td_D1``, a smaller ``TransactionID`` and
    ``DT_day <= item['uid_td_D3'] + 1``. Candidates are then narrowed:

    - for ``addr2`` then ``addr1``: when the item has a non-null value and the
      column exists in `full_df`, keep candidates whose value is equal or NaN;
    - when `v_cols` is non-empty, at least one of those V columns must
      coincide between the item and some candidate.

    The item is assigned only if the surviving candidates carry exactly one
    distinct non-null uid.

    Parameters
    ----------
    nan_df : pandas.DataFrame
        Rows still lacking a uid. Needs card1, uid_td_D1, TransactionID,
        uid_td_D3.
    full_df : pandas.DataFrame
        Rows that already have a uid. Needs card1, uid_td_D1, TransactionID,
        DT_day, uid.
    v_cols : list[str] or None
        V column names used for the optional coincidence check.

    Returns
    -------
    pandas.DataFrame
        Columns ``TransactionID`` and ``uid`` for the assigned items. Empty
        when nothing could be assigned or when a required column is missing;
        in the latter case a notice is printed and no exception is raised.
    """
    result_columns = ['TransactionID', 'uid']
    if v_cols is None:
        v_cols = []

    # Validate each frame against the columns it is actually read for, and
    # report the problem once instead of silently skipping every item.
    missing_items = sorted({'card1', 'uid_td_D1', 'TransactionID', 'uid_td_D3'} - set(nan_df.columns))
    missing_pool = sorted({'card1', 'uid_td_D1', 'TransactionID', 'DT_day', 'uid'} - set(full_df.columns))
    if missing_items or missing_pool:
        print("Aviso append_item_to_uid: faltan columnas requeridas -",
              "nan_df:", missing_items, "full_df:", missing_pool)
        return pd.DataFrame(columns=result_columns)

    results = []
    for _, item in nan_df.iterrows():
        try:
            mask = (
                (full_df['card1'] == item['card1']) &
                (full_df['uid_td_D1'] == item['uid_td_D1']) &
                (full_df['TransactionID'] < item['TransactionID']) &
                (full_df['DT_day'] <= (item['uid_td_D3'] + 1))
            )
        except (TypeError, ValueError):
            # Values that cannot be compared (e.g. mixed object dtypes):
            # best effort, skip this item.
            continue

        df_masked = full_df[mask]
        if df_masked.empty:
            continue

        # Address filter; skipped when full_df does not carry the column at all.
        for col in ('addr2', 'addr1'):
            if col in item.index and col in df_masked.columns and pd.notna(item[col]):
                df_masked = df_masked[df_masked[col].isna() | (df_masked[col] == item[col])]
                if df_masked.empty:
                    break
        if df_masked.empty:
            continue

        # With V columns supplied, at least one coincidence is mandatory.
        if len(v_cols) > 0 and not _item_matches_any_v_col(item, df_masked, v_cols):
            continue

        unique_uids = df_masked['uid'].dropna().unique()
        if len(unique_uids) == 1:
            results.append((item['TransactionID'], unique_uids[0]))

    if len(results) == 0:
        return pd.DataFrame(columns=result_columns)
    return pd.DataFrame(results, columns=result_columns)

def find_multigroup(nan_df, full_df):
    """
    List, for each unassigned item, the uids of plausible earlier groups.

    A row of `full_df` is a candidate when it shares ``card1`` and
    ``uid_td_D1`` with the item, has a smaller ``TransactionID`` and its
    ``DT_day`` equals the item's ``uid_td_D3`` or differs from it by exactly
    one. Items without any candidate row are omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``TransactionID`` and ``multi_uid`` (list of the distinct
        non-null candidate uids).
    """
    columns = ['TransactionID', 'multi_uid']
    records = []

    for _, row in nan_df.iterrows():
        same_group = (
            (full_df['card1'] == row['card1']) &
            (full_df['uid_td_D1'] == row['uid_td_D1'])
        )
        earlier = full_df['TransactionID'] < row['TransactionID']

        day = full_df['DT_day']
        d3 = row['uid_td_D3']
        near_day = (day == d3 + 1) | (day == d3 - 1) | (day == d3)

        candidates = full_df[same_group & earlier & near_day]
        if candidates.empty:
            continue
        uids = list(candidates['uid'].dropna().unique())
        records.append((row['TransactionID'], uids))

    if not records:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(records, columns=columns)

def find_right_uid(possible_groups, test_item, full_df, v_cols=None):
    """
    Choose, among candidate uids, the one whose rows best agree with `test_item`.

    Each candidate uid is scored over its rows in `full_df`: a weighted
    feature adds its weight when the item's non-null value appears among the
    group's non-null values, and every name in `v_cols` adds 1 when the
    same holds for its ``<col>_fix_ground`` column. Candidates with no rows in
    `full_df` are not scored.

    Returns
    -------
    The highest scoring uid (on a tie, the one scored first), or ``np.nan``
    when no candidate could be scored.
    """
    v_cols = [] if v_cols is None else v_cols

    features_weight = {
        'TransactionAmt': 2,
        'card2': 1, 'card3': 1, 'card4': 1, 'card5': 1, 'card6': 1,
        'uid_td_D2': 2, 'uid_td_D10': 2, 'uid_td_D11': 2, 'uid_td_D15': 2,
        'C14': 1, 'addr1': 1, 'addr2': 1, 'P_emaildomain': 1, 'V313_fix': 1
    }

    def _item_value_seen(column, members):
        # True when the item's non-null value occurs among the group's values.
        if column not in test_item.index or column not in members.columns:
            return False
        if not pd.notna(test_item[column]):
            return False
        return test_item[column] in set(members[column].dropna().values)

    scores = {}
    for candidate in possible_groups:
        members = full_df[full_df['uid'] == candidate]
        if members.empty:
            continue
        weighted = sum(
            weight for column, weight in features_weight.items()
            if _item_value_seen(column, members)
        )
        v_hits = sum(
            1 for column in v_cols
            if _item_value_seen(column + '_fix_ground', members)
        )
        scores[candidate] = weighted + v_hits

    if not scores:
        return np.nan
    # max() keeps the first key reaching the top score, i.e. the earliest
    # candidate wins a tie.
    return max(scores, key=scores.get)

# -------------------------
# 4) Orquestador: genera uid para todo el df
# -------------------------
def _rows_with_assigned_uid(all_items, assigned):
    """Return the rows of `all_items` listed in `assigned`, with 'uid' set from it.

    `assigned` has columns TransactionID and uid; when a TransactionID is
    repeated, the last entry is used.
    """
    uid_by_tid = (
        assigned.drop_duplicates('TransactionID', keep='last')
        .set_index('TransactionID')['uid']
    )
    rows = all_items[all_items['TransactionID'].isin(uid_by_tid.index)].copy()
    rows['uid'] = rows['TransactionID'].map(uid_by_tid)
    return rows


def generate_uids_from_df(df, v_cols=None, rounds_assign=5, verbose=True):
    """
    Run the whole uid assignment on `df` and return it with a 'uid' column.

    Steps:
      1. preprocess a working copy (``preprocess_for_uid``);
      2. rows alone in their (card1, uid_td_D1) group get uid = TransactionID;
      3. the first row of every remaining group gets uid = TransactionID;
      4. up to `rounds_assign` passes of ``append_item_to_uid`` attach
         further rows to already assigned ones (stops early when a pass
         assigns nothing);
      5. rows still unassigned go through ``find_multigroup`` /
         ``find_right_uid``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain TransactionID, card1, TransactionDT and D1.
    v_cols : list[str] or None
        V columns for the coincidence checks; when None, the columns detected
        by the preprocessing step are used.
    rounds_assign : int
        Maximum number of passes in step 4.
    verbose : bool
        Print progress counters.

    Returns
    -------
    pandas.DataFrame
        `df` left-joined with the computed uids. Non-null values of a
        pre-existing 'uid' column in `df` are kept; everything else takes
        the computed uid. Rows that could not be resolved keep NaN.

    Raises
    ------
    ValueError
        If a required column is missing.
    """
    # Preconditions
    required_cols = ['TransactionID', 'card1', 'TransactionDT', 'D1']
    for c in required_cols:
        if c not in df.columns:
            raise ValueError(f"Falta columna requerida: {c}")

    # Working copy plus helper columns.
    work_df = df.copy().reset_index(drop=True)
    work_df, detected_v_cols = preprocess_for_uid(work_df)
    if v_cols is None:
        v_cols = detected_v_cols

    # 1) single items
    single_items, all_items = assign_single_items(work_df)
    if verbose:
        print("Single transactions:", len(single_items))

    # 2) first appearances (one per (card1, uid_td_D1)); they seed full_df,
    #    the pool of rows that already own a uid.
    first_df = assign_first_appearances(all_items)
    if verbose:
        print("First appearances assigned:", len(first_df))
    full_df = first_df.sort_values(by='TransactionID').reset_index(drop=True)

    # 3) iterative 1-to-1 matching against the growing pool
    for r in range(rounds_assign):
        nan_df = all_items[~all_items['TransactionID'].isin(full_df['TransactionID'])]
        if verbose:
            print(f"Ronda {r} - items por intentar asignar:", len(nan_df))
        if len(nan_df) == 0:
            break
        assigned = append_item_to_uid(nan_df, full_df, v_cols=v_cols)
        if assigned.empty:
            if verbose:
                print("No hay asignaciones en esta ronda.")
            break
        # The matched rows inherit the uid that was found for them
        # (not their own TransactionID) and join the pool.
        to_append = _rows_with_assigned_uid(all_items, assigned)
        full_df = pd.concat([full_df, to_append]).sort_values(by='TransactionID').reset_index(drop=True)
        if verbose:
            print("Assigned items this round:", len(to_append))

    # 4) multigroup resolution for whatever is still outside the pool
    remaining = all_items[~all_items['TransactionID'].isin(full_df['TransactionID'])]
    if verbose:
        print("Items remaining antes de multigroup:", len(remaining))
    if len(remaining) > 0:
        mg = find_multigroup(remaining, full_df)
        resolved = []
        for test_id, possible in zip(mg['TransactionID'], mg['multi_uid']):
            test_item = all_items[all_items['TransactionID'] == test_id].iloc[0]
            chosen = find_right_uid(possible, test_item, full_df, v_cols=v_cols)
            if pd.notna(chosen):
                resolved.append((test_id, chosen))
        if len(resolved) > 0:
            tmp = pd.DataFrame(resolved, columns=['TransactionID', 'uid'])
            to_append = _rows_with_assigned_uid(all_items, tmp)
            full_df = pd.concat([full_df, to_append]).sort_values(by='TransactionID').reset_index(drop=True)
            if verbose:
                print("Multigroup resolved:", len(to_append))

    # 5) bring the computed uids back onto the caller's frame.
    # The computed column is renamed explicitly before merging so that it
    # exists under a known name whether or not `df` already has 'uid'.
    tmp_name = '__uid_assigned__'
    final = pd.concat([full_df, single_items])
    computed = (
        final[['TransactionID', 'uid']]
        .drop_duplicates('TransactionID')
        .rename(columns={'uid': tmp_name})
    )
    merged = df.merge(computed, on='TransactionID', how='left')
    if 'uid' in df.columns:
        # Prefer a uid the caller already had; fill the gaps with ours.
        merged['uid'] = merged['uid'].fillna(merged[tmp_name])
    else:
        merged['uid'] = merged[tmp_name]
    merged = merged.drop(columns=[tmp_name])

    if verbose:
        assigned_total = merged['uid'].notna().sum()
        print("UIDs asignados (total):", assigned_total, "/", len(merged))
    return merged

# -------------------------
# 5) Export helper
# -------------------------
def export_uids_df(df_with_uid, output_path='uids_full_from_csv.csv'):
    """
    Write the ``TransactionID`` and ``uid`` columns to `output_path` as CSV
    (no index column) and return `output_path`.
    """
    wanted = ['TransactionID', 'uid']
    df_with_uid[wanted].to_csv(output_path, index=False)
    print("Exportado:", output_path)
    return output_path

# -------------------------
# 6) FUNCION PRINCIPAL - uso directo
# -------------------------
def run_pipeline(csv_in='credit_dataset_2_transaccion.csv',
                 csv_out='uids_full_from_csv.csv',
                 local_test=False,
                 rounds_assign=5,
                 verbose=True):
    """
    End-to-end helper: load `csv_in`, preprocess it, assign uids and export
    ``TransactionID`` + ``uid`` to `csv_out`.

    Returns
    -------
    (pandas.DataFrame, str)
        The frame returned by ``generate_uids_from_df`` and the export path.

    NOTE(review): ``generate_uids_from_df`` preprocesses its input again, so
    the preprocessing work is done twice here; kept as is because the returned
    frame is built from the preprocessed input.
    """
    raw = load_csv(csv_in, local_test=local_test)
    prepared, detected_v_cols = preprocess_for_uid(raw)
    result = generate_uids_from_df(
        prepared,
        v_cols=detected_v_cols,
        rounds_assign=rounds_assign,
        verbose=verbose,
    )
    return result, export_uids_df(result, output_path=csv_out)

# =====================================================================
# USO:
# - Pegar este bloque en una celda de Jupyter (mismo folder que tu CSV)
# - Ejecutar:
#     result_df, path = run_pipeline(local_test=False, rounds_assign=5, csv_in='credit_dataset_2_transaccion.csv')
# - El CSV producido se llamará por defecto 'uids_full_from_csv.csv'
# =====================================================================


In [14]:
import numpy as np
import pandas as pd

def load_csv_light(path='credit_dataset_2_transaccion.csv'):
    """Announce the path being read, then return the CSV as a DataFrame."""
    print("Leyendo:", path)
    return pd.read_csv(path)

def generate_uid_as_tid(df):
    """
    Lightweight uid: group rows by (card1, addr1, D1n) and use the smallest
    TransactionID of each group as its uid.

    ``D1n`` is ``floor(TransactionDT / 86400) - D1``. The group key is built
    from the string form of the three values joined with ``_``. Works on a
    copy and returns it with the extra ``D1n`` and ``uid`` columns.
    """
    out = df.copy()

    seconds_per_day = 24 * 60 * 60
    out['D1n'] = np.floor(out['TransactionDT'] / seconds_per_day) - out['D1']

    # String key identifying the group each row belongs to.
    card_txt = out['card1'].astype(str)
    addr_txt = out['addr1'].astype(str)
    d1n_txt = out['D1n'].astype(str)
    group_key = card_txt + '_' + addr_txt + '_' + d1n_txt

    # Every row receives the minimum TransactionID found in its group.
    out['uid'] = out.groupby(group_key)['TransactionID'].transform('min')
    return out

def export_uid(df, path='uids_light_tid.csv'):
    """Save the ``TransactionID`` / ``uid`` pair as CSV (no index) and return `path`."""
    columns_to_save = ['TransactionID', 'uid']
    selection = df[columns_to_save]
    selection.to_csv(path, index=False)
    print("Exportado:", path)
    return path

def run_pipeline_light_tid(csv_in='credit_dataset_2_transaccion.csv',
                           csv_out='uids_light_tid.csv'):
    """
    Lightweight pipeline: read `csv_in`, derive the uid with
    ``generate_uid_as_tid`` and export it to `csv_out`.

    Returns the frame with its uid column and the export path.
    """
    df_uid = generate_uid_as_tid(load_csv_light(csv_in))
    return df_uid, export_uid(df_uid, csv_out)

# Uso:
# result_df, path = run_pipeline_light_tid()


In [15]:
# Run the lightweight uid pipeline with its default input and output CSV paths;
# keeps the resulting frame and the path of the exported file.
result_df, path = run_pipeline_light_tid()

Leyendo: credit_dataset_2_transaccion.csv
Exportado: uids_light_tid.csv
