## Paqueterías

In [None]:
import pandas as pd
import numpy as np
import zipfile
from google.colab import drive

# Librerias auxiliares
import unicodedata
from sklearn.model_selection import train_test_split
import re

## Funciones

In [None]:
def completitud(df):
    """Summarise how complete each column of ``df`` is.

    Returns a DataFrame with one row per column of ``df`` and three columns:
    ``variable`` (column name), ``total`` (number of null values) and
    ``completitud`` (percentage of non-null rows). Rows are sorted from the
    least to the most complete column and the index is renumbered from 0.
    """
    null_counts = df.isnull().sum()
    report = pd.DataFrame(null_counts).reset_index()
    report = report.rename(columns={"index": "variable", 0: "total"})
    missing_share = report["total"] / df.shape[0]
    report["completitud"] = (1 - missing_share) * 100
    report = report.sort_values(by="completitud", ascending=True)
    return report.reset_index(drop=True)

def clean_text(text, pattern="[^a-zA-Z0-9]"):
    """Normalise a string to lowercase ASCII words separated by single spaces.

    The text is NFD-decomposed and every non-ASCII code point is dropped
    (which removes accents), each match of ``pattern`` is replaced by a
    space, and the result is lowercased with whitespace runs collapsed.
    ``text`` must be a ``str``.
    """
    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode("utf-8")
    spaced = re.sub(pattern, " ", ascii_text, flags=re.UNICODE)
    # split() without arguments already discards leading/trailing whitespace.
    words = spaced.lower().split()
    return u' '.join(words)

def clean_cat(var):
    """Return ``str(var)`` lowercased, trimmed and with whitespace runs collapsed."""
    lowered = str(var).lower()
    return " ".join(lowered.split())

def replace_nan(df, col, nan):
    """Replace, in place, every value of ``df[col]`` equal to ``nan`` with ``np.nan``.

    ``nan`` is the placeholder used in the data to mark a missing value.
    The function modifies ``df`` and returns ``None``.
    """
    def _to_missing(value):
        if value == nan:
            return np.nan
        return value

    df[col] = df[col].map(_to_missing)
    
def convert_to_nan(df,var):
    """Set to ``np.nan``, in place, the values of ``df[var]`` that carry no letters.

    A row is blanked when ``str(value)`` is made only of digits, or when it
    contains no alphabetic character at all. Returns ``None``.
    """
    values = df[var]
    only_digits = values.map(lambda x: str(x).isdigit())
    has_letter = values.map(lambda x: any(ch.isalpha() for ch in str(x)))

    # Index labels of both groups of rows, concatenated in this order.
    digit_labels = list(df[only_digits][var].index)
    letterless_labels = list(df[~has_letter].index)
    df.loc[digit_labels + letterless_labels, var] = np.nan

def is_digit(df,col, flag):
    """Filter the rows of ``df`` by whether ``df[col]`` is made only of digits.

    Each value is tested with ``str(value).isdigit()``.

    Args:
        df: DataFrame to filter.
        col: name of the column to test.
        flag: 1 to keep the rows whose value is all digits, 0 to keep the rest.

    Returns:
        The selected rows of ``df``.

    Raises:
        ValueError: if ``flag`` is neither 0 nor 1.
    """
    # Reject an unsupported flag up front instead of failing later on an
    # unassigned result variable.
    if flag not in (0, 1):
        raise ValueError("Valor no admitido")

    mask = df[col].map(lambda x: str(x).isdigit())
    if flag == 1:
        return df[mask]
    return df[~mask]

        
def is_alpha(df,col, flag):
    """Filter the rows of ``df`` by whether ``df[col]`` contains any letter.

    A value matches when at least one character of ``str(value)`` is
    alphabetic.

    Args:
        df: DataFrame to filter.
        col: name of the column to test.
        flag: 1 to keep the rows containing a letter, 0 to keep the rest.

    Returns:
        The selected rows of ``df``.

    Raises:
        ValueError: if ``flag`` is neither 0 nor 1.
    """
    # Reject an unsupported flag up front instead of failing later on an
    # unassigned result variable.
    if flag not in (0, 1):
        raise ValueError("Valor no admitido")

    mask = df[col].map(lambda x: any(ch.isalpha() for ch in str(x)))
    if flag == 1:
        return df[mask]
    return df[~mask]

def remover(lista, elementos):
    """Remove from ``lista``, in place, one occurrence of each item in ``elementos``.

    Items are removed with ``lista.remove``, so a missing item raises
    ``ValueError``. Returns ``None``.
    """
    discard = lista.remove
    for item in elementos:
        discard(item)

def freq(df:pd.DataFrame,var:list):
    """Print a frequency table for one or more columns of ``df``.

    For every requested column the table is sorted by category value and has
    four columns: ``FA`` (absolute frequency), ``FR`` (relative frequency),
    ``FAA`` (cumulative absolute frequency) and ``FRA`` (cumulative relative
    frequency).

    Args:
        df: data to summarise.
        var: a column name or a list of column names.

    Returns:
        None. The tables are printed.
    """
    if not isinstance(var, list):
        var = [var]
    for v in var:
        aux = df[v].value_counts().to_frame().sort_index()
        aux.columns = ['FA']
        aux['FR'] = aux['FA']/aux['FA'].sum()
        # Cumulative columns are assigned one by one so the result does not
        # depend on how pandas handles multi-column assignment of new names.
        aux['FAA'] = aux['FA'].cumsum()
        aux['FRA'] = aux['FR'].cumsum()
        print(f'****Tabla de frecuencias  {v}  ***\n\n')
        print(aux)
        print("\n"*3)


def imputar(df, col,strategy):
    """Fill the missing values of ``df[col]`` in place with a ``SimpleImputer``.

    Args:
        df: DataFrame to modify. Its index is reset to 0..n-1 (the previous
            index is discarded) as part of the call.
        col: name of the column to impute.
        strategy: imputation strategy, forwarded to ``SimpleImputer``.

    Returns:
        None. ``df`` is modified in place.
    """
    # Imported here because the module-level imports do not bring
    # SimpleImputer into scope.
    from sklearn.impute import SimpleImputer

    imp=SimpleImputer(missing_values=np.nan,strategy=strategy)
    imp.fit(df[[col]])
    df.reset_index(drop=True,inplace=True)
    # transform() returns a 2-D (n, 1) array; flatten it to a single column.
    df[col]=imp.transform(df[[col]]).ravel()
    

def unitarias(df,col, umbral=0.91):
    """Print a warning when one category dominates ``df[col]``.

    Args:
        df: data to inspect.
        col: name of the column to inspect.
        umbral: share above which the most frequent value makes the variable
            count as unitary. Defaults to 0.91.

    Returns:
        None. A message is printed when the threshold is exceeded.
    """
    # Work on the Series itself: the column label that value_counts gives its
    # result differs between pandas versions.
    shares = df[col].value_counts(normalize=True)
    if not shares.empty and shares.iloc[0] > umbral:
        print(f"{col} -- VARIABLE UNITARIA")
            
def unitarias_per(df,col, umbral=0.91):
    """Return the share of each category of ``df[col]`` as a percentage table.

    A warning is printed when the most frequent value exceeds ``umbral``.

    Args:
        df: data to inspect.
        col: name of the column to inspect.
        umbral: share above which the most frequent value makes the variable
            count as unitary. Defaults to 0.91.

    Returns:
        DataFrame with columns ``"<col>_valores"`` (category) and
        ``"%_aparicion"`` (share formatted as a string such as ``"53.63%"``),
        sorted from the most to the least frequent category. The same two
        columns are returned, with no rows, when ``df[col]`` has no values.
    """
    # Work on the Series itself: the column label that value_counts gives its
    # result differs between pandas versions.
    shares = df[col].value_counts(normalize=True)
    if not shares.empty and shares.iloc[0] > umbral:
        print(f"{col} -- VARIABLE UNITARIA")

    percentages = shares.map(lambda x: str(round(x*100,2))+"%")
    return pd.DataFrame({
        col+"_valores": list(shares.index),
        "%_aparicion": list(percentages),
    })


def ssample(df, var, n_sample):
    """Return a sample of ``df`` stratified by the column ``var``.

    ``n_sample`` is forwarded to ``train_test_split`` as ``test_size`` and the
    split uses a fixed ``random_state`` of 3; only the "test" part of the
    split is returned.
    """
    partitions = train_test_split(
        df,
        test_size=n_sample,
        stratify=df[var],
        random_state=3,
    )
    return partitions[1]

def normalizar(df:pd.DataFrame,var:str,umbral:float=0.05)->tuple:
    """Build a mapping that groups the rare categories of a discrete variable.

    Categories whose relative frequency is below ``umbral`` are mapped to
    ``'Otros'``. If all those rare categories together still add up to less
    than ``umbral``, they are mapped to the most frequent category instead.
    Every other category maps to itself, keeping its original type.

    Args:
        df (pd.DataFrame): data holding the discrete variable to normalise.
        var (str): name of the variable.
        umbral (float, optional): minimum relative frequency a category needs
            to be kept on its own. Defaults to 0.05.

    Returns:
        tuple: the variable name and a ``{category: normalised category}``
        dict. The dict is empty when the column has no non-null values.
    """
    # Work on the Series itself: the column label that value_counts gives its
    # result differs between pandas versions.
    shares = df[var].value_counts(normalize=True)
    if shares.empty:
        return var, {}

    is_rare = shares < umbral
    mapping = {
        category: ('Otros' if rare else category)
        for category, rare in is_rare.items()
    }

    # A group of rare categories that is itself below the threshold is folded
    # into the most frequent category (first row of value_counts).
    if shares[is_rare].sum() < umbral:
        top_label = mapping[shares.index[0]]
        mapping = {
            category: (top_label if label == 'Otros' else label)
            for category, label in mapping.items()
        }
    return var, mapping

## Extracción y lectura de datos

In [None]:



# Install the `wget` package and download the dataset from GitHub.
!pip install wget
!wget https://github.com/Dereck125/archivo/raw/master/Credit_Card_Fraud_Detection.csv.xls

# NOTE(review): the file read below is a Google Drive path, not the file
# downloaded above (whose name ends in ".csv.xls"). Presumably Drive has to be
# mounted before this runs — confirm which copy is the intended source.
path = "/content/drive/MyDrive/Credit_Card_Fraud_Detection.csv"

# Load the transactions into the working dataframe.
df=pd.read_csv(path)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Archive:  /content/drive/MyDrive/Credit_Card_Fraud_Detection.csv.zip
replace /content/drive/My Drive/Credit_Card_Fraud_Detection.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/drive/My Drive/Credit_Card_Fraud_Detection.csv  
replace /content/drive/My Drive/__MACOSX/._Credit_Card_Fraud_Detection.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/drive/My Drive/__MACOSX/._Credit_Card_Fraud_Detection.csv  


## Exploración de datos

In [None]:
df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use_Chip,Merchant_Name,Merchant_City,...,City,State,Zipcode,Latitude,Longitude,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,FICO_Score,Num_Credit_Cards
0,1107,2,2020,2,17,16:36,$0.70,Swipe Transaction,-6571010470072147219,Milton,...,Milton,PA,17847,41.0,-76.85,$16935,$34531,$60156,703,4
1,1096,3,2001,7,29,21:00,$31.80,Swipe Transaction,-1288082279022882052,Lebanon,...,Lebanon,OR,97355,44.52,-122.81,$17003,$34671,$55643,786,6
2,580,0,2012,11,1,18:17,$25.09,Swipe Transaction,6826708177432339862,Adrian,...,Adrian,MI,49221,41.89,-84.04,$17144,$34954,$5942,729,3
3,1310,2,2018,1,19,07:37,$4.79,Chip Transaction,4722913068560264812,White Hall,...,Oakdale,LA,71463,30.81,-92.65,$16816,$34287,$77180,760,3
4,1126,3,2011,12,4,09:22,$67.62,Swipe Transaction,7641585028463831554,Baltimore,...,Gwynn Oak,MD,21207,39.32,-76.72,$20641,$24055,$0,733,7


In [None]:
# Size of the dataset

print(f"Número de filas: {len(df.index)}")
print(f"Número de columnas: {len(df.columns)}")

Número de filas: 975476
Número de columnas: 44


In [None]:
# Data type of each column
df.dtypes

User                             int64
Card                             int64
Year                             int64
Month                            int64
Day                              int64
Time                            object
Amount                          object
Use_Chip                        object
Merchant_Name                    int64
Merchant_City                   object
Merchant_State                  object
Zip                            float64
MCC                              int64
Errors?                         object
tgt                             object
Card_Brand                      object
Card_Type                       object
Card_Number                      int64
Expires                         object
CVV                              int64
Has_Chip                        object
Cards_Issued                     int64
Credit_Limit                    object
Acct_Open_Date                  object
Year_PIN_last_Changed            int64
Card_on_Dark_Web         

In [None]:
# Summary of the qualitative (object-typed) variables

df.describe(include=object)

Unnamed: 0,Time,Amount,Use_Chip,Merchant_City,Merchant_State,Errors?,tgt,Card_Brand,Card_Type,Expires,...,Acct_Open_Date,Card_on_Dark_Web,Person,Gender,Address,City,State,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt
count,975476,975476,975476,975476,866574,15403,975476,975476,975476,975476,...,975476,975476,975476,975476,975476,975476,975476,975476,975476,975476
unique,1440,33894,3,9417,173,18,2,4,3,259,...,303,1,1976,2,1982,1280,51,1741,1931,1865
top,13:06,$80.00,Swipe Transaction,ONLINE,CA,Insufficient Balance,No,Mastercard,Debit,02/2020,...,02/2010,No,Beckett Gonzalez,Female,702 Elm Drive,Houston,CA,$0,$34496,$0
freq,1281,10170,615365,108902,103642,9714,974286,523136,601931,29429,...,12097,975476,3337,502124,3337,11884,118411,5334,3337,47467


## Muestra

In [None]:
# Random sample of 1000 rows for quick inspection. The seed is fixed (same
# value ssample uses) so that re-running the notebook shows the same rows.
df_sample=df.sample(1000, random_state=3)

In [None]:
df_sample.head(10)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use_Chip,Merchant_Name,Merchant_City,...,City,State,Zipcode,Latitude,Longitude,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,FICO_Score,Num_Credit_Cards
91698,1029,2,2003,6,15,15:55,$178.18,Swipe Transaction,483490033258680568,Los Angeles,...,Atlanta,TX,75551,33.11,-94.16,$16746,$34143,$49839,648,4
595634,1120,0,2018,10,27,13:11,$-88.00,Chip Transaction,1799189980464955940,West Union,...,Postville,IA,52162,43.08,-91.56,$13332,$27183,$58323,637,1
411141,1824,0,2016,6,23,16:36,$53.29,Chip Transaction,-1046622217034093949,Gheens,...,Donaldsonville,LA,70346,30.09,-90.99,$16474,$27150,$2007,737,1
728062,775,1,2019,12,9,22:30,$31.83,Chip Transaction,-4500542936415012428,Janesville,...,Janesville,WI,53546,42.68,-89.01,$21278,$43386,$2192,698,3
365431,1334,1,2002,4,26,08:43,$12.18,Online Transaction,-2042049018365856408,ONLINE,...,Lakeville,MN,55044,44.67,-93.24,$34186,$57824,$31354,668,5
546715,1434,0,2018,3,3,03:09,$9.90,Chip Transaction,-5467922351692495955,Garland,...,Garland,TX,75040,32.91,-96.62,$18408,$37536,$63386,717,3
239725,805,2,2004,9,7,07:40,$52.98,Swipe Transaction,1799189980464955940,Philadelphia,...,Philadelphia,PA,19145,39.95,-75.16,$18007,$36715,$74022,547,4
625323,59,1,2017,1,28,11:50,$84.92,Online Transaction,4241336128694185533,ONLINE,...,Mount Union,PA,17066,40.38,-77.88,$14650,$29864,$44902,717,2
831790,1490,5,2007,12,14,06:24,$1.31,Swipe Transaction,6098563624419731578,Tacoma,...,Tacoma,WA,98404,47.2,-122.4,$16941,$11410,$0,812,7
482364,910,1,2011,1,6,14:56,$100.00,Swipe Transaction,-4282466774399734331,Newtown,...,Colmar,PA,18915,40.27,-75.26,$26545,$54124,$0,727,2


In [None]:
df.columns


Index(['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 'Use_Chip',
       'Merchant_Name', 'Merchant_City', 'Merchant_State', 'Zip', 'MCC',
       'Errors?', 'tgt', 'Card_Brand', 'Card_Type', 'Card_Number', 'Expires',
       'CVV', 'Has_Chip', 'Cards_Issued', 'Credit_Limit', 'Acct_Open_Date',
       'Year_PIN_last_Changed', 'Card_on_Dark_Web', 'Person', 'Current_Age',
       'Retirement_Age', 'Birth_Year', 'Birth_Month', 'Gender', 'Address',
       'Apartment', 'City', 'State', 'Zipcode', 'Latitude', 'Longitude',
       'Per_Capita_Income_-_Zipcode', 'Yearly_Income_-_Person', 'Total_Debt',
       'FICO_Score', 'Num_Credit_Cards'],
      dtype='object')

## Etiquetado de variables

In [None]:
# Column groups used by the rest of the notebook to decide how each variable
# is cast, cleaned and analysed.

# Identifier

varid=['User']

# Quantitative variables
varc=['Card', 'Cards_Issued','Credit_Limit','Current_Age','Retirement_Age','Per_Capita_Income_-_Zipcode',
      'Yearly_Income_-_Person','Total_Debt','FICO_Score','Num_Credit_Cards','Amount']

# Qualitative variables: categorical
vard= ['Card_Brand', 'Card_Type', 'Has_Chip','Card_on_Dark_Web','Gender','City','State',
       'Zipcode','Use_Chip','Merchant_City','Merchant_State','Zip','MCC','Errors?','tgt']

# Text-like variables
# NOTE(review): Latitude, Longitude and Merchant_Name are numeric in the raw
# data; being listed here they are cast to str and passed through clean_text,
# which replaces "-" and "." with spaces — confirm this grouping is intended.
vartxt=['Person','Address','Apartment','Card_Number','CVV','Month','Day','Year',
        'Time','Merchant_Name','Latitude','Longitude','Year_PIN_last_Changed','Birth_Year','Birth_Month' ]

# Date variables
varf=['Expires','Acct_Open_Date']


In [None]:
df[varid+vartxt+vard+varc+varf].shape[1]

44

In [None]:
df=df[varid+varc+vard+vartxt+varf].copy()

In [None]:
df.head(10)

Unnamed: 0,User,Card,Cards_Issued,Credit_Limit,Current_Age,Retirement_Age,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,FICO_Score,...,Year,Time,Merchant_Name,Latitude,Longitude,Year_PIN_last_Changed,Birth_Year,Birth_Month,Expires,Acct_Open_Date
0,1107,2,2,$8600,32,65,$16935,$34531,$60156,703,...,2020,16:36,-6571010470072147219,41.0,-76.85,2013,1987,11,10/2020,09/2013
1,1096,3,1,$57,50,64,$17003,$34671,$55643,786,...,2001,21:00,-1288082279022882052,44.52,-122.81,2017,1970,2,12/2020,10/1999
2,580,0,1,$14231,67,71,$17144,$34954,$5942,729,...,2012,18:17,6826708177432339862,41.89,-84.04,2010,1952,10,06/2023,10/2002
3,1310,2,2,$13350,48,66,$16816,$34287,$77180,760,...,2018,07:37,4722913068560264812,30.81,-92.65,2011,1971,9,06/2020,02/2006
4,1126,3,2,$14600,84,69,$20641,$24055,$0,733,...,2011,09:22,7641585028463831554,39.32,-76.72,2013,1935,8,10/2021,04/2004
5,1203,2,1,$26277,29,66,$26545,$54122,$166903,796,...,2014,19:24,1108327803852946055,42.03,-88.08,2010,1991,1,07/2023,02/2010
6,1900,1,2,$1200,65,70,$12387,$25257,$83995,734,...,2014,12:53,-4460013545355022869,33.36,-81.28,2010,1954,5,05/2024,11/2010
7,1123,1,1,$7700,51,67,$16823,$34302,$84935,651,...,2016,07:21,-1688244360627004732,31.84,-106.43,2009,1968,12,07/2016,09/2009
8,931,3,1,$41,35,66,$27637,$56350,$120125,808,...,2009,14:15,5743693703746425125,35.82,-87.01,2010,1984,4,04/2024,01/2006
9,1033,1,2,$19670,37,71,$60593,$123540,$236393,764,...,2005,09:38,1913477460590765860,38.9,-77.26,2009,1982,3,01/2024,01/2005


## Calidad de datos

### Orden

In [None]:
df.dtypes

User                             int64
Card                             int64
Cards_Issued                     int64
Credit_Limit                    object
Current_Age                      int64
Retirement_Age                   int64
Per_Capita_Income_-_Zipcode     object
Yearly_Income_-_Person          object
Total_Debt                      object
FICO_Score                       int64
Num_Credit_Cards                 int64
Amount                          object
Card_Brand                      object
Card_Type                       object
Has_Chip                        object
Card_on_Dark_Web                object
Gender                          object
City                            object
State                           object
Zipcode                          int64
Use_Chip                        object
Merchant_City                   object
Merchant_State                  object
Zip                            float64
MCC                              int64
Errors?                  

In [None]:
# Cast every variable to the data type that matches its nature.

# Categorical variables
# NOTE(review): astype(str) also turns missing values into the literal string
# "nan", so null counts computed after this cell no longer see them.
for v in vard:
    df[v]=df[v].astype(str)

# Quantitative variables: nothing is converted here. Calling pd.to_numeric
# without assigning its result leaves df untouched, and the currency columns
# still carry a leading "$" at this point; they are converted to float once
# the symbol has been stripped, further down in the notebook.

# Text-like variables
for v in vartxt:
    df[v]=df[v].astype(str)

# Date variables: the raw values are "MM/YYYY" strings, so the format is given
# explicitly instead of being inferred.
for v in varf:
    df[v]=pd.to_datetime(df[v], format="%m/%Y")
    

Hay variables, como Credit_Limit, Per_Capita_Income_-_Zipcode y Yearly_Income_-_Person, que no se pudieron convertir a un tipo de dato numérico debido al símbolo $. Por eso se elimina dicho símbolo, para poder trabajar estas variables como cuantitativas.

In [None]:
# Columns whose raw values are currency strings such as "$8600".
lista_aux = ['Credit_Limit','Per_Capita_Income_-_Zipcode','Yearly_Income_-_Person','Total_Debt','Amount']

# Strip the currency symbol with the vectorised string accessor, which leaves
# missing values as they are. regex=False makes "$" a literal character; read
# as a regular expression it would match the end of the string and remove
# nothing.
for i in lista_aux:
  df[i] = df[i].str.replace("$", "", regex=False)

In [None]:
# Convert the currency columns to float. Values that cannot be parsed become
# NaN (errors='coerce') so they surface in the completeness check instead of
# aborting the conversion; astype(float) keeps the dtype at float64 even when
# every value is a whole number.
for v in lista_aux:
    df[v]=pd.to_numeric(df[v], errors='coerce').astype(float)

In [None]:
df.dtypes

User                                    int64
Card                                    int64
Cards_Issued                            int64
Credit_Limit                          float64
Current_Age                             int64
Retirement_Age                          int64
Per_Capita_Income_-_Zipcode           float64
Yearly_Income_-_Person                float64
Total_Debt                            float64
FICO_Score                              int64
Num_Credit_Cards                        int64
Amount                                float64
Card_Brand                             object
Card_Type                              object
Has_Chip                               object
Card_on_Dark_Web                       object
Gender                                 object
City                                   object
State                                  object
Zipcode                                object
Use_Chip                               object
Merchant_City                     

In [None]:
df[lista_aux]

Unnamed: 0,Credit_Limit,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,Amount
0,8600.0,16935.0,34531.0,60156.0,0.70
1,57.0,17003.0,34671.0,55643.0,31.80
2,14231.0,17144.0,34954.0,5942.0,25.09
3,13350.0,16816.0,34287.0,77180.0,4.79
4,14600.0,20641.0,24055.0,0.0,67.62
...,...,...,...,...,...
975471,19103.0,21159.0,43144.0,54157.0,158.36
975472,19097.0,49868.0,101679.0,307856.0,110.22
975473,30900.0,40226.0,82019.0,64583.0,65.66
975474,22679.0,15447.0,31499.0,45661.0,14.06


### Limpieza de variables categóricas y tipo texto

In [None]:
# Normalise the text-like variables: accents removed, lowercase, and every
# non-alphanumeric character replaced by a space.
# NOTE(review): vartxt includes Latitude, Longitude and Merchant_Name, so their
# sign and decimal point are replaced too ("-76.85" becomes "76 85") —
# confirm this is intended.
for v in vartxt:
    df[v] = df[v].map(clean_text)

# Normalise the categorical variables: lowercase with whitespace collapsed.
for v in vard:
    df[v] = df[v].map(clean_cat)

### Completitud

In [None]:
df_completitud=completitud(df)
df_completitud

Unnamed: 0,variable,total,completitud
0,User,0,100.0
1,MCC,0,100.0
2,Errors?,0,100.0
3,tgt,0,100.0
4,Person,0,100.0
5,Address,0,100.0
6,Apartment,0,100.0
7,Card_Number,0,100.0
8,CVV,0,100.0
9,Zip,0,100.0


In [None]:
# Variables with less than 80% completeness are removed from the dataframe.
miss_drop = df_completitud.loc[df_completitud['completitud'] < 80, 'variable'].tolist()
df = df.drop(columns=miss_drop)

### Duplicados

In [None]:
# Show the rows that are exact duplicates of an earlier row, if any.

df[df.duplicated()]

Unnamed: 0,User,Card,Cards_Issued,Credit_Limit,Current_Age,Retirement_Age,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,FICO_Score,...,Year,Time,Merchant_Name,Latitude,Longitude,Year_PIN_last_Changed,Birth_Year,Birth_Month,Expires,Acct_Open_Date


In [None]:
# Number of duplicated rows

print(f"Número de duplicados general : { df.duplicated().sum()}")

Número de duplicados general : 0


### Precisión

In [None]:
# Check whether any categorical variable is unitary, i.e. dominated by a
# single value; unitarias prints a message for each one it finds.

for v in vard:
    unitarias(df,v)

Card_on_Dark_Web -- VARIABLE UNITARIA
Errors? -- VARIABLE UNITARIA
tgt -- VARIABLE UNITARIA


In [None]:
# Share of each category within every discrete variable, shown as one
# percentage table per variable.

for v in vard:
    display(unitarias_per(df,v))
    print("\n")

Unnamed: 0,Card_Brand_valores,%_aparicion
0,mastercard,53.63%
1,visa,36.87%
2,amex,6.56%
3,discover,2.93%






Unnamed: 0,Card_Type_valores,%_aparicion
0,debit,61.71%
1,credit,31.52%
2,debit (prepaid),6.77%






Unnamed: 0,Has_Chip_valores,%_aparicion
0,yes,89.87%
1,no,10.13%




Card_on_Dark_Web -- VARIABLE UNITARIA


Unnamed: 0,Card_on_Dark_Web_valores,%_aparicion
0,no,100.0%






Unnamed: 0,Gender_valores,%_aparicion
0,female,51.47%
1,male,48.53%






Unnamed: 0,City_valores,%_aparicion
0,houston,1.22%
1,miami,0.88%
2,brooklyn,0.76%
3,los angeles,0.73%
4,tucson,0.6%
...,...,...
1275,south park,0.0%
1276,cranston,0.0%
1277,alamogordo,0.0%
1278,tecate,0.0%






Unnamed: 0,State_valores,%_aparicion
0,ca,12.14%
1,tx,8.26%
2,fl,7.15%
3,ny,6.81%
4,oh,4.35%
5,il,3.88%
6,pa,3.77%
7,nc,3.65%
8,nj,3.26%
9,ga,2.95%






Unnamed: 0,Zipcode_valores,%_aparicion
0,95076,0.34%
1,10463,0.34%
2,43830,0.33%
3,98516,0.33%
4,98021,0.32%
...,...,...
1796,33709,0.0%
1797,70714,0.0%
1798,27834,0.0%
1799,77093,0.0%






Unnamed: 0,Use_Chip_valores,%_aparicion
0,swipe transaction,63.08%
1,chip transaction,25.78%
2,online transaction,11.13%






Unnamed: 0,Merchant_City_valores,%_aparicion
0,online,11.16%
1,houston,1.01%
2,los angeles,0.73%
3,miami,0.73%
4,brooklyn,0.63%
...,...,...
9412,north chelmsford,0.0%
9413,justice,0.0%
9414,rembert,0.0%
9415,blessing,0.0%






Unnamed: 0,Merchant_State_valores,%_aparicion
0,,11.16%
1,ca,10.62%
2,tx,7.32%
3,fl,6.0%
4,ny,5.93%
...,...,...
169,zambia,0.0%
170,iran,0.0%
171,belarus,0.0%
172,kosovo,0.0%






Unnamed: 0,Zip_valores,%_aparicion
0,,11.81%
1,98516.0,0.23%
2,43830.0,0.2%
3,95076.0,0.18%
4,94606.0,0.18%
...,...,...
18745,16316.0,0.0%
18746,27574.0,0.0%
18747,41129.0,0.0%
18748,46405.0,0.0%






Unnamed: 0,MCC_valores,%_aparicion
0,5411,11.75%
1,5499,10.99%
2,5541,10.78%
3,5812,7.35%
4,5912,5.8%
...,...,...
104,3008,0.0%
105,3144,0.0%
106,5733,0.0%
107,3075,0.0%




Errors? -- VARIABLE UNITARIA


Unnamed: 0,Errors?_valores,%_aparicion
0,,98.42%
1,insufficient balance,1.0%
2,bad pin,0.24%
3,technical glitch,0.19%
4,bad card number,0.05%
5,bad expiration,0.05%
6,bad cvv,0.04%
7,bad zipcode,0.01%
8,"bad pin,insufficient balance",0.0%
9,"insufficient balance,technical glitch",0.0%




tgt -- VARIABLE UNITARIA


Unnamed: 0,tgt_valores,%_aparicion
0,no,99.88%
1,yes,0.12%






In [None]:
# Drop the categorical variables that were flagged as unitary.
# NOTE(review): 'tgt' looks like it may be the target label of the dataset —
# confirm it should be dropped together with the other two.
columns_unit=['Card_on_Dark_Web','Errors?','tgt']
df=df.drop(columns=columns_unit)

# Remove them from vard as well so the list stays in sync with df.
for i in columns_unit:
  vard.remove(i)


In [None]:
df.head()


Unnamed: 0,User,Card,Cards_Issued,Credit_Limit,Current_Age,Retirement_Age,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,FICO_Score,...,Year,Time,Merchant_Name,Latitude,Longitude,Year_PIN_last_Changed,Birth_Year,Birth_Month,Expires,Acct_Open_Date
0,1107,2,2,8600.0,32,65,16935.0,34531.0,60156.0,703,...,2020,16 36,6571010470072147219,41 0,76 85,2013,1987,11,2020-10-01,2013-09-01
1,1096,3,1,57.0,50,64,17003.0,34671.0,55643.0,786,...,2001,21 00,1288082279022882052,44 52,122 81,2017,1970,2,2020-12-01,1999-10-01
2,580,0,1,14231.0,67,71,17144.0,34954.0,5942.0,729,...,2012,18 17,6826708177432339862,41 89,84 04,2010,1952,10,2023-06-01,2002-10-01
3,1310,2,2,13350.0,48,66,16816.0,34287.0,77180.0,760,...,2018,07 37,4722913068560264812,30 81,92 65,2011,1971,9,2020-06-01,2006-02-01
4,1126,3,2,14600.0,84,69,20641.0,24055.0,0.0,733,...,2011,09 22,7641585028463831554,39 32,76 72,2013,1935,8,2021-10-01,2004-04-01


## Variables cuantitativas

In [None]:
df[varc].describe(percentiles=np.linspace(0.1,1,10))

Unnamed: 0,Card,Cards_Issued,Credit_Limit,Current_Age,Retirement_Age,Per_Capita_Income_-_Zipcode,Yearly_Income_-_Person,Total_Debt,FICO_Score,Num_Credit_Cards,Amount
count,975476.0,975476.0,975476.0,975476.0,975476.0,975476.0,975476.0,975476.0,975476.0,975476.0,975476.0
mean,1.352351,1.525009,15249.549137,53.871847,66.381915,23947.979655,46594.644126,58457.585629,712.343032,3.684508,43.696332
std,1.405632,0.516283,12155.86341,15.840794,3.628544,11893.941767,24462.693893,52500.664177,66.984993,1.621417,81.990261
min,0.0,1.0,0.0,18.0,50.0,0.0,1.0,0.0,480.0,1.0,-500.0
10%,0.0,1.0,1866.0,35.0,62.0,14477.0,26511.0,1626.0,623.0,1.0,1.74
20%,0.0,1.0,6200.0,40.0,64.0,16234.0,31066.0,10793.0,671.0,2.0,6.12
30%,0.0,1.0,8900.0,44.0,65.0,17696.0,34441.0,23394.0,688.0,3.0,12.19
40%,1.0,1.0,11100.0,48.0,66.0,19115.0,37544.0,39078.0,701.0,3.0,19.86
50%,1.0,2.0,13325.0,52.0,66.0,21156.0,40848.0,51679.0,714.0,4.0,30.26
60%,1.0,2.0,15600.0,56.0,67.0,23487.0,45202.0,62695.0,729.0,4.0,42.02


## Variables categóricas

In [None]:
df[vard].describe()

Unnamed: 0,Card_Brand,Card_Type,Has_Chip,Gender,City,State,Zipcode,Use_Chip,Merchant_City,Merchant_State,Zip,MCC
count,975476,975476,975476,975476,975476,975476,975476,975476,975476,975476.0,975476.0,975476
unique,4,3,2,2,1280,51,1801,3,9417,174.0,18750.0,109
top,mastercard,debit,yes,female,houston,ca,95076,swipe transaction,online,,,5411
freq,523136,601931,876617,502124,11884,118411,3337,615365,108902,108902.0,115172.0,114606


In [None]:
df[vartxt].describe()

Unnamed: 0,Person,Address,Apartment,Card_Number,CVV,Month,Day,Year,Time,Merchant_Name,Latitude,Longitude,Year_PIN_last_Changed,Birth_Year,Birth_Month
count,975476,975476,975476.0,975476,975476,975476,975476,975476,975476,975476,975476,975476,975476,975476,975476
unique,1976,1982,198.0,5865,998,12,31,30,1440,32965,985,1216,19,80,12
top,beckett gonzalez,702 elm drive,,357731604070533,270,1,25,2018,13 06,1799189980464955940,29 76,95 38,2011,1970,11
freq,3337,3337,710174.0,2805,3628,86098,32583,68964,1281,45442,11884,13333,163683,33928,102850
