In [None]:
import pandas as pd
import plotly.express as px

# Load the three order-export CSV files.
# NOTE(review): these are absolute paths on a single machine; consider moving
# them behind a configurable data directory so the notebook runs elsewhere.
orders_1 = pd.read_csv(r'/home/juandi/Documents/TOM_JETSON/Data/orders_export_1.csv')
orders_2 = pd.read_csv(r'/home/juandi/Documents/TOM_JETSON/Data/orders_export_2.csv')
orders_3 = pd.read_csv(r'/home/juandi/Documents/TOM_JETSON/Data/orders_export_3.csv')

# Stack the exports into one table with a fresh 0..n-1 index.
order_total = pd.concat([orders_1, orders_2, orders_3], ignore_index=True)

# Count rows per vendor and keep the 20 most frequent.
# NOTE(review): this counts rows; it equals "orders" only if the export holds
# one row per order -- confirm against the export format.
top_brands = order_total['Vendor'].value_counts().nlargest(20).reset_index()
# Rename explicitly: the column names produced by reset_index() after
# value_counts() differ between pandas versions.
top_brands.columns = ['Vendor', 'Order Count']

fig = px.bar(
    top_brands,
    x='Vendor',
    y='Order Count',
    title='Top 20 Most Purchased Brands Historically',
    # `labels` is keyed by column name. The x column is 'Vendor', so that is
    # the key that must be mapped for the axis to read "Brand".
    labels={'Vendor': 'Brand', 'Order Count': 'Number of Orders'},
    text='Order Count',
)

fig.update_layout(
    height=600,
    width=1000,
    xaxis_tickangle=-45,
    plot_bgcolor='rgba(0,0,0,0)',  # transparent plotting area
    yaxis=dict(showgrid=True, gridcolor='lightgrey'),
)

fig.show()


In [3]:
# Persist the combined orders table to the current working directory.
# index=True is the pandas default, spelled out here: the row index is written
# as the first (unnamed) column of the file.
order_total.to_csv(r'total_orders.csv', index=True)

In [11]:
import pandas as pd
import re

def arreglar_correo(email):
    """Normalize an e-mail address, or return None when it cannot be used.

    The value is converted to ``str``, stripped of surrounding whitespace and
    lower-cased. If the domain part contains no dot, ``.com`` is appended.

    Args:
        email: Raw cell value; may be a string, NaN/None, or any other scalar.

    Returns:
        The normalized address as ``str``, or ``None`` when the value is
        missing, does not contain exactly one ``@``, or has an empty local
        part or domain.
    """
    if pd.isna(email):
        return None

    email = str(email).strip().lower()

    # Exactly one '@' is required. Zero means it is not an address; more than
    # one would make the two-way unpacking below raise ValueError and abort
    # the whole column transformation.
    if email.count('@') != 1:
        return None

    local, dominio = email.split('@')

    # "user@" or "@host" cannot be repaired into a deliverable address.
    if not local or not dominio:
        return None

    # Heuristic kept from the original: a domain with no dot is assumed to be
    # missing its ".com" suffix.
    if '.' not in dominio:
        email = f"{local}@{dominio}.com"

    return email

def arreglar_telefono(numero):
    """Normalize a phone number to the form ``+1XXXXXXXXXX``.

    All non-digit characters are removed. An 11-digit number starting with
    ``1`` has that leading ``1`` dropped. Only 10-digit results are accepted.

    Args:
        numero: Raw cell value; may be a string, an int, a float, or NaN/None.

    Returns:
        ``"+1"`` followed by the 10 digits, or ``None`` when the value is
        missing or does not reduce to exactly 10 digits.
    """
    if pd.isna(numero):
        return None

    # A numeric column with blanks is loaded as float, and str(9723331116.0)
    # is "9723331116.0". The trailing "0" would survive the digit filter and
    # make a valid number look one digit too long, so drop the fraction first.
    if isinstance(numero, float) and numero.is_integer():
        numero = int(numero)

    digitos = re.sub(r'\D', '', str(numero))

    # Drop a leading "1" from an 11-digit number before the length check.
    if len(digitos) == 11 and digitos.startswith('1'):
        digitos = digitos[1:]

    if len(digitos) != 10:
        return None

    return f"+1{digitos}"

def procesar_archivo(
    ruta_entrada='/home/juandi/Documents/TOM_JETSON/List Export 2024-11-07.csv',
    ruta_salida='/home/juandi/Documents/TOM_JETSON/Data/cleaned_data_for_klaviyo_premise.csv',
):
    """Clean a contact-list CSV and write the result to a new CSV.

    Steps: normalize ``Email`` with ``arreglar_correo`` and ``Phone Number``
    with ``arreglar_telefono``; drop rows where both are missing; drop rows
    that duplicate an earlier (Email, Phone Number) pair; keep a fixed set of
    columns; write the result without the index; print summary statistics.

    Args:
        ruta_entrada: Path of the CSV to read. Defaults to the original
            hard-coded location.
        ruta_salida: Path of the CSV to write. Defaults to the original
            hard-coded location.

    Returns:
        None. Any exception raised while processing is caught and reported
        with ``print`` rather than propagated.
    """
    try:
        # Read phone numbers and zip codes as text. Left to dtype inference, a
        # numeric-looking column becomes int/float, which appends ".0" to the
        # values when blanks are present and strips leading zeros from zips.
        df = pd.read_csv(
            ruta_entrada,
            dtype={'Phone Number': str, 'Zip Code': str},
        )
        registros_originales = len(df)

        df['Email'] = df['Email'].apply(arreglar_correo)
        df['Phone Number'] = df['Phone Number'].apply(arreglar_telefono)

        # how='all': a row is dropped only when BOTH contact fields are missing.
        df = df.dropna(subset=['Email', 'Phone Number'], how='all')
        df = df.drop_duplicates(subset=['Email', 'Phone Number'], keep='first')

        columnas_necesarias = ['Email', 'Phone Number', 'First Name', 'Last Name',
                               'City', 'State / Region', 'Country', 'Zip Code', 'premise label']
        df_final = df[columnas_necesarias]

        df_final.to_csv(ruta_salida, index=False)

        registros_finales = len(df_final)
        print("\nEstadísticas del procesamiento:")
        print(f"Registros originales: {registros_originales}")
        print(f"Registros procesados: {registros_finales}")
        print(f"Registros eliminados: {registros_originales - registros_finales}")

        print("\nEjemplos de registros procesados:")
        print(df_final.head())

        print(f"\nArchivo guardado en: {ruta_salida}")

    except Exception as e:
        # Best-effort reporting kept from the original: the error is printed
        # and the function returns normally.
        print(f"Error al procesar el archivo: {str(e)}")

# Entry-point guard: run the cleaning pipeline only when this code executes as
# the main module (which is the case in this cell, per the recorded output).
if __name__ == "__main__":
    procesar_archivo()


Estadísticas del procesamiento:
Registros originales: 7654
Registros procesados: 7654
Registros eliminados: 0

Ejemplos de registros procesados:
                         Email  Phone Number First Name  Last Name  \
0                zwicky@me.com  +19723331116     #12722     Murphy   
1  zuzanna.niewinski@gmail.com  +16305701407     #17721  Niewinski   
2            zsal2@comcast.net  +16309263185     #21606     Salata   
3       susiewhite84@gmail.com  +13122131117     #25656      White   
4       zoehappy1021@gmail.com          None   xu yiwen        NaN   

            City State / Region        Country    Zip Code premise label  
0         Dallas             TX  United States  75209-5117      In-store  
1        Chicago             IL  United States       60514      In-store  
2  Downers Grove             IL  United States       60516      In-store  
3     Glen Ellyn       Illinois  United States       60137      In-store  
4            NaN            NaN            NaN         NaN