# Ejercicio Data Quality - Perfilado
## Evaluar la calidad de datos de las ventas de productos

Se quiere hacer una evaluación de calidad de datos sobre las ventas (sales) y pagos (payments). Para ello se requiere hacer un análisis de los siguientes puntos:
- Calidad de los datos
- Selección de clave principal
- Identificación de cardinalidad
- Obtener media, varianza y desviacion Estandar, covarianza, correlacion
- Mejorar la calidad.

**Referencia**: “Estadística Descriptiva con Python y Pandas”: https://coderhook.github.io/Descriptive%20Statistics

- Columnas sales:, orderNumber, orderLineNumber, orderDate, shippedDate, requiredDate, customerNumber, employeeNumber, productCode, status, comments, quantityOrdered, priceEach, sales_amount, origin

- Columnas payments:, customerNumber, checkNumber, paymentDate, amount

In [27]:
import pandas as pd
import numpy as np
import os
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

## Cargar archivos

In [11]:
sales_df = pd.read_csv('../Pandas/datos/company_sales/sales.csv')

In [13]:
payments_df = pd.read_csv('../Pandas/datos/company_sales/payments.csv')

In [14]:
# Mostrar las primeras filas de cada dataset
sales_df.head(), payments_df.head()

(       0  0.1  0000-00-00 0000-00-00.1 0000-00-00.2  0.2   0.3 productCode  \
 0  10100    1  0000-00-00   0000-00-00   0000-00-00  363  1216    S24_3969   
 1  10100    2  0000-00-00   0000-00-00   0000-00-00  363  1216    S18_2248   
 2  10100    3  0000-00-00   0000-00-00   0000-00-00  363  1216    S18_1749   
 3  10100    4  0000-00-00   0000-00-00   0000-00-00  363  1216    S18_4409   
 4  10101    1  0000-00-00   0000-00-00   0000-00-00  128  1504    S18_2795   
 
     status                comments  0.4    0.00   0.00.1 origin  
 0  Shipped                     NaN   49   35.29  1729.21  spain  
 1  Shipped                     NaN   50   55.09  2754.50  spain  
 2  Shipped                     NaN   30  136.00  4080.00  spain  
 3  Shipped                     NaN   22   75.46  1660.12  spain  
 4  Shipped  Check on availability.   26  167.06  4343.56  spain  ,
      0 checkNumber  0000-00-00      0.00
 0  103    HQ336336  2004-10-19   6066.78
 1  103    JM555205  2003-06-05  1457

In [15]:
# Renombrar columnas con nombres más adecuados para sales.csv
sales_df.columns = [
"orderNumber", "orderLineNumber", "orderDate", "shippedDate", "requiredDate",
"customerNumber", "EmployeeNumber", "productCode", "status", "comments",
"quantityOrdered", "priceEach", "sales_amount", "origin"
]
# Renombrar columnas para payments.csv
payments_df.columns = ["customerNumber", "checkNumber", "paymentDate", "amount"]

sales_df.head(), payments_df.head()

(   orderNumber  orderLineNumber   orderDate shippedDate requiredDate  \
 0        10100                1  0000-00-00  0000-00-00   0000-00-00   
 1        10100                2  0000-00-00  0000-00-00   0000-00-00   
 2        10100                3  0000-00-00  0000-00-00   0000-00-00   
 3        10100                4  0000-00-00  0000-00-00   0000-00-00   
 4        10101                1  0000-00-00  0000-00-00   0000-00-00   
 
    customerNumber  EmployeeNumber productCode   status  \
 0             363            1216    S24_3969  Shipped   
 1             363            1216    S18_2248  Shipped   
 2             363            1216    S18_1749  Shipped   
 3             363            1216    S18_4409  Shipped   
 4             128            1504    S18_2795  Shipped   
 
                  comments  quantityOrdered  priceEach  sales_amount origin  
 0                     NaN               49      35.29       1729.21  spain  
 1                     NaN               50     

In [16]:
# Revisar tipos de datos
sales_info = sales_df.dtypes
payments_info = payments_df.dtypes
sales_info, payments_info

(orderNumber          int64
 orderLineNumber      int64
 orderDate           object
 shippedDate         object
 requiredDate        object
 customerNumber       int64
 EmployeeNumber       int64
 productCode         object
 status              object
 comments            object
 quantityOrdered      int64
 priceEach          float64
 sales_amount       float64
 origin              object
 dtype: object,
 customerNumber      int64
 checkNumber        object
 paymentDate        object
 amount            float64
 dtype: object)

In [17]:
# Datos vacios
print('Datos vacios en sales.csv\n',sales_df.isna().sum())
print('\nDatos vacios en payments.csv\n',payments_df.isna().sum())

Datos vacios en sales.csv
 orderNumber           0
orderLineNumber       0
orderDate             0
shippedDate         142
requiredDate          0
customerNumber        0
EmployeeNumber        0
productCode           0
status                0
comments           2242
quantityOrdered       0
priceEach             0
sales_amount          0
origin                0
dtype: int64

Datos vacios en payments.csv
 customerNumber    0
checkNumber       0
paymentDate       0
amount            0
dtype: int64


In [18]:
# Verificamos fechas únicas en sales
print('\norderDate:',sales_df['orderDate'].unique())
print('\nshippedDate:',sales_df['shippedDate'].unique())
print('\nrequiredDate:',sales_df['requiredDate'].unique())


orderDate: ['0000-00-00' '2038-09-00']

shippedDate: ['0000-00-00' nan '2038-00-06' '2038-09-07']

requiredDate: ['0000-00-00' '2038-00-08' '2038-09-07']


In [19]:
# Contamos cuantas fechas hay de cada fecha única
print('orderDate:', sales_df['orderDate'].value_counts())
print('\nshippedDate:', sales_df['shippedDate'].value_counts())
print('\nrequiredDate:', sales_df['requiredDate'].value_counts())

orderDate: orderDate
0000-00-00    2998
2038-09-00       3
Name: count, dtype: int64

shippedDate: shippedDate
0000-00-00    2839
2038-00-06      17
2038-09-07       3
Name: count, dtype: int64

requiredDate: requiredDate
0000-00-00    2981
2038-00-08      17
2038-09-07       3
Name: count, dtype: int64


In [None]:
# Verificamos comments únicos en sales
print('\ncomments:',sales_df['comments'].unique())

In [None]:
# Contamos comments hay de cada comments único
print('comments:', sales_df['comments'].value_counts())

In [22]:
# Elimino en sales.csv columnas de fecha y comments ya que no tienen sentido

sales_df_clean = sales_df.drop(columns=['orderDate', 'shippedDate', 'requiredDate', 'comments'])
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   orderNumber      3001 non-null   int64  
 1   orderLineNumber  3001 non-null   int64  
 2   customerNumber   3001 non-null   int64  
 3   EmployeeNumber   3001 non-null   int64  
 4   productCode      3001 non-null   object 
 5   status           3001 non-null   object 
 6   quantityOrdered  3001 non-null   int64  
 7   priceEach        3001 non-null   float64
 8   sales_amount     3001 non-null   float64
 9   origin           3001 non-null   object 
dtypes: float64(2), int64(5), object(3)
memory usage: 234.6+ KB


In [23]:
# Comprobamos valores nulos en sales_df_clean
sales_df_clean.isna().sum()

orderNumber        0
orderLineNumber    0
customerNumber     0
EmployeeNumber     0
productCode        0
status             0
quantityOrdered    0
priceEach          0
sales_amount       0
origin             0
dtype: int64

In [24]:
# Convertir las columnas numéricas y fechas al tipo correcto categoricas
sales_df_clean["orderNumber"] = pd.to_numeric(sales_df_clean["orderNumber"], errors="coerce")
sales_df_clean["orderLineNumber"] = pd.to_numeric(sales_df_clean["orderLineNumber"], errors="coerce")
sales_df_clean["customerNumber"] = pd.to_numeric(sales_df_clean["customerNumber"], errors="coerce")
sales_df_clean["EmployeeNumber"] = pd.to_numeric(sales_df_clean["EmployeeNumber"], errors="coerce")
sales_df_clean["quantityOrdered"] = pd.to_numeric(sales_df_clean["quantityOrdered"], errors="coerce")
sales_df_clean["priceEach"] = pd.to_numeric(sales_df_clean["priceEach"], errors="coerce")
sales_df_clean["sales_amount"] = pd.to_numeric(sales_df_clean["sales_amount"], errors="coerce")
sales_df_clean[['productCode', 'status', 'origin']] = sales_df_clean[['productCode', 'status', 'origin']].astype('category')

# sales_df_clean["orderDate"] = pd.to_datetime(sales_df_clean["orderDate"], errors="coerce")
# sales_df_clean["requiredDate"] = pd.to_datetime(sales_df_clean["requiredDate"], errors="coerce")
# sales_df_clean["shippedDate"] = pd.to_datetime(sales_df_clean["shippedDate"], errors="coerce")

payments_df["customerNumber"] = pd.to_numeric(payments_df["customerNumber"], errors="coerce")
payments_df["amount"] = pd.to_numeric(payments_df["amount"], errors="coerce")
payments_df["paymentDate"] = pd.to_datetime(payments_df["paymentDate"], errors="coerce")

# Mostrar la información corregida
sales_df_clean.info(), payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   orderNumber      3001 non-null   int64   
 1   orderLineNumber  3001 non-null   int64   
 2   customerNumber   3001 non-null   int64   
 3   EmployeeNumber   3001 non-null   int64   
 4   productCode      3001 non-null   category
 5   status           3001 non-null   category
 6   quantityOrdered  3001 non-null   int64   
 7   priceEach        3001 non-null   float64 
 8   sales_amount     3001 non-null   float64 
 9   origin           3001 non-null   category
dtypes: category(3), float64(2), int64(5)
memory usage: 178.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   customerNumber  278 non-null    int64         
 1  

(None, None)

In [25]:
# # Convertir fechas en sales.csv
# date_columns_sales = ["orderDate", "shippedDate", "requiredDate"]
# for col in date_columns_sales:
#     sales_df[col] = pd.to_datetime(sales_df[col], errors='coerce') # Convierte y pone NaT en valores inválidos

# Convertir fecha en payments.csv
# payments_df["paymentDate"] = pd.to_datetime(payments_df["paymentDate"], errors='coerce')

# Verificamos si hay alguna fecha nula
# missing_dates_sales = sales_df[date_columns_sales].isnull().sum()
missing_dates_payments = payments_df["paymentDate"].isnull().sum()

# missing_dates_sales, missing_dates_payments

print('\nFechas nulas en payments:',missing_dates_payments)


Fechas nulas en payments: 0


In [44]:
# Valores atípicos con z-scores en sales

numeric_cols_sales = sales_df_clean.select_dtypes(include=[np.number])

z_scores = (numeric_cols_sales-numeric_cols_sales.mean(numeric_only=True)) / \
    numeric_cols_sales.std(numeric_only=True)
z_scores_abs = z_scores.apply(np.abs)
print(tabulate(z_scores_abs, headers='keys'))

        orderNumber    orderLineNumber    customerNumber    EmployeeNumber    quantityOrdered    priceEach    sales_amount
----  -------------  -----------------  ----------------  ----------------  -----------------  -----------  --------------
   0     1.73299              1.29252         0.872955           0.312397           1.4028     1.51659         0.904583
   1     1.73299              1.05424         0.872955           0.312397           1.50454    0.975299        0.276094
   2     1.73299              0.815971        0.872955           0.312397           0.530263   1.2366          0.536419
   3     1.73299              0.577698        0.872955           0.312397           1.34418    0.418428        0.946935
   4     1.72219              1.29252         1.11178            0.570109           0.937223   2.08572         0.697978
   5     1.72219              1.05424         1.11178            0.570109           1.09758    1.26891         0.714012
   6     1.72219              0.81

In [45]:
# Conteo de outliers (atípicos) sales

umbral = 3

out_mask = ~z_scores[z_scores_abs > umbral].isna()
print('\nOutliers per column:\n')
print(out_mask.sum())


Outliers per column:

orderNumber          0
orderLineNumber      0
customerNumber       0
EmployeeNumber     137
quantityOrdered     17
priceEach           20
sales_amount        36
dtype: int64


In [50]:
# Valores atípicos con z-scores en payments

numeric_cols_payments = payments_df.select_dtypes(include=[np.number])

z_scores_pay = (numeric_cols_payments-numeric_cols_payments.mean(numeric_only=True)) / \
    numeric_cols_payments.std(numeric_only=True)
z_scores_abs_pay = z_scores_pay.apply(np.abs)
print(tabulate(z_scores_abs_pay, headers='keys'))

       customerNumber      amount
---  ----------------  ----------
  0         1.40026    1.22113
  1         1.40026    0.817993
  2         1.40026    1.42926
  3         1.32545    0.836021
  4         1.32545    0.0385869
  5         1.32545    0.072048
  6         1.30883    0.665339
  7         1.30883    2.39064
  8         1.30883    1.15011
  9         1.30883    0.619393
 10         1.26727    0.584283
 11         1.26727    0.762995
 12         1.26727    0.838813
 13         1.25065    0.871771
 14         1.25065    1.43801
 15         1.25065    0.661335
 16         1.25065    0.133209
 17         1.22571    3.29049
 18         1.22571    2.53994
 19         1.22571    0.985187
 20         1.22571    2.45401
 21         1.22571    0.72595
 22         1.22571    1.12872
 23         1.22571    3.78394
 24         1.22571    0.547084
 25         1.22571    0.628382
 26         1.19247    1.00866
 27         1.19247    0.366234
 28         1.19247    0.0944569
 29         1.

In [51]:
# Conteo de outliers (atípicos) payments

umbral = 3

out_mask_pay = ~z_scores_pay[z_scores_abs_pay > umbral].isna()
print('\nOutliers per column:\n')
print(out_mask_pay.sum())


Outliers per column:

customerNumber    0
amount            5
dtype: int64


In [46]:
sales_df_clean.describe()

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,EmployeeNumber,quantityOrdered,priceEach,sales_amount
count,3001.0,3001.0,3001.0,3001.0,3001.0,3001.0,3001.0
mean,10260.509164,6.424525,259.63912,1317.948684,35.211929,90.765831,3204.908437
std,92.61975,4.19687,118.403435,326.343575,9.828957,36.579368,1631.356967
min,10100.0,1.0,103.0,0.0,6.0,26.55,481.5
25%,10181.0,3.0,145.0,1216.0,27.0,62.0,1988.7
50%,10263.0,6.0,240.0,1370.0,35.0,85.76,2880.48
75%,10339.0,9.0,353.0,1501.0,43.0,114.65,4093.6
max,10425.0,18.0,496.0,1702.0,97.0,214.3,11503.14


In [36]:
payments_df.describe()

Unnamed: 0,customerNumber,paymentDate,amount
count,278.0,278,278.0
mean,271.467626,2004-05-06 18:12:56.978417280,31827.944281
min,103.0,2003-01-16 00:00:00,615.45
25%,161.0,2003-11-18 06:00:00,15144.135
50%,253.0,2004-05-14 12:00:00,31369.15
75%,363.0,2004-11-17 18:00:00,45036.97
max,496.0,2005-06-09 00:00:00,120166.58
std,120.311647,,21096.143249


In [47]:
# Conteo outliers en quantityOrdered

outliers_sales = sales_df_clean['quantityOrdered'][out_mask['quantityOrdered']]
print('Outliers:\n', outliers_sales)

Outliers:
 2812    77
2813    85
2824    66
2827    66
2831    77
2833    90
2838    76
2840    97
2841    65
2845    76
2847    66
2849    76
2866    65
2887    70
2930    66
2952    70
2964    66
Name: quantityOrdered, dtype: int64


In [52]:
# Conteo outliers en quantityOrdered

outliers_pay = payments_df['amount'][out_mask_pay['amount']]
print('Outliers:\n', outliers_pay)

Outliers:
 17    101244.59
23    111654.40
41    116208.40
43    120166.58
61    105743.00
Name: amount, dtype: float64


In [48]:
# Elimino los outliers de sales

sales_df_clean.drop(outliers_sales, inplace=True)
sales_df_clean.shape

(2993, 10)

In [None]:
# Elimino los outliers de payemnts
# No se puede
# payments_df.drop(outliers_pay, inplace=True)
# payments_df.shape

In [54]:
# Mostrar todas las filas duplicadas en sales_df_clean
print('\nSumatorio duplicados en sales', sales_df_clean.duplicated().sum())
sales_df_clean[sales_df_clean.duplicated(keep=False)]



Sumatorio duplicados en sales 5


Unnamed: 0,orderNumber,orderLineNumber,customerNumber,EmployeeNumber,productCode,status,quantityOrdered,priceEach,sales_amount,origin
27,10104,2,141,1370,S50_1514,Shipped,32,53.31,1705.92,spain
28,10104,2,141,1370,S50_1514,Shipped,32,53.31,1705.92,spain
2860,10410,2,357,1612,S18_3136,Shipped,34,84.82,2883.88,spain
2861,10410,2,357,1612,S18_3136,Shipped,34,84.82,2883.88,spain
2894,10413,6,175,1323,S32_3207,Shipped,24,56.55,1357.2,spain
2895,10413,6,175,1323,S32_3207,Shipped,24,56.55,1357.2,spain
2944,10419,1,382,1401,S18_1589,Shipped,37,100.8,3729.6,spain
2945,10419,1,382,1401,S18_1589,Shipped,37,100.8,3729.6,spain
2989,10425,3,119,1370,S18_2238,In Process,28,147.36,4126.08,spain
2990,10425,3,119,1370,S18_2238,In Process,28,147.36,4126.08,spain


In [55]:
# Generar una columna con orderNumber y orderLineNumber

sales_df_clean['complete_order_number'] = sales_df_clean['orderNumber'].astype('str')+'-'+sales_df_clean['orderLineNumber'].astype('str')

In [56]:
sales_df_clean.head()

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,EmployeeNumber,productCode,status,quantityOrdered,priceEach,sales_amount,origin,complete_order_number
0,10100,1,363,1216,S24_3969,Shipped,49,35.29,1729.21,spain,10100-1
1,10100,2,363,1216,S18_2248,Shipped,50,55.09,2754.5,spain,10100-2
2,10100,3,363,1216,S18_1749,Shipped,30,136.0,4080.0,spain,10100-3
3,10100,4,363,1216,S18_4409,Shipped,22,75.46,1660.12,spain,10100-4
4,10101,1,128,1504,S18_2795,Shipped,26,167.06,4343.56,spain,10101-1


In [60]:
dup_ordnums = sales_df_clean[sales_df_clean.duplicated()]['complete_order_number']
dup_ordnums.values

array(['10104-2', '10410-2', '10413-6', '10419-1', '10425-3'],
      dtype=object)

In [61]:
sales_df_clean[sales_df_clean['complete_order_number'].isin(dup_ordnums.values)]

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,EmployeeNumber,productCode,status,quantityOrdered,priceEach,sales_amount,origin,complete_order_number
27,10104,2,141,1370,S50_1514,Shipped,32,53.31,1705.92,spain,10104-2
28,10104,2,141,1370,S50_1514,Shipped,32,53.31,1705.92,spain,10104-2
2860,10410,2,357,1612,S18_3136,Shipped,34,84.82,2883.88,spain,10410-2
2861,10410,2,357,1612,S18_3136,Shipped,34,84.82,2883.88,spain,10410-2
2894,10413,6,175,1323,S32_3207,Shipped,24,56.55,1357.2,spain,10413-6
2895,10413,6,175,1323,S32_3207,Shipped,24,56.55,1357.2,spain,10413-6
2944,10419,1,382,1401,S18_1589,Shipped,37,100.8,3729.6,spain,10419-1
2945,10419,1,382,1401,S18_1589,Shipped,37,100.8,3729.6,spain,10419-1
2989,10425,3,119,1370,S18_2238,In Process,28,147.36,4126.08,spain,10425-3
2990,10425,3,119,1370,S18_2238,In Process,28,147.36,4126.08,spain,10425-3


In [57]:
# Mostrar todas las filas duplicadas en payments_df
print('\nSumatorio duplicados en payments', payments_df.duplicated().sum())
payments_df[payments_df.duplicated(keep=False)]


Sumatorio duplicados en payments 5


Unnamed: 0,customerNumber,checkNumber,paymentDate,amount
31,129,ID449593,2003-12-11,13923.93
32,129,ID449593,2003-12-11,13923.93
85,175,CITI3434344,2005-05-19,14500.78
86,175,CITI3434344,2005-05-19,14500.78
143,260,IO164641,2004-08-30,13527.58
144,260,IO164641,2004-08-30,13527.58
214,381,GB117430,2005-02-03,7379.9
215,381,GB117430,2005-02-03,7379.9
268,487,AH612904,2003-09-28,14997.09
269,487,AH612904,2003-09-28,14997.09


In [62]:
# Elimino filas duplicadas en sales_df_clean y payments dejando la primera que aparece

sales_df_cleaned = sales_df_clean.drop_duplicates(keep='last')
payments_df_cleaned = payments_df.drop_duplicates(keep='last')

print('\nSumatorio duplicados en sales_df_cleaned', sales_df_cleaned.duplicated().sum())
print('\nSumatorio duplicados en payments_df_cleaned', payments_df_cleaned.duplicated().sum())


Sumatorio duplicados en sales_df_cleaned 0

Sumatorio duplicados en payments_df_cleaned 0


In [63]:
# Muestro la información de los df limpios

sales_df_cleaned.info(), payments_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2988 entries, 0 to 3000
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   orderNumber            2988 non-null   int64   
 1   orderLineNumber        2988 non-null   int64   
 2   customerNumber         2988 non-null   int64   
 3   EmployeeNumber         2988 non-null   int64   
 4   productCode            2988 non-null   category
 5   status                 2988 non-null   category
 6   quantityOrdered        2988 non-null   int64   
 7   priceEach              2988 non-null   float64 
 8   sales_amount           2988 non-null   float64 
 9   origin                 2988 non-null   category
 10  complete_order_number  2988 non-null   object  
dtypes: category(3), float64(2), int64(5), object(1)
memory usage: 224.1+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 273 entries, 0 to 277
Data columns (total 4 columns):
 #   Column          Non-Null Cou

(None, None)

### Cardinalidad

In [None]:
# Calcular el sumatorio de valores para cada columna
sum_values_sales = sales_df_cleaned.count()

# Calcular el sumatorio de valores únicos para cada columna
unique_sum_sales = sales_df_cleaned.nunique()

# Crear un nuevo DataFrame para mostrar ambos sumatorios
result_df_sales = pd.DataFrame({
    'Sumatorio de valores': sum_values_sales,
    'Sumatorio de valores únicos': unique_sum_sales
})

print(result_df_sales)

In [None]:
# Calcular el sumatorio de valores para cada columna
sum_values_pay = payments_df_cleaned.count()

# Calcular el sumatorio de valores únicos para cada columna
unique_sum_pay = payments_df_cleaned.nunique()

# Crear un nuevo DataFrame para mostrar ambos sumatorios
result_df_pay = pd.DataFrame({
    'Sumatorio de valores': sum_values_pay,
    'Sumatorio de valores únicos': unique_sum_pay
})

print(result_df_pay)

In [None]:
# # Evaluar unicidad de posibles claves primarias en cada dataset

# # Para sales.csv, posibles claves: orderNumber, orderLineNumber (combinación)
# sales_unique_order = sales_df_cleaned["orderNumber"].nunique()
# sales_total_rows = len(sales_df_cleaned)
# sales_unique_combination = sales_df_cleaned[["orderNumber", "orderLineNumber"]].duplicated().sum() # Chequear duplicados

# # Para payments.csv, posibles claves: checkNumber (supuestamente única)
# payments_unique_check = payments_df_cleaned["checkNumber"].nunique()
# payments_total_rows = len(payments_df_cleaned)
# payments_duplicated_checks = payments_total_rows - payments_unique_check

# print(sales_unique_order, sales_unique_combination, payments_unique_check,
# payments_duplicated_checks)

In [None]:
# Evaluar la unicidad de posibles claves primarias

# Crear una lista para almacenar los resultados
unique_keys_sales = []

# Iterar sobre cada columna del DataFrame
for column in sales_df_cleaned.columns:
    # Verificar si la columna tiene valores únicos
    if sales_df_cleaned[column].is_unique:
        unique_keys_sales.append(column)

# Mostrar las posibles claves primarias
print("Posibles claves primarias en sales_df_cleaned:")
print(unique_keys_sales)

In [None]:
# Evaluar la unicidad de posibles claves primarias

# Crear una lista para almacenar los resultados
unique_keys_pay = []

# Iterar sobre cada columna del DataFrame
for column in payments_df_cleaned.columns:
    # Verificar si la columna tiene valores únicos
    if payments_df_cleaned[column].is_unique:
        unique_keys_pay.append(column)

# Mostrar las posibles claves primarias
print("Posibles claves primarias en payments_df_cleaned:")
print(unique_keys_pay)

In [None]:
# Evaluar la relación entre sales y payments a través de customerNumber

# Contar clientes únicos en cada dataset
unique_customers_sales = sales_df_cleaned["customerNumber"].nunique()
unique_customers_payments = payments_df_cleaned["customerNumber"].nunique()

# Contar clientes comunes entre ambos datasets
common_customers = len(set(sales_df_cleaned["customerNumber"]).intersection(set(payments_df_cleaned["customerNumber"])))
unique_customers_sales, unique_customers_payments, common_customers

In [None]:
# Evaluar la relación entre sales_df_cleaned y payments_df_cleaned a través de customerNumber

# Verificar si customerNumber es una columna en ambos DataFrames
if 'customerNumber' in sales_df_cleaned.columns and 'customerNumber' in payments_df_cleaned.columns:
    # Realizar un merge (join) entre ambos DataFrames usando customerNumber como clave
    merged_df = pd.merge(sales_df_cleaned, payments_df_cleaned, on='customerNumber', how='inner')
    
    # Mostrar el DataFrame resultante del merge
    print("DataFrame resultante del merge entre sales_df_cleaned y payments_df_cleaned:")
    print(merged_df)
else:
    print("La columna 'customerNumber' no está presente en uno o ambos DataFrames.")

In [None]:
# Datos vacios
print('Datos vacios\n',merged_df.isna().sum())

In [None]:
merged_df.info()

In [None]:
# Calcular estadísticas descriptivas en variables numéricas
stats_sales = sales_df_cleaned.describe().T
stats_payments = payments_df_cleaned[["checkNumber", "amount"]].describe().T

# Calcular varianza y desviación estándar
variance_sales = sales_df_cleaned[["quantityOrdered", "priceEach", "sales_amount"]].var()
std_dev_sales = sales_df_cleaned[["quantityOrdered", "priceEach", "sales_amount"]].std()
variance_payments = payments_df_cleaned[["amount"]].var()
std_dev_payments = payments_df_cleaned[["amount"]].std()

# Calcular covarianza entre variables en sales.csv
covariance_sales = sales_df_cleaned[["quantityOrdered", "priceEach", "sales_amount"]].cov()

# Calcular correlación entre variables en sales.csv
correlation_sales = sales_df_cleaned[["quantityOrdered", "priceEach", "sales_amount"]].corr()

# Mostrar los resultados en consola usando print
print("=== Estadísticas de Sales ===")
print(stats_sales)
print("\n=== Estadísticas de Payments ===")
print(stats_payments)
print("\n=== Covarianza en Sales ===")
print(covariance_sales)
print("\n=== Correlación en Sales ===")
print(correlation_sales)

In [None]:
# Longitud de filas

Aclean_sales_rows = len(sales_df_cleaned)
Alean_payments_rows = len(payments_df_cleaned)
Alean_merge_rows = len(merged_df)
Aclean_sales_rows, Alean_payments_rows, Alean_merge_rows

In [None]:
# Obtener la ruta del directorio actual
current_directory = os.getcwd()

print("La ruta del directorio actual es:", current_directory)

In [None]:
# Extraer los DataFrame a un archivo CSV

sales_df_cleaned.to_csv('../Pandas/datos/company_sales/sales_df_cleaned.csv', index=False)
payments_df_cleaned.to_csv('../Pandas/datos/company_sales/payments_df_cleaned.csv', index=False)
merged_df.to_csv('../Pandas/datos/company_sales/merged_df.csv', index=False)

print("El DataFrame sales_df_cleaned, payments_df_cleaned y merged_df se ha extraído a *.csv")

In [None]:
# Crear un archivo Excel con cada DataFrame en hojas distintas y con carácter de tabulación ","

# Crear un objeto ExcelWriter
with pd.ExcelWriter('../Pandas/datos/company_sales/dataframes.xlsx', engine='xlsxwriter') as writer:
    # Escribir cada DataFrame en una hoja distinta
    sales_df.to_excel(writer, sheet_name='sales_df', index=False)
    payments_df.to_excel(writer, sheet_name='payments_df', index=False)
    sales_df_cleaned.to_excel(writer, sheet_name='sales_df_cleaned', index=False)
    payments_df_cleaned.to_excel(writer, sheet_name='payments_df_cleaned', index=False)
    merged_df.to_excel(writer, sheet_name='merged_df_cleaned', index=False)

print("Los DataFrames se han extraído a 'dataframes.xlsx' con cada DataFrame en hojas distintas.")