# 1. Manejo de valores faltantes con NumPy

In [16]:
import pandas as pd
import numpy as np

# Load the order lines and turn every 0 in the frame into NaN.
# Author's note: DataFrame.replace is used instead of np.where because
# np.where produced a NumPy array and the DataFrame then displayed poorly.
df = pd.read_csv("OrderLine_v4.csv", encoding="latin1").replace(0, np.nan)

# Work on a standalone NumPy copy of the column being studied.
data = df["productValue"].to_numpy(copy=True)

# --- Compute -----------------------------------------------------------
# Boolean mask: True at every position holding NaN.
nan_mask = np.isnan(data)

# NaN-aware statistics (the NaN entries are skipped).
mean_value = np.nanmean(data)
median_value = np.nanmedian(data)
std_value = np.nanstd(data)

# np.where(cond, x, y): take the mean where the value is NaN, else the value.
data_filled = np.where(nan_mask, mean_value, data)

# np.nan_to_num(): swap NaN for 0 (or any other chosen value).
data_zero_filled = np.nan_to_num(data, nan=0)

# arr[~mask]: keep only the entries that are not NaN.
data_clean = data[~nan_mask]

# --- Report ------------------------------------------------------------
print("¿Dónde hay valores NaN?:", nan_mask)
print("Media sin contar los NaN:", mean_value)
print("Mediana:", median_value, "| Desviación estándar:", std_value)
print("Array con NaN reemplazados por la media:", data_filled)
print("Array con NaN reemplazados por 0:", data_zero_filled)
print("Array sin valores NaN:", data_clean)

# Store the mean-imputed values next to the original column.
df["productValueSinNulos"] = data_filled

df

¿Dónde hay valores NaN?: [False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False  True  True False  True  True False  True  True
 False  True  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False F

Unnamed: 0,orderIdentifier,orderLineNumber,orderType,product.partNumber,shipFromInstructionLocation.locationIdentifier,shipToLocation.locationIdentifier,status,createdDate,requestedShipDate,requestedDeliveryDate,plannedShipDate,plannedDeliveryDate,quantity,quantityUnits,productValue,value,valueCurrency,shipmentCount,productValueSinNulos
0,100044323,100,OUTBOUND,PS-SL-B122,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,2021-09-03T00:00:00,2021-09-06T00:00:00,250,EA,1250.0,312500.0,USD,,1250.000000
1,100044323,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,2021-09-03T00:00:00,2021-09-06T00:00:00,250,EA,,,USD,,483.728873
2,100044323,300,OUTBOUND,PS-SL-INFO,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,2021-09-03T00:00:00,2021-09-06T00:00:00,250,EA,,,USD,,483.728873
3,100044324,100,OUTBOUND,PS-SL-F343,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,132,EA,1250.0,165000.0,USD,,1250.000000
4,100044324,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,132,EA,,,USD,,483.728873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,1000567497,100,INBOUND,9665306,Interalloy - Bozeman,LT-1,,2021-07-15T00:00:00,2021-07-26T00:00:00,2021-07-26T00:00:00,2021-07-21T00:00:00,,44,EA,3.0,132.0,USD,,3.000000
348,1000567500,100,INBOUND,2530020,McGrey - Mt Sterling,LT-1,,2021-07-08T00:00:00,2021-07-19T00:00:00,2021-07-19T00:00:00,2021-07-15T00:00:00,2021-07-19T00:00:00,56,EA,1180.0,66080.0,USD,,1180.000000
349,100044322,100,OUTBOUND,PS-SL-A287,LT-2,FAA - Washington,,2017-07-20T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,400,EA,1250.0,500000.0,USD,,1250.000000
350,100044322,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2017-07-20T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,400,EA,,,USD,,483.728873


# 2. Manejo de valores faltantes con Pandas

In [None]:
import pandas as pd
import numpy as np

# Read the CSV file and turn every 0 in the frame into NaN.
# A chained .replace() is used instead of inplace=True so the cell builds
# `df` in a single, re-runnable expression.
df = pd.read_csv("OrderLine_v4.csv", encoding="latin1").replace(0, np.nan)

# Detect NaN values: True where a value is missing, False where data exists.
print("¿Dónde hay valores NaN?")
print(df.isna())

# Drop every row that contains at least one NaN.
df_dropped = df.dropna()
print("\nDataFrame después de eliminar filas con NaN:")
print(df_dropped)

# Drop every column that contains at least one NaN (axis=1).
df_dropped_cols = df.dropna(axis=1)
print("\nDataFrame después de eliminar columnas con NaN:")
print(df_dropped_cols)


# Fill NaN with the column mean.
df["productValueSinNulos_pandas_mean"] = df["productValue"].fillna(df["productValue"].mean())

# Fill NaN with the column median.
df["productValueSinNulos_pandas_median"] = df["productValue"].fillna(df["productValue"].median())

# Fill NaN with the mode; mode() returns a Series, so take its first entry.
df["productValueSinNulos_pandas_mode"] = df["productValue"].fillna(df["productValue"].mode().iloc[0])

# Forward fill: propagate the last valid value.
# Series.ffill() replaces the deprecated fillna(method="ffill"), which emits
# a FutureWarning on recent pandas releases.
df["productValueSinNulos_pandas_ffill"] = df["productValue"].ffill()

# Backward fill: propagate the next valid value.
# Series.bfill() replaces the deprecated fillna(method="bfill").
# Trailing NaNs with no later valid value stay NaN.
df["productValueSinNulos_pandas_bfill"] = df["productValue"].bfill()

df

¿Dónde hay valores NaN?
     orderIdentifier  orderLineNumber  orderType  product.partNumber  \
0              False            False      False               False   
1              False            False      False               False   
2              False            False      False               False   
3              False            False      False               False   
4              False            False      False               False   
..               ...              ...        ...                 ...   
347            False            False      False               False   
348            False            False      False               False   
349            False            False      False               False   
350            False            False      False               False   
351            False            False      False               False   

     shipFromInstructionLocation.locationIdentifier  \
0                                             False   
1

  df["productValueSinNulos_pandas_ffill"] = df["productValue"].fillna(method="ffill")
  df["productValueSinNulos_pandas_bfill"] = df["productValue"].fillna(method="bfill")


Unnamed: 0,orderIdentifier,orderLineNumber,orderType,product.partNumber,shipFromInstructionLocation.locationIdentifier,shipToLocation.locationIdentifier,status,createdDate,requestedShipDate,requestedDeliveryDate,...,quantityUnits,productValue,value,valueCurrency,shipmentCount,productValueSinNulos_pandas_mean,productValueSinNulos_pandas_median,productValueSinNulos_pandas_mode,productValueSinNulos_pandas_ffill,productValueSinNulos_pandas_bfill
0,100044323,100,OUTBOUND,PS-SL-B122,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,...,EA,1250.0,312500.0,USD,,1250.000000,1250.0,1250.0,1250.0,1250.0
1,100044323,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,...,EA,,,USD,,483.728873,183.0,1250.0,1250.0,1250.0
2,100044323,300,OUTBOUND,PS-SL-INFO,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,...,EA,,,USD,,483.728873,183.0,1250.0,1250.0,1250.0
3,100044324,100,OUTBOUND,PS-SL-F343,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,...,EA,1250.0,165000.0,USD,,1250.000000,1250.0,1250.0,1250.0,1250.0
4,100044324,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,...,EA,,,USD,,483.728873,183.0,1250.0,1250.0,1250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,1000567497,100,INBOUND,9665306,Interalloy - Bozeman,LT-1,,2021-07-15T00:00:00,2021-07-26T00:00:00,2021-07-26T00:00:00,...,EA,3.0,132.0,USD,,3.000000,3.0,3.0,3.0,3.0
348,1000567500,100,INBOUND,2530020,McGrey - Mt Sterling,LT-1,,2021-07-08T00:00:00,2021-07-19T00:00:00,2021-07-19T00:00:00,...,EA,1180.0,66080.0,USD,,1180.000000,1180.0,1180.0,1180.0,1180.0
349,100044322,100,OUTBOUND,PS-SL-A287,LT-2,FAA - Washington,,2017-07-20T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,...,EA,1250.0,500000.0,USD,,1250.000000,1250.0,1250.0,1250.0,1250.0
350,100044322,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2017-07-20T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,...,EA,,,USD,,483.728873,183.0,1250.0,1250.0,


# 3. Manejo de valores faltantes con Scikit-learn

In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Read the CSV file and turn every 0 in the frame into NaN.
df = pd.read_csv("OrderLine_v4.csv", encoding="latin1").replace(0, np.nan)

# Keep "productValue" as a one-column DataFrame: this is the 2-D input
# handed to every imputer below.
product_value = df.loc[:, ["productValue"]]

# Show where the column holds NaN.
print("\n¿Dónde hay valores NaN en 'productValue'?:")
print(product_value.isna())

# One imputer per strategy: mean, median, mode and a constant (0).
imputer_mean = SimpleImputer(strategy="mean")
imputer_median = SimpleImputer(strategy="median")
imputer_mode = SimpleImputer(strategy="most_frequent")
imputer_const = SimpleImputer(strategy="constant", fill_value=0)

# Strategies whose result is followed by a full text dump of the frame:
# (target column, imputer, banner printed before the dump).
printed_steps = (
    (
        "productValueSinNulos_sklearn_mean",
        imputer_mean,
        "\nDataFrame con NaN reemplazados por la media usando Scikit-learn:",
    ),
    (
        "productValueSinNulos_sklearn_median",
        imputer_median,
        "\nDataFrame con NaN reemplazados por la mediana usando Scikit-learn:",
    ),
    (
        "productValueSinNulos_sklearn_mode",
        imputer_mode,
        "\nDataFrame con NaN reemplazados por la moda usando Scikit-learn:",
    ),
)
for column, imputer, banner in printed_steps:
    # Fit on the original column and store the imputed values as a new column.
    df[column] = imputer.fit_transform(product_value)
    print(banner)
    print(df)

# Constant fill (0): the frame is shown as the cell's rich output, not printed.
df["productValueSinNulos_sklearn_const"] = imputer_const.fit_transform(product_value)

print("\nDataFrame con NaN reemplazados por un valor constante usando Scikit-learn:")
df


¿Dónde hay valores NaN en 'productValue'?:
     productValue
0           False
1            True
2            True
3           False
4            True
..            ...
347         False
348         False
349         False
350          True
351          True

[352 rows x 1 columns]

DataFrame con NaN reemplazados por la media usando Scikit-learn:
     orderIdentifier  orderLineNumber orderType product.partNumber  \
0          100044323              100  OUTBOUND         PS-SL-B122   
1          100044323              200  OUTBOUND          PS-SL-KIT   
2          100044323              300  OUTBOUND         PS-SL-INFO   
3          100044324              100  OUTBOUND         PS-SL-F343   
4          100044324              200  OUTBOUND          PS-SL-KIT   
..               ...              ...       ...                ...   
347       1000567497              100   INBOUND            9665306   
348       1000567500              100   INBOUND            2530020   
349        100044322

Unnamed: 0,orderIdentifier,orderLineNumber,orderType,product.partNumber,shipFromInstructionLocation.locationIdentifier,shipToLocation.locationIdentifier,status,createdDate,requestedShipDate,requestedDeliveryDate,...,quantity,quantityUnits,productValue,value,valueCurrency,shipmentCount,productValueSinNulos_sklearn_mean,productValueSinNulos_sklearn_median,productValueSinNulos_sklearn_mode,productValueSinNulos_sklearn_const
0,100044323,100,OUTBOUND,PS-SL-B122,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,...,250,EA,1250.0,312500.0,USD,,1250.000000,1250.0,1250.0,1250.0
1,100044323,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,...,250,EA,,,USD,,483.728873,183.0,1250.0,0.0
2,100044323,300,OUTBOUND,PS-SL-INFO,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-31T00:00:00,2021-09-06T00:00:00,...,250,EA,,,USD,,483.728873,183.0,1250.0,0.0
3,100044324,100,OUTBOUND,PS-SL-F343,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,...,132,EA,1250.0,165000.0,USD,,1250.000000,1250.0,1250.0,1250.0
4,100044324,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2021-07-20T00:00:00,2021-08-19T00:00:00,2021-08-22T00:00:00,...,132,EA,,,USD,,483.728873,183.0,1250.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,1000567497,100,INBOUND,9665306,Interalloy - Bozeman,LT-1,,2021-07-15T00:00:00,2021-07-26T00:00:00,2021-07-26T00:00:00,...,44,EA,3.0,132.0,USD,,3.000000,3.0,3.0,3.0
348,1000567500,100,INBOUND,2530020,McGrey - Mt Sterling,LT-1,,2021-07-08T00:00:00,2021-07-19T00:00:00,2021-07-19T00:00:00,...,56,EA,1180.0,66080.0,USD,,1180.000000,1180.0,1180.0,1180.0
349,100044322,100,OUTBOUND,PS-SL-A287,LT-2,FAA - Washington,,2017-07-20T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,...,400,EA,1250.0,500000.0,USD,,1250.000000,1250.0,1250.0,1250.0
350,100044322,200,OUTBOUND,PS-SL-KIT,LT-2,FAA - Washington,,2017-07-20T00:00:00,2021-08-11T00:00:00,2021-08-14T00:00:00,...,400,EA,,,USD,,483.728873,183.0,1250.0,0.0
