# Limpieza de datos

## Valores perdidos

In [None]:
import pandas as pd
df = pd.read_csv('../data/housing.csv')

df.info()
df.describe()

In [None]:
df.isna().sum()

In [None]:
dfdel = df.dropna()
dfdel.info()

In [None]:
media = df['total_bedrooms'].median()
print(media)
df['total_bedrooms'].fillna(media, inplace=True)
df.info()
df.describe()

## Valores atípicos

In [5]:
import numpy as np

np.random.seed(0)
data = np.random.randint(low=0, high=11, size=1000)
data[0] = 100
data[1] = -100

In [None]:
# z-scores
z_scores = (data-np.mean(data))/np.std(data)
umbral = 3
outliers = np.where(np.abs(z_scores) > umbral)
data[outliers]

- **Ejemplo housing**

In [23]:
df = pd.read_csv('../data/housing.csv')
dfdel = df.dropna()

In [7]:
# Columnas numéricas
c_numericas = ['housing_median_age', 'total_rooms',
               'total_bedrooms', 'population', 'households', 'median_income',
               'median_house_value']

In [None]:
from tabulate import tabulate

df_num = dfdel[c_numericas]

z_scores = (df_num-df_num.mean())/df_num.std()
z_scores_abs = z_scores.apply(np.abs)
print(tabulate(z_scores_abs, headers='keys'))


In [None]:
umbral = 3

dfout_mask = ~z_scores[z_scores_abs > umbral].isna()
print('\nOutliers:\n')
print(dfout_mask.sum())
# print(tabulate(dfout_mask, headers='keys'))

In [None]:
# IQR
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1
print('iqr:', iqr)
print(np.mean(data), q1, q3)

umbra_sup = q3+1.5*iqr
umbra_inf = q1-1.5*iqr
print('umbrales:', umbra_inf, umbra_sup)

outliers = np.where((data < umbra_inf) | (data > umbra_sup))
data[outliers]

In [None]:
dfdel.columns

In [None]:
dfdel.describe()

In [None]:
df_num = dfdel[c_numericas]

q1 = df_num.apply(lambda x: np.percentile(x, 25))
q3 = df_num.apply(lambda x: np.percentile(x, 75))

print('q1:', q1, '\n')
print('q3:', q3, '\n')


In [None]:

iqr = q3 - q1
print('iqr:', iqr, '\n')

In [None]:

umbra_sup = q3+1.5*iqr
umbra_inf = q1-1.5*iqr

print('umbrales inf:\n', umbra_inf)
print('\numbrales sup:\n', umbra_sup)

In [None]:
from tabulate import tabulate

mask = ~df_num[((df_num < umbra_inf) | (df_num > umbra_sup))].isna()
# print(mask)
print(df_num[mask].count())
print(tabulate(df_num[mask], headers='keys'))

In [None]:
df_clipped = df_num.clip(lower=umbra_inf, upper=umbra_sup, axis=1)
df_clipped

In [None]:
df_num[~mask].dropna()

### Duplicados

In [None]:
df = pd.read_csv('../data/housing.csv')
df = pd.concat([df, df.iloc[[1, 60, 6]]])
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)


In [None]:
# eliminar duplicados
cleaned_df = df.drop_duplicates()
cleaned_df.duplicated().sum()

In [None]:
# Agregando duplicados
data = {'customer_id': [102, 102, 101, 103, 102], 'product_id': [
    'A', 'B', 'A', 'C', 'B'], 'quantity_sold': [5, 3, 2, 1, 4]}
df = pd.DataFrame(data)
df

In [None]:
df[{'customer_id','product_id'}].duplicated()

In [None]:
df_merged = df.groupby(['customer_id','product_id']).agg({'quantity_sold':'sum'}).reset_index()
df_merged

### Incoherencias

In [None]:
df = pd.read_csv('../data/housing.csv')
dfdel = df.dropna()
dfdel.info()

In [None]:
dfdel.columns

In [None]:
dfdel[['total_rooms', 'total_bedrooms', 'population', 'households']] = dfdel[[
    'total_rooms', 'total_bedrooms', 'population', 'households']].astype('int32')

dfdel.info()

In [None]:
# dfdel[['ocean_proximity']]=dfdel[['ocean_proximity']].astype('str')
dfdel[['ocean_proximity']]=dfdel[['ocean_proximity']].astype('category')
dfdel.info()

In [None]:
import numpy as np

df_num = df.select_dtypes(np.number)
df_num.columns

In [None]:
df_cat = df.select_dtypes(include='object')
df_cat.columns

In [None]:
df_cat['ocean_proximity'].unique()

In [50]:
import warnings
warnings.filterwarnings('ignore')

cat_values = ['NEAR BAY', 'MINUS 1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND']

df_cat['ocean_proximity'].str.match(r'[A-Z0-9 ]{4}.*').head()

df_cat['valid'] = df_cat['ocean_proximity'].isin(cat_values)

# df_cat[df_cat['valid'] == False] = 'MINUS 1H OCEAN'

df_cat['ocean_proximity'] = df_cat['ocean_proximity'].str.replace('.*H OCEAN.*', 'MINUS 1H OCEAN', regex=True)

In [None]:
df_cat['valid'] = df_cat['ocean_proximity'].isin(cat_values)

df_cat[df_cat['valid'] == False]

In [None]:
df_num[~df_num['housing_median_age'].between(1, 100)]

## Perfilado

In [3]:
import pandas as pd

df = pd.read_csv('../Pandas/datos/housing.csv')


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
df.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [7]:
df.sample(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
15876,-122.41,37.75,52.0,1919.0,404.0,1483.0,421.0,3.4063,253900.0,NEAR BAY
9018,-118.72,34.14,7.0,23866.0,4407.0,9873.0,4012.0,5.4032,318500.0,NEAR OCEAN
8443,-118.36,33.9,40.0,1271.0,276.0,725.0,234.0,5.0452,231900.0,<1H OCEAN
4889,-118.26,34.02,39.0,698.0,232.0,1046.0,228.0,2.2356,119500.0,<1H OCEAN
19540,-120.95,37.64,32.0,3487.0,740.0,1957.0,685.0,2.7209,88300.0,INLAND
4655,-118.33,34.05,44.0,1574.0,390.0,1323.0,404.0,2.5284,226300.0,<1H OCEAN
15150,-116.84,32.92,20.0,1066.0,219.0,536.0,173.0,3.1607,119300.0,<1H OCEAN
1024,-120.0,38.52,16.0,3045.0,543.0,202.0,102.0,3.15,140600.0,INLAND
828,-122.08,37.62,27.0,1826.0,309.0,1016.0,313.0,5.64,206500.0,NEAR BAY
14093,-117.11,32.76,19.0,2188.0,616.0,1304.0,607.0,2.0852,114400.0,NEAR OCEAN


In [8]:
df.shape

(20640, 10)

In [9]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [10]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [11]:
df.describe(include='object')

Unnamed: 0,ocean_proximity
count,20640
unique,5
top,<1H OCEAN
freq,9136


In [12]:
df_group = df.groupby('ocean_proximity')['population'].mean()
df_group

ocean_proximity
<1H OCEAN     1520.290499
INLAND        1391.046252
ISLAND         668.000000
NEAR BAY      1230.317467
NEAR OCEAN    1354.008653
Name: population, dtype: float64

In [13]:
df_group = df.groupby('ocean_proximity')['median_income'].mean()
df_group

ocean_proximity
<1H OCEAN     4.230682
INLAND        3.208996
ISLAND        2.744420
NEAR BAY      4.172885
NEAR OCEAN    4.005785
Name: median_income, dtype: float64

In [14]:
df.sort_values(by='median_income', ascending=False)[[
    'median_income','total_rooms', 'total_bedrooms', 'population', 'households']].head()

Unnamed: 0,median_income,total_rooms,total_bedrooms,population,households
4352,15.0001,407.0,67.0,100.0,47.0
10673,15.0001,5192.0,658.0,1865.0,662.0
8849,15.0001,3815.0,439.0,1266.0,413.0
4606,15.0001,1482.0,171.0,531.0,161.0
5257,15.0001,2861.0,360.0,829.0,310.0


In [15]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [16]:
df['housing_median_age'].value_counts()

52.0    1273
36.0     862
35.0     824
16.0     771
17.0     698
34.0     689
26.0     619
33.0     615
18.0     570
25.0     566
32.0     565
37.0     537
15.0     512
19.0     502
27.0     488
24.0     478
30.0     476
28.0     471
20.0     465
29.0     461
31.0     458
23.0     448
21.0     446
14.0     412
22.0     399
38.0     394
39.0     369
42.0     368
44.0     356
43.0     353
40.0     304
13.0     302
41.0     296
45.0     294
10.0     264
11.0     254
46.0     245
5.0      244
12.0     238
8.0      206
9.0      205
47.0     198
4.0      191
48.0     177
7.0      175
6.0      160
50.0     136
49.0     134
3.0       62
2.0       58
51.0      48
1.0        4
Name: housing_median_age, dtype: int64

In [17]:
df.corr(method='pearson')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.069608,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066983,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.320451,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.93038,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.099773,-0.108785,-0.296244,0.857126,0.877747,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.979728,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007723,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049686,-0.02465,0.065843,0.688075,1.0


In [18]:
df.skew()

  df.skew()


longitude            -0.297801
latitude              0.465953
housing_median_age    0.060331
total_rooms           4.147343
total_bedrooms        3.459546
population            4.935858
households            3.410438
median_income         1.646657
median_house_value    0.977763
dtype: float64

In [19]:
df.kurt()

  df.kurt()


longitude             -1.330152
latitude              -1.117760
housing_median_age    -0.800629
total_rooms           32.630927
total_bedrooms        21.985575
population            73.553116
households            22.057988
median_income          4.952524
median_house_value     0.327870
dtype: float64

In [20]:
cat_poblacion = ['muy pequeño', 'pqueño', 'media', 'grande', 'muy grande']
df['cat_poblacion'] = pd.qcut(df['population'], 5, cat_poblacion)

df['cat_poblacion'].value_counts()

grande         4135
muy pequeño    4133
pqueño         4133
muy grande     4121
media          4118
Name: cat_poblacion, dtype: int64