# Análisis de la calidad del aire

La preocupación por la calidad del aire en ciudades como Madrid ha impulsado la búsqueda de soluciones eficientes para purificar el aire en interiores. Utilizando la plataforma de datos abiertos Kaggle, hemos accedido a información valiosa sobre calidad del aire y purificadores disponibles en el mercado.

Esta información nos ha permitido desarrollar Clean Air, un purificador de aire innovador y ecológico, diseñado para áreas urbanas. Basándonos en datos de Kaggle, hemos identificado características relevantes, establecido una estrategia de precios y distribución adecuada, y elaborado una estrategia de marketing efectiva y dirigida a nuestro público objetivo.

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [5]:
from matplotlib.pyplot import rcParams

# Global Matplotlib defaults for the figures drawn in this notebook:
# a 15 x 8 figure size and bold 12-point text, including the axis labels.
rcParams.update(
    {
        "figure.figsize": (15, 8),
        "font.weight": "bold",
        "font.size": 12,
        "axes.labelweight": "bold",
    }
)


### Cargar los datos para el modelo

In [6]:
# Load the semicolon-separated CSV file; its first row supplies the column names.
csv_filename = "sep_mo19.csv"
df = pd.read_csv(csv_filename, header=0, sep=';')

# Preview up to the first 5000 rows (shown as the cell's output).
df.head(n=5000)

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,MAGNITUD,PUNTO_MUESTREO,ANO,MES,DIA,H01,V01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24
0,28,79,4,1,28079004_1_38,2019,9,1,9.00,V,...,6.00,V,7.00,V,6.00,V,7.00,V,7.00,V
1,28,79,4,1,28079004_1_38,2019,9,2,7.00,V,...,8.00,V,7.00,V,7.00,V,7.00,V,8.00,V
2,28,79,4,1,28079004_1_38,2019,9,3,8.00,V,...,7.00,V,7.00,V,6.00,V,7.00,V,7.00,V
3,28,79,4,1,28079004_1_38,2019,9,4,8.00,V,...,6.00,V,7.00,V,7.00,V,7.00,V,7.00,V
4,28,79,4,1,28079004_1_38,2019,9,5,7.00,V,...,6.00,V,7.00,V,8.00,V,7.00,V,8.00,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4582,28,79,60,14,28079060_14_6,2019,9,26,60.85,V,...,104.40,V,74.65,V,21.67,V,27.68,V,26.59,V
4583,28,79,60,14,28079060_14_6,2019,9,27,10.36,V,...,99.09,V,71.81,V,50.46,V,32.17,V,19.29,V
4584,28,79,60,14,28079060_14_6,2019,9,28,34.88,V,...,115.70,V,70.17,V,20.11,V,16.91,V,17.97,V
4585,28,79,60,14,28079060_14_6,2019,9,29,3.80,V,...,111.40,V,80.90,V,61.81,V,28.93,V,32.80,V


In [7]:
# Read the dataset again from the same semicolon-separated CSV file.
csv_filename = "sep_mo19.csv"
df = pd.read_csv(csv_filename, header=0, sep=';')

# Count the missing (NaN) entries in each column and print the per-column totals.
missing_values = df.isnull().sum(axis=0)
print(missing_values)

PROVINCIA         0
MUNICIPIO         0
ESTACION          0
MAGNITUD          0
PUNTO_MUESTREO    0
ANO               0
MES               0
DIA               0
H01               0
V01               0
H02               0
V02               0
H03               0
V03               0
H04               0
V04               0
H05               0
V05               0
H06               0
V06               0
H07               0
V07               0
H08               0
V08               0
H09               0
V09               0
H10               0
V10               0
H11               0
V11               0
H12               0
V12               0
H13               0
V13               0
H14               0
V14               0
H15               0
V15               0
H16               0
V16               0
H17               0
V17               0
H18               0
V18               0
H19               0
V19               0
H20               0
V20               0
H21               0
V21               0


In [8]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts and the dtype of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4587 entries, 0 to 4586
Data columns (total 56 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PROVINCIA       4587 non-null   int64  
 1   MUNICIPIO       4587 non-null   int64  
 2   ESTACION        4587 non-null   int64  
 3   MAGNITUD        4587 non-null   int64  
 4   PUNTO_MUESTREO  4587 non-null   object 
 5   ANO             4587 non-null   int64  
 6   MES             4587 non-null   int64  
 7   DIA             4587 non-null   int64  
 8   H01             4587 non-null   float64
 9   V01             4587 non-null   object 
 10  H02             4587 non-null   float64
 11  V02             4587 non-null   object 
 12  H03             4587 non-null   float64
 13  V03             4587 non-null   object 
 14  H04             4587 non-null   float64
 15  V04             4587 non-null   object 
 16  H05             4587 non-null   float64
 17  V05             4587 non-null   o

In [9]:
# Select every row that contains at least one missing value.
# The result is empty: no rows with null values are found.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,MAGNITUD,PUNTO_MUESTREO,ANO,MES,DIA,H01,V01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24


In [10]:
# Build the cleaned frame: discard any row with a missing value, then
# renumber the index from 0 without keeping the old index as a column.
df_limpiar = df.dropna()
df_limpiar = df_limpiar.reset_index(drop=True)

# Display the cleaned frame as the cell's output.
df_limpiar

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,MAGNITUD,PUNTO_MUESTREO,ANO,MES,DIA,H01,V01,...,H20,V20,H21,V21,H22,V22,H23,V23,H24,V24
0,28,79,4,1,28079004_1_38,2019,9,1,9.00,V,...,6.00,V,7.00,V,6.00,V,7.00,V,7.00,V
1,28,79,4,1,28079004_1_38,2019,9,2,7.00,V,...,8.00,V,7.00,V,7.00,V,7.00,V,8.00,V
2,28,79,4,1,28079004_1_38,2019,9,3,8.00,V,...,7.00,V,7.00,V,6.00,V,7.00,V,7.00,V
3,28,79,4,1,28079004_1_38,2019,9,4,8.00,V,...,6.00,V,7.00,V,7.00,V,7.00,V,7.00,V
4,28,79,4,1,28079004_1_38,2019,9,5,7.00,V,...,6.00,V,7.00,V,8.00,V,7.00,V,8.00,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4582,28,79,60,14,28079060_14_6,2019,9,26,60.85,V,...,104.40,V,74.65,V,21.67,V,27.68,V,26.59,V
4583,28,79,60,14,28079060_14_6,2019,9,27,10.36,V,...,99.09,V,71.81,V,50.46,V,32.17,V,19.29,V
4584,28,79,60,14,28079060_14_6,2019,9,28,34.88,V,...,115.70,V,70.17,V,20.11,V,16.91,V,17.97,V
4585,28,79,60,14,28079060_14_6,2019,9,29,3.80,V,...,111.40,V,80.90,V,61.81,V,28.93,V,32.80,V


### Análisis Exploratorio Inicial, Tratamiento y Limpieza de datos

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): this summarises the raw `df`, not `df_limpiar`; the rows are the
# same here because no missing values were found — confirm which frame is intended.
df.describe()

Unnamed: 0,PROVINCIA,MUNICIPIO,ESTACION,MAGNITUD,ANO,MES,DIA,H01,H02,H03,...,H15,H16,H17,H18,H19,H20,H21,H22,H23,H24
count,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,...,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0,4587.0
mean,28.0,79.0,34.911053,13.088075,2019.0,9.0,15.500327,23.987122,20.080656,17.176529,...,19.592167,18.440647,18.206263,18.746791,19.44092,21.193255,25.469623,29.46695,27.822832,26.415104
std,0.0,0.0,17.980072,10.442796,0.0,0.0,8.659206,41.080207,32.824667,27.857518,...,28.790633,26.390228,27.223126,27.54834,27.292156,27.718621,34.723097,46.094758,43.845117,46.120163
min,28.0,79.0,4.0,1.0,2019.0,9.0,1.0,0.0,0.0,0.0,...,0.01,0.02,0.0,0.0,0.0,0.0,0.0,-0.01,0.0,0.0
25%,28.0,79.0,18.0,7.0,2019.0,9.0,8.0,1.105,1.0,1.0,...,1.7,1.49,1.395,1.4,1.37,1.21,1.21,1.5,1.45,1.235
50%,28.0,79.0,38.0,10.0,2019.0,9.0,16.0,9.0,8.0,7.0,...,9.0,8.0,8.0,8.0,9.0,9.0,11.0,12.0,11.0,10.0
75%,28.0,79.0,54.0,14.0,2019.0,9.0,23.0,29.0,24.0,20.07,...,25.0,22.0,21.0,22.0,24.0,30.5,40.0,41.0,37.0,31.0
max,28.0,79.0,60.0,44.0,2019.0,9.0,30.0,491.0,380.0,404.0,...,759.0,152.2,151.7,158.0,165.4,232.0,460.0,781.0,616.0,796.0
