## **AIR QUALITY**

### **LIBRARIES**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

### **LOAD DATA**

In [2]:
train = pd.read_csv('./data/train.csv')

In [3]:
test = pd.read_csv('./data/test.csv')

### **EDA**

In [None]:
train.head()

Unnamed: 0,ID,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,device_name,CO2
0,ID_000001,28.975,74.475,2480.0,3476.5,1572.0,1997.0,alpha,585.75
1,ID_000002,31.9,66.5,3813.0,2726.0,4145.0,3180.0,alpha,613.0
2,ID_000003,31.675,60.015,2811.0,1563.5,4250.0,2708.5,alpha,616.5
3,ID_000004,31.58,59.22,2844.0,1597.0,4310.0,2723.0,alpha,642.5
4,ID_000005,31.69,62.03,3159.5,1120.5,5519.5,1219.0,alpha,622.0


In [None]:
test.head()

Unnamed: 0,ID,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,device_name
0,ID_007308,32.45,59.99,3504.0,1380.0,2642.5,1637.0,alpha
1,ID_007309,31.665,58.64,2864.0,1659.0,4456.0,3564.5,alpha
2,ID_007310,31.755,59.0,2850.5,1615.0,4391.0,2683.5,alpha
3,ID_007311,31.86,66.85,3853.5,2868.5,4170.5,3025.5,alpha
4,ID_007312,29.6,73.115,2506.5,3631.5,1481.5,2112.5,alpha


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7307 entries, 0 to 7306
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            7307 non-null   object 
 1   Temperature   7307 non-null   float64
 2   Humidity      7307 non-null   float64
 3   MQ7_analog    7307 non-null   float64
 4   MQ9_analog    7307 non-null   float64
 5   MG811_analog  7307 non-null   float64
 6   MQ135_analog  7307 non-null   float64
 7   device_name   7307 non-null   object 
 8   CO2           7307 non-null   float64
dtypes: float64(7), object(2)
memory usage: 513.9+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1292 entries, 0 to 1291
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            1292 non-null   object 
 1   Temperature   1292 non-null   float64
 2   Humidity      1292 non-null   float64
 3   MQ7_analog    1292 non-null   float64
 4   MQ9_analog    1292 non-null   float64
 5   MG811_analog  1292 non-null   float64
 6   MQ135_analog  1292 non-null   float64
 7   device_name   1292 non-null   object 
dtypes: float64(6), object(2)
memory usage: 80.9+ KB


In [8]:
train.describe()

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
count,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0,7307.0
mean,29.681726,69.593742,4230.820902,3976.708841,3995.126568,3444.78824,611.634608
std,2.016785,7.248136,1085.0081,1603.972744,1098.740604,893.426679,16.540953
min,23.44,52.2,2380.0,1098.5,1353.5,1186.5,573.166667
25%,28.49,65.665,3362.5,2832.5,3181.0,2912.5,600.0
50%,29.29,70.555,4061.0,4073.0,4137.0,3586.5,608.0
75%,31.7175,74.35,4780.75,5286.166667,4731.0,4143.75,621.0
max,33.85,93.525,9545.5,10379.5,7919.0,6257.0,677.0


In [9]:
test.describe()

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog
count,1292.0,1292.0,1292.0,1292.0,1292.0,1292.0
mean,29.686183,69.436179,4188.735423,3921.64822,3941.450206,3399.566047
std,2.006444,7.200404,1078.636054,1602.971452,1074.327352,897.210017
min,23.48,52.235,2399.5,1100.0,1353.0,1195.0
25%,28.43875,65.10625,3348.25,2776.625,3178.375,2892.25
50%,29.2725,70.6525,4041.5,4060.0,4118.5,3580.0
75%,31.75,74.26,4732.625,5220.125,4640.5,4108.75
max,33.82,92.625,9414.5,10433.0,7272.0,6285.0


In [10]:
train['device_name'].value_counts()

device_name
beta       2485
charlie    2431
alpha      2391
Name: count, dtype: int64

In [11]:
test['device_name'].value_counts()

device_name
beta       439
charlie    430
alpha      423
Name: count, dtype: int64

In [12]:
train.groupby('device_name')[['MQ7_analog', 'MQ9_analog','MG811_analog', 'MQ135_analog']].mean().reset_index()

Unnamed: 0,device_name,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog
0,alpha,3642.131256,2220.223895,4289.467447,2559.380385
1,beta,3990.842321,4502.818042,3411.655131,4006.544601
2,charlie,5055.133416,5166.496641,4302.060949,3741.392774


### **ENCODE DEVICE NAME**

In [13]:
device_map = {'alpha': 0, 'beta': 1, 'charlie': 2}

In [14]:
train['device_name'] = train['device_name'].map(device_map)
test['device_name'] = test['device_name'].map(device_map)

### **MODEL BUILDING**

In [15]:
X = train.drop(['ID', 'CO2'], axis=1)
y = train['CO2']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
scaler = StandardScaler()

In [18]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### **RANDOMFOREST**

In [19]:
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

In [20]:
y_pred = rf.predict(X_test)

In [21]:
np.sqrt(mean_squared_error(y_test, y_pred))

5.028927157157564

### **SUBMISSION**

In [22]:
testpred = rf.predict(test.drop('ID', axis=1))



In [28]:
submission = pd.DataFrame(
    {
        'ID': test['ID'],
        'Target': np.nan
    }
)

In [29]:
submission['Target'] = testpred
submission.head()

Unnamed: 0,ID,Target
0,ID_007308,609.636667
1,ID_007309,609.636667
2,ID_007310,609.636667
3,ID_007311,609.636667
4,ID_007312,609.636667
