In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
file_path = "/updated_pollution_dataset.csv"
air_data = pd.read_csv(file_path)
air_data.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,Moderate
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,Moderate
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,Moderate
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,Good
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,Good


In [3]:
air_data['record_ID'] = air_data.index
air_data.rename(columns={'Proximity_to_Industrial_Areas': 'industry_Proximity', 'Air Quality': 'air_Quality','Population_Density': 'population_Density', 'Temperature': 'temp','Humidity': 'humidity'}, inplace=True)
air_data = air_data[['record_ID', 'temp', 'humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO','industry_Proximity', 'population_Density', 'air_Quality']]
air_data.drop_duplicates(inplace=True)
air_data.dropna(inplace=True)
air_data.head()

Unnamed: 0,record_ID,temp,humidity,PM2.5,PM10,NO2,SO2,CO,industry_Proximity,population_Density,air_Quality
0,0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,Moderate
1,1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,Moderate
2,2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,Moderate
3,3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,Good
4,4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,Good


In [4]:
air_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   record_ID           5000 non-null   int64  
 1   temp                5000 non-null   float64
 2   humidity            5000 non-null   float64
 3   PM2.5               5000 non-null   float64
 4   PM10                5000 non-null   float64
 5   NO2                 5000 non-null   float64
 6   SO2                 5000 non-null   float64
 7   CO                  5000 non-null   float64
 8   industry_Proximity  5000 non-null   float64
 9   population_Density  5000 non-null   int64  
 10  air_Quality         5000 non-null   object 
dtypes: float64(8), int64(2), object(1)
memory usage: 429.8+ KB


In [5]:
label = air_data['air_Quality']
features = air_data.drop('air_Quality', axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [7]:
best_score = 0
best_n = 0
for n in range(1, 11):
  model = KNeighborsClassifier(n_neighbors = n)
  model.fit(X_train, y_train)
  score = model.score(X_train, y_train)
  if score > best_score:
    best_score = score
    best_n = n
print(f'Best n: {best_n}')
print(f'Best score: {best_score}')

Best n: 1
Best score: 1.0


In [8]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
model.score(X_train, y_train)

1.0

In [9]:
file_path = "/Measurement_summary.csv"
air_big_data = pd.read_csv(file_path)
air_big_data.head()

Unnamed: 0,Measurement date,Station code,Address,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5
0,2017-01-01 00:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0
1,2017-01-01 01:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.058,0.002,1.2,71.0,59.0
2,2017-01-01 02:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,59.0
3,2017-01-01 03:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,58.0
4,2017-01-01 04:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.003,0.051,0.002,1.2,69.0,61.0


In [10]:
air_big_data.info()
air_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227666 entries, 0 to 227665
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Measurement date  227666 non-null  object 
 1   Station code      227666 non-null  int64  
 2   Address           227665 non-null  object 
 3   Latitude          227665 non-null  float64
 4   Longitude         227665 non-null  float64
 5   SO2               227665 non-null  float64
 6   NO2               227665 non-null  float64
 7   O3                227665 non-null  float64
 8   CO                227665 non-null  float64
 9   PM10              227665 non-null  float64
 10  PM2.5             227665 non-null  float64
dtypes: float64(8), int64(1), object(2)
memory usage: 19.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----

In [11]:
air_big_data['record_ID'] = air_big_data.index
seoul_air_shaped = air_big_data[['record_ID','SO2','NO2','PM2.5','PM10', 'CO']]
pollution_shaped = air_data[['record_ID', 'SO2', 'NO2', 'PM2.5', 'PM10', 'CO', 'air_Quality']]

In [12]:
label = pollution_shaped['air_Quality']
features = pollution_shaped.drop('air_Quality', axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
model.score(X_train, y_train)
predictions = model.predict(seoul_air_shaped)
air_big_data['air_Quality'] = predictions

In [13]:
seoul_air_shaped.sample(5, random_state= 4)

Unnamed: 0,record_ID,SO2,NO2,PM2.5,PM10,CO
203513,203513,0.005,0.014,19.0,19.0,0.7
181127,181127,0.005,0.051,48.0,62.0,1.1
147961,147961,0.004,0.021,13.0,36.0,0.4
30924,30924,0.002,0.016,12.0,19.0,0.4
15322,15322,0.003,0.01,10.0,17.0,0.4


In [14]:
pollution_shaped.sample(5)

Unnamed: 0,record_ID,SO2,NO2,PM2.5,PM10,CO,air_Quality
407,407,5.2,20.3,9.7,14.7,1.07,Good
1430,1430,18.9,27.6,5.9,21.1,1.82,Poor
1161,1161,17.0,39.2,2.2,7.8,2.0,Poor
1031,1031,9.9,26.3,17.5,22.8,1.61,Moderate
2817,2817,5.2,30.4,10.0,18.0,1.57,Moderate


In [15]:
#air_big_data.to_csv('seoul_data.csv', index=False)

In [16]:
air_big_data.sample(15)

Unnamed: 0,Measurement date,Station code,Address,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,record_ID,air_Quality
146031,2018-11-20 19:00,106,"10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o...",37.55558,126.905597,0.006,0.045,0.007,0.5,63.0,17.0,146031,Good
173865,2019-02-09 05:00,107,"18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re...",37.541864,127.049659,0.003,0.01,0.029,0.4,30.0,13.0,173865,Good
98704,2019-06-05 20:00,104,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",37.609823,126.934848,0.005,0.056,0.038,0.5,93.0,64.0,98704,Good
110070,2017-09-27 04:00,105,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",37.593742,126.949679,0.006,0.021,0.039,0.5,52.0,33.0,110070,Moderate
200202,2019-02-27 19:00,108,"571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi...",37.54718,127.092493,0.009,0.051,0.02,0.9,68.0,47.0,200202,Good
146372,2018-12-05 00:00,106,"10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o...",37.55558,126.905597,0.003,0.016,0.02,0.3,26.0,3.0,146372,Good
163296,2017-11-25 19:00,107,"18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re...",37.541864,127.049659,0.005,0.044,0.005,0.5,14.0,7.0,163296,Good
167678,2018-05-27 09:00,107,"18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re...",37.541864,127.049659,0.007,0.023,0.036,0.4,71.0,31.0,167678,Good
13880,2018-08-02 08:00,101,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008,0.004,0.027,0.028,0.4,21.0,13.0,13880,Good
199431,2019-01-26 16:00,108,"571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi...",37.54718,127.092493,0.004,0.025,0.022,0.7,47.0,17.0,199431,Moderate


In [17]:
air_big_data.groupby('Station code').value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,count
Station code,Measurement date,Address,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,record_ID,air_Quality,Unnamed: 13_level_1
101,2017-01-01 00:00,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republic of Korea",37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,0,Good,1
101,2017-01-01 01:00,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republic of Korea",37.572016,127.005008,0.004,0.058,0.002,1.2,71.0,59.0,1,Good,1
101,2017-01-01 02:00,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republic of Korea",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,59.0,2,Good,1
101,2017-01-01 03:00,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republic of Korea",37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,58.0,3,Good,1
101,2017-01-01 04:00,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republic of Korea",37.572016,127.005008,0.003,0.051,0.002,1.2,69.0,61.0,4,Good,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,2019-05-13 06:00,"43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul, Republic of Korea",37.575743,127.028885,0.005,0.061,0.006,0.7,60.0,43.0,227660,Good,1
109,2019-05-13 07:00,"43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul, Republic of Korea",37.575743,127.028885,0.005,0.059,0.011,0.8,60.0,43.0,227661,Good,1
109,2019-05-13 08:00,"43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul, Republic of Korea",37.575743,127.028885,0.005,0.062,0.019,0.7,65.0,45.0,227662,Good,1
109,2019-05-13 09:00,"43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul, Republic of Korea",37.575743,127.028885,0.005,0.059,0.028,0.6,58.0,39.0,227663,Good,1
