## Import necessary utilities

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import xgboost
import shap
from sklearn.metrics import accuracy_score
import pickle
import itertools
import threading
import time
import sys
from sklearn.ensemble import RandomForestClassifier as RF

In [2]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    #df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

## Train/Test Postcode Spliting

In [3]:
def loadData():
    
    #gwtp_SO2 = pd.read_csv('gwtp_SO2.csv')
    gwtp = pd.read_csv('gwtp.csv')
    gwtp_O3 = pd.read_csv('gwtp_O3.csv')
    gwtp_NO2 = pd.read_csv('gwtp_NO2.csv')
    gwtp_PM10 = pd.read_csv('gwtp_PM10.csv')
    gwtp_PM25 = pd.read_csv('gwtp_PM25.csv')
    return gwtp,gwtp_O3,gwtp_NO2,gwtp_SO2,gwtp_PM10,gwtp_PM25
    #return gwtp,gwtp_O3,gwtp_NO2,gwtp_PM10,gwtp_PM25

def postcodeSplit(data):
    #all_postcodes = data.postcode.unique().tolist()
    all_postcodes = data.postcode.unique().tolist()
    test_postcodes = all_postcodes[0:3]
    valid_postcodes = all_postcodes[4:9]
    return test_postcodes,valid_postcodes

# Model Building

### Data Prepration

In [4]:
def stringToNumeric(data):
    dummy_main = pd.get_dummies(data.main)
    dummy_description = pd.get_dummies(data.description)
    dummy_icon = pd.get_dummies(data.icon)
    dummy_dayoftheweek= pd.get_dummies(data.dayoftheweek)
    dummy_frc= pd.get_dummies(data.frc)
    data_merged = pd.concat([data, dummy_main,dummy_description,dummy_icon,dummy_dayoftheweek,dummy_frc], axis='columns')
    data_merged=data_merged.drop(['main', 'description','icon','dayoftheweek','frc'], axis='columns')
    data_merged=data_merged.drop(['latitude4', 'longitude4'], axis=1)
    return data_merged

In [5]:
def aqiClasses(pollutant,data):
    if pollutant == 'O3':
        pollutant_data = data[data[pollutant].notna()].reset_index(drop=True)
        pollutant_data.loc[pollutant_data[pollutant] <=33, 'AQI'] = 1
        pollutant_data.loc[(pollutant_data[pollutant]>33)&(pollutant_data[pollutant]<=66), 'AQI'] = 2
        pollutant_data.loc[(pollutant_data[pollutant]>66)&(pollutant_data[pollutant]<=100), 'AQI'] = 3
        pollutant_data.loc[(pollutant_data[pollutant]>100)&(pollutant_data[pollutant]<=120), 'AQI'] = 4
        pollutant_data.loc[(pollutant_data[pollutant]>120)&(pollutant_data[pollutant]<=140), 'AQI'] = 5
        pollutant_data.loc[(pollutant_data[pollutant]>140)&(pollutant_data[pollutant]<=160), 'AQI'] = 6
        pollutant_data.loc[(pollutant_data[pollutant]>160)&(pollutant_data[pollutant]<=187), 'AQI'] = 7
        pollutant_data.loc[(pollutant_data[pollutant]>187)&(pollutant_data[pollutant]<=213), 'AQI'] = 8
        pollutant_data.loc[(pollutant_data[pollutant]>213)&(pollutant_data[pollutant]<=240), 'AQI'] = 9
        pollutant_data.loc[pollutant_data[pollutant] >240, 'AQI'] = 10
    elif pollutant == 'SO2':
        pollutant_data = data[data[pollutant].notna()].reset_index(drop=True)
        pollutant_data.loc[pollutant_data[pollutant] <=88, 'AQI'] = 1
        pollutant_data.loc[(pollutant_data[pollutant]>67)&(pollutant_data[pollutant]<=177), 'AQI'] = 2
        pollutant_data.loc[(pollutant_data[pollutant]>177)&(pollutant_data[pollutant]<=266), 'AQI'] = 3
        pollutant_data.loc[(pollutant_data[pollutant]>266)&(pollutant_data[pollutant]<=354), 'AQI'] = 4
        pollutant_data.loc[(pollutant_data[pollutant]>354)&(pollutant_data[pollutant]<=443), 'AQI'] = 5
        pollutant_data.loc[(pollutant_data[pollutant]>443)&(pollutant_data[pollutant]<=532), 'AQI'] = 6
        pollutant_data.loc[(pollutant_data[pollutant]>532)&(pollutant_data[pollutant]<=710), 'AQI'] = 7
        pollutant_data.loc[(pollutant_data[pollutant]>710)&(pollutant_data[pollutant]<=887), 'AQI'] = 8
        pollutant_data.loc[(pollutant_data[pollutant]>887)&(pollutant_data[pollutant]<=1064), 'AQI'] = 9
        pollutant_data.loc[pollutant_data[pollutant] >1064, 'AQI'] = 10
    elif pollutant == 'NO2': 
        pollutant_data = data[data[pollutant].notna()].reset_index(drop=True)
        pollutant_data.loc[pollutant_data[pollutant] <=67, 'AQI'] = 1
        pollutant_data.loc[(pollutant_data[pollutant]>67)&(pollutant_data[pollutant]<=134), 'AQI'] = 2
        pollutant_data.loc[(pollutant_data[pollutant]>134)&(pollutant_data[pollutant]<=200), 'AQI'] = 3
        pollutant_data.loc[(pollutant_data[pollutant]>201)&(pollutant_data[pollutant]<=267), 'AQI'] = 4
        pollutant_data.loc[(pollutant_data[pollutant]>267)&(pollutant_data[pollutant]<=334), 'AQI'] = 5
        pollutant_data.loc[(pollutant_data[pollutant]>334)&(pollutant_data[pollutant]<=400), 'AQI'] = 6
        pollutant_data.loc[(pollutant_data[pollutant]>400)&(pollutant_data[pollutant]<=467), 'AQI'] = 7
        pollutant_data.loc[(pollutant_data[pollutant]>467)&(pollutant_data[pollutant]<=534), 'AQI'] = 8
        pollutant_data.loc[(pollutant_data[pollutant]>534)&(pollutant_data[pollutant]<=600), 'AQI'] = 9
        pollutant_data.loc[pollutant_data[pollutant] >600, 'AQI'] = 10
    elif pollutant == 'PM10':
        pollutant_data = data[data[pollutant].notna()].reset_index(drop=True)
        pollutant_data.loc[pollutant_data[pollutant] <=16, 'AQI'] = 1
        pollutant_data.loc[(pollutant_data[pollutant]>16)&(pollutant_data[pollutant]<=33), 'AQI'] = 2
        pollutant_data.loc[(pollutant_data[pollutant]>33)&(pollutant_data[pollutant]<=50), 'AQI'] = 3
        pollutant_data.loc[(pollutant_data[pollutant]>50)&(pollutant_data[pollutant]<=58), 'AQI'] = 4
        pollutant_data.loc[(pollutant_data[pollutant]>58)&(pollutant_data[pollutant]<=66), 'AQI'] = 5
        pollutant_data.loc[(pollutant_data[pollutant]>66)&(pollutant_data[pollutant]<=75), 'AQI'] = 6
        pollutant_data.loc[(pollutant_data[pollutant]>75)&(pollutant_data[pollutant]<=83), 'AQI'] = 7
        pollutant_data.loc[(pollutant_data[pollutant]>83)&(pollutant_data[pollutant]<=91), 'AQI'] = 8
        pollutant_data.loc[(pollutant_data[pollutant]>91)&(pollutant_data[pollutant]<=100), 'AQI'] = 9
        pollutant_data.loc[pollutant_data[pollutant] >100, 'AQI'] = 10
    elif pollutant == 'PM25':
        pollutant_data = data[data[pollutant].notna()].reset_index(drop=True)
        pollutant_data.loc[pollutant_data[pollutant] <=11, 'AQI'] = 1
        pollutant_data.loc[(pollutant_data[pollutant]>11)&(pollutant_data[pollutant]<=23), 'AQI'] = 2
        pollutant_data.loc[(pollutant_data[pollutant]>23)&(pollutant_data[pollutant]<=35), 'AQI'] = 3
        pollutant_data.loc[(pollutant_data[pollutant]>35)&(pollutant_data[pollutant]<=41), 'AQI'] = 4
        pollutant_data.loc[(pollutant_data[pollutant]>41)&(pollutant_data[pollutant]<=47), 'AQI'] = 5
        pollutant_data.loc[(pollutant_data[pollutant]>47)&(pollutant_data[pollutant]<=53), 'AQI'] = 6
        pollutant_data.loc[(pollutant_data[pollutant]>53)&(pollutant_data[pollutant]<=58), 'AQI'] = 7
        pollutant_data.loc[(pollutant_data[pollutant]>58)&(pollutant_data[pollutant]<=64), 'AQI'] = 8
        pollutant_data.loc[(pollutant_data[pollutant]>64)&(pollutant_data[pollutant]<=70), 'AQI'] = 9
        pollutant_data.loc[pollutant_data[pollutant] >71, 'AQI'] = 10
    
    return pollutant_data



### Data Spliting 

In [6]:
def dataSpliting(data,pollutant,test_pc,valid_pc):
    test_postcodes = pd.DataFrame(test_pc)       #pd.DataFrame(data.postcode.unique()).sample(3)

    X_test = data[data.postcode.isin(test_postcodes[test_postcodes.columns[0]].values.tolist()) ].reset_index(drop=True)
    #print('X_test: ',len(X_test .postcode.unique()), 'Samples: ', len(X_test) )
    #print(X_test.postcode.unique())
    X_test_copy = X_test[['postcode','datetime']]
    X_test=X_test.drop(['postcode',pollutant,'dt','datetime'], axis=1)
    #print('X_test Got Nan in',X_test.columns[X_test.isna().any()].tolist())
    for i in X_test.columns[X_test.isna().any()].tolist():
        X_test[i] = X_test[i].fillna(0)
    X_test = clean_dataset(X_test).reset_index(drop=True)
    y_test = X_test[['AQI']]

    X_test=X_test.drop(['AQI'], axis=1)


    X = data[~data['postcode'].isin(test_postcodes[test_postcodes.columns[0]].values.tolist()) ]
    valid_postcodes = pd.DataFrame(valid_pc) #pd.DataFrame(X.postcode.unique()).sample(5)

    X_valid = X[X['postcode'].isin(valid_postcodes[valid_postcodes.columns[0]].values.tolist()) ].reset_index(drop=True)
    #print('X_valid: ',len(X_valid.postcode.unique()), 'Samples: ', len(X_valid)  )
    #print(X_valid.postcode.unique())
    X_valid_copy = X_valid[['postcode','datetime']]
    X_valid=X_valid.drop(['postcode',pollutant,'dt','datetime'], axis=1)
    #print('X_valid Got Nan in',X_valid.columns[X_valid.isna().any()].tolist())
    for i in X_valid.columns[X_valid.isna().any()].tolist():
        X_valid[i] = X_valid[i].fillna(0)
        #X_valid[i] = X_valid[i].fillna(X_valid[i].mean()) #mui
    X_valid  = clean_dataset(X_valid).reset_index(drop=True)
    y_valid = X_valid[['AQI']]
    X_valid=X_valid.drop(['AQI'], axis=1)

    X_train = X[~X['postcode'].isin(valid_postcodes[valid_postcodes.columns[0]].values.tolist()) ].reset_index(drop=True)
    #print('X_train: ',len(X_train.postcode.unique()), 'Samples: ', len(X_train)  )
    #print(X_train.postcode.unique())
    X_train =X_train.drop(['postcode',pollutant,'dt','datetime'], axis=1)
    #print('X_train Got Nan in',X_train.columns[X_train.isna().any()].tolist())
    for i in X_train.columns[X_train.isna().any()].tolist():
        X_train[i] = X_train[i].fillna(0)
    X_train  = clean_dataset(X_train).reset_index(drop=True)
    y_train = X_train[['AQI']]
    X_train =X_train.drop(['AQI'], axis=1)
    return X_test,y_test,X_train,y_train,X_valid,y_valid,X_valid_copy,X_test_copy


### Best Feature Selection 

In [7]:
def featureSelection(X_train,y_train,X_valid,X_test):

    shap.initjs()

    model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X_train, label=y_train), 100)

    explainer = shap.TreeExplainer(model,X_train)

    shap_values = explainer.shap_values(X_train)

    def shapley_feature_ranking(shap_values, X):
        feature_order = np.argsort(np.mean(np.abs(shap_values), axis=0))
        return pd.DataFrame(
            {
                "features": [X.columns[i] for i in feature_order][::-1],
                "importance": [
                    np.mean(np.abs(shap_values), axis=0)[i] for i in feature_order
                ][::-1],
            }
        )

    #print(shapley_feature_ranking(shap_values,X_train))
    selected_features = shapley_feature_ranking(shap_values,X_train)
    selected_features = selected_features.features[selected_features.importance >0]
    #shap.summary_plot(shap_values, X_train,max_display=len(selected_features), plot_type="bar")
    print('Selected Features')
    print(selected_features)
    train=X_train[selected_features]
    print('X_train Samples: ', len(X_train)  )
    valid=X_valid[selected_features]
    print('X_valid Samples: ', len(X_valid)  )
    test=X_test[selected_features]
    print('X_test Samples: ', len(X_test)  )
    return X_train,X_valid,X_test

### Training and Saving

In [39]:
def scaleData(train,valid,test):
    #sc = StandardScaler()
    sc = MinMaxScaler(feature_range=(0,1))#
    sc.fit_transform(train)
    X_train = sc.transform(train)
    X_valid = sc.transform(valid)
    X_test = sc.transform(test)
    return X_train,X_valid, X_test

def trainModel(X_train,y_train,pollutant):
    #clf = svm.SVC(decision_function_shape='ovo')
    clf = RF()
    clf.fit(X_train, y_train.values.ravel())
    filename = pollutant+'_trained_model.sav'
    pickle.dump(clf, open(filename, 'wb'))
    return



In [40]:
def modelRun(data,pollutant,test_postcodes,valid_postcodes):
    modified_data = stringToNumeric(data) 
    classed_data = aqiClasses(pollutant,modified_data)
    X_test,y_test,X_train,y_train,X_valid,y_valid,X_valid_copy,X_test_copy = dataSpliting(classed_data,pollutant,test_postcodes,valid_postcodes)
    #X_train,X_valid,X_test = featureSelection(X_train,y_train,X_valid,X_test)
    X_train,X_valid, X_test = scaleData(X_train,X_valid,X_test)
    trainModel(X_train,y_train,pollutant)
    return  X_test,y_test,X_valid,y_valid,X_valid_copy,X_test_copy
    

### Model Validation: I have commented out SO2. I got an error here

In [41]:
def loadTrainedModel():

    O3_model = pickle.load(open('O3_trained_model.sav', 'rb'))
    SO2_model = pickle.load(open('SO2_trained_model.sav', 'rb'))
    NO2_model = pickle.load(open('NO2_trained_model.sav', 'rb'))
    PM10_model = pickle.load(open('PM10_trained_model.sav', 'rb'))
    PM25_model = pickle.load(open('PM25_trained_model.sav', 'rb'))
    return O3_model,SO2_model,NO2_model,PM10_model,PM25_model
    #return O3_model,NO2_model,PM10_model,PM25_model


In [42]:
def validateModel(model,X_valid,y_valid,X_valid_copy):
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid.values.ravel(),y_pred )*100
    X_valid_copy['predicted_aqi'] = y_pred
    X_valid_copy.loc[X_valid_copy['predicted_aqi'] <=3, 'air_pollution'] = 'Low'
    X_valid_copy.loc[(X_valid_copy['predicted_aqi']>3)&(X_valid_copy['predicted_aqi']<=6), 'air_pollution'] = 'Moderate'
    X_valid_copy.loc[(X_valid_copy['predicted_aqi']>6)&(X_valid_copy['predicted_aqi']<=9), 'air_pollution'] = 'High'
    X_valid_copy.loc[X_valid_copy['predicted_aqi']==10, 'air_pollution'] = 'Very High'
    return X_valid_copy,accuracy



# Experiments

In [43]:
all_pollutant_data,O3,NO2,SO2,PM10,PM25 = loadData()
#all_pollutant_data,O3,NO2,PM10,PM25 = loadData()
test_postcodes,valid_postcodes = postcodeSplit(all_pollutant_data)
#test_postcodes,valid_postcodes = postcodeSplit(O3)

print('O3')
X_test_O3,y_test_O3,X_valid_O3,y_valid_O3,X_valid_copy_O3,X_test_copy_O3 = modelRun(O3,'O3',test_postcodes,valid_postcodes)
print('NO2')
X_test_NO2,y_test_NO2,X_valid_NO2,y_valid_NO2,X_valid_copy_NO2,X_test_copy_NO2 = modelRun(NO2,'NO2',test_postcodes,valid_postcodes)
print('SO2')
X_test_SO2,y_test_SO2,X_valid_SO2,y_valid_SO2,X_valid_copy_SO2,X_test_copy_SO2 = modelRun(SO2,'SO2',test_postcodes,valid_postcodes)
print('PM10')
X_test_PM10,y_test_PM10,X_valid_PM10,y_valid_PM10,X_valid_copy_PM10,X_test_copy_PM10 = modelRun(PM10,'PM10',test_postcodes,valid_postcodes)
print('PM25')
X_test_PM25,y_test_PM25,X_valid_PM25,y_valid_PM25,X_valid_copy_PM25,X_test_copy_PM25 = modelRun(PM25,'PM25',test_postcodes,valid_postcodes)


O3
NO2
PM10
PM25


In [None]:
type(X_test_03)

### Validation Results

In [44]:
def removeDuplicate(df):
    df.drop_duplicates(subset=['postcode','datetime'], keep='last').reset_index(drop=True)
    return df

In [45]:
O3_model,SO2_model,NO2_model,PM10_model,PM25_model = loadTrainedModel()
#O3_model,NO2_model,PM10_model,PM25_model = loadTrainedModel()
O3_result,O3_accuracy = validateModel(O3_model,X_valid_O3,y_valid_O3,X_valid_copy_O3)
SO2_result,SO2_accuracy = validateModel(SO2_model,X_valid_SO2,y_valid_SO2,X_valid_copy_SO2)
NO2_result,NO2_accuracy = validateModel(NO2_model,X_valid_NO2,y_valid_NO2,X_valid_copy_NO2)
PM10_result,PM10_accuracy = validateModel(PM10_model,X_valid_PM10,y_valid_PM10,X_valid_copy_PM10)
PM25_result,PM25_accuracy = validateModel(PM25_model,X_valid_PM25,y_valid_PM25,X_valid_copy_PM25)

In [None]:
print('Validation Accuracy')
print('O3: ',round(O3_accuracy, 2),'NO2: ','SO2: ',round(SO2_accuracy, 2),round(NO2_accuracy, 2),'PM10: ',round(PM10_accuracy, 2),'PM25: ',round(PM25_accuracy, 2))
print('Over ALL Accuracy: ',np.mean([O3_accuracy,SO2_accuracy,NO2_accuracy,PM10_accuracy,PM25_accuracy]))

In [50]:
print('Validation Accuracy')
print('O3: ',round(O3_accuracy, 2),'NO2: ',round(NO2_accuracy, 2),'PM10: ',round(PM10_accuracy, 2),'PM25: ',round(PM25_accuracy, 2))
print('Over ALL Accuracy: ',np.mean([O3_accuracy,NO2_accuracy,PM10_accuracy,PM25_accuracy]))

Validation Accuracy
O3:  27.07 NO2:  93.83 PM10:  83.65 PM25:  92.56
Over ALL Accuracy:  74.27610637805554


In [17]:
SO2_result = removeDuplicate(SO2_result)
NO2_result = removeDuplicate(NO2_result)
PM10_result = removeDuplicate(PM10_result)
PM25_result = removeDuplicate(PM25_result)
O3_result = removeDuplicate(O3_result)

all_results = O3_result.append(SO2_result.append(NO2_result.append(PM10_result.append(PM25_result))))
#all_results = removeDuplicate(all_results)
all_results[all_results.air_pollution == 'High']

Unnamed: 0,postcode,datetime,predicted_aqi,air_pollution
3585,CV59HS,18/05/2022 20:00,8.0,High
3586,CV59HS,18/05/2022 20:00,8.0,High
3587,CV59HS,18/05/2022 20:00,8.0,High
4305,CT11XN,23/05/2022 03:00,8.0,High
712,BS28XW,17/05/2022 08:00,7.0,High
713,BS28XW,17/05/2022 08:00,7.0,High
714,BS28XW,17/05/2022 08:00,7.0,High
715,BS28XW,17/05/2022 08:00,7.0,High
716,BS28XW,17/05/2022 08:00,7.0,High


In [65]:
#SO2_result = removeDuplicate(SO2_result)
NO2_result = removeDuplicate(NO2_result)
PM10_result = removeDuplicate(PM10_result)
PM25_result = removeDuplicate(PM25_result)
O3_result = removeDuplicate(O3_result)

#all_results = O3_result.append(NO2_result.append(PM10_result.append(PM25_result)))
all_results = pd.concat([O3_result,NO2_result,PM10_result,PM25_result], axis = 0)
#all_results = removeDuplicate(all_results)
all_results[all_results.air_pollution == 'Low']


Unnamed: 0,postcode,datetime,predicted_aqi,air_pollution
0,CT11XN,01/05/2022 03:00,3.0,Low
1,CT11XN,01/05/2022 04:00,3.0,Low
2,CT11XN,28/05/2022 01:00,1.0,Low
3,CT11XN,28/05/2022 02:00,1.0,Low
4,CT11XN,28/05/2022 03:00,1.0,Low
...,...,...,...,...
3771,CT11XN,12/05/2022 13:00,1.0,Low
3772,CT11XN,12/05/2022 13:00,1.0,Low
3773,CT11XN,12/05/2022 14:00,1.0,Low
3774,CT11XN,12/05/2022 14:00,1.0,Low


In [25]:
def getAQI(pc,dateAndTime,result):
    ap = result.air_pollution[(result.postcode == pc) & (result.datetime == dateAndTime)].values
    
    if len(ap) > 1 :
        ap = [ap[0]]
    return ap
def resultExtraction(pc,dateAndTime):
    #ap_so2 = getAQI(pc,dateAndTime,SO2_result)
    ap_no2 = getAQI(pc,dateAndTime,NO2_result)
    ap_o3 = getAQI(pc,dateAndTime,O3_result)
    ap_pm10 = getAQI(pc,dateAndTime,PM10_result)
    ap_pm25 = getAQI(pc,dateAndTime,PM25_result)
    
    #all_in= [ap_so2,ap_no2,ap_o3,ap_pm10,ap_pm25]
    all_in= [ap_no2,ap_o3,ap_pm10,ap_pm25]
    all_in= [item for sublist in all_in for item in sublist]
    if 'Very High' in all_in:
        return 'Very High'
    elif 'High' in all_in:
        return 'High'
    elif 'Moderate' in all_in:
        return 'Moderate'
    elif 'Low' in all_in:
        return 'Low'

In [26]:
def animate():
    for c in itertools.cycle(['|', '/', '-', '\\']):
        if done:
            break
        sys.stdout.write('\rAir Pollution Level Forecast in Progress ' + c)
        sys.stdout.flush()
        time.sleep(0.1)
    sys.stdout.write('\rDone!     ')



In [64]:
pc = input("Please enter Postcode:\n")
print(f'You entered {pc}')
dateAndTime = input("Please enter date and time:\n")
print(f'You entered {dateAndTime} \n')
done = False
t = threading.Thread(target=animate)
t.start()
time.sleep(1)
done = True
print('\nAir Pollution level forecast for '+pc+' at '+dateAndTime+' is '+resultExtraction(pc,dateAndTime) )
#print(type(pc))
#print(type(dateAndTime))

Please enter Postcode:
CV59HS
You entered CV59HS
Please enter date and time:
18/05/2022 20:00
You entered 18/05/2022 20:00 

Air Pollution Level Forecast in Progress /
Air Pollution level forecast for CV59HS at 18/05/2022 20:00 is Low
Done!     

In [None]:
CV59HS	18/05/2022 20:00

In [215]:
final_result = pd.DataFrame(columns=['postcode','datetime','air_pollution'])
for index, row in all_results.iterrows():
    SO2_ap = str(SO2_result.air_pollution[(SO2_result.postcode==row.postcode) &(SO2_result.datetime==row.datetime)].values)
    
    #print('start')
    #print(row)
    #print(SO2_result[(SO2_result.postcode==row.postcode)&(SO2_result.datetime==row.datetime)])
    #print(SO2_result.air_pollution[(SO2_result.postcode==row.postcode) &(SO2_result.datetime==row.datetime)])
    print(SO2_ap)
    #print('end')
    NO2_ap = str(NO2_result.air_pollution[(NO2_result.postcode==row.postcode) &(NO2_result.datetime==row.datetime)].values)
    #print(NO2_ap)
    O3_ap = str(O3_result.air_pollution[(O3_result.postcode==row.postcode) &(O3_result.datetime==row.datetime)].values)
    #print(O3_ap)
    PM10_ap = str(PM10_result.air_pollution[(PM10_result.postcode==row.postcode) &(PM10_result.datetime==row.datetime)].values)
    #print(PM10_ap)
    PM25_ap = str(PM25_result.air_pollution[(PM25_result.postcode==row.postcode) &(PM25_result.datetime==row.datetime)].values)
    #print(PM25_ap)
    ap = [SO2_ap,NO2_ap,O3_ap,PM10_ap,PM25_ap]

    if 'Very High' in ap :
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'Very High'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
    elif 'High' in ap:
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'High'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
    elif 'Moderate' in ap:
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'Moderate'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
    elif 'Low' in ap:
        print('in low')
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'Low'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
final_result      

NameError: name 'SO2_result' is not defined

In [266]:
ap

['[]',
 '4502    Low\n4503    Low\n4504    Low\n4505    Low\nName: air_pollution, dtype: object',
 'Series([], Name: air_pollution, dtype: object)',
 'Series([], Name: air_pollution, dtype: object)',
 '3772    Low\n3773    Low\n3774    Low\n3775    Low\nName: air_pollution, dtype: object']

In [210]:
final_result = pd.DataFrame(columns=['postcode','datetime','air_pollution'])
for index, row in all_results.iterrows():
    #SO2_ap = str(SO2_result.air_pollution[(SO2_result.postcode==row.postcode) &(SO2_result.datetime==row.datetime)].values)
    
    #print('start')
    #print(row)
    #print(SO2_result[(SO2_result.postcode==row.postcode)&(SO2_result.datetime==row.datetime)])
    #print(SO2_result.air_pollution[(SO2_result.postcode==row.postcode) &(SO2_result.datetime==row.datetime)])
    #print(SO2_ap)
    #print('end')
    NO2_ap = str(NO2_result.air_pollution[(NO2_result.postcode==row.postcode) &(NO2_result.datetime==row.datetime)].values)
    #print(NO2_ap)
    O3_ap = str(O3_result.air_pollution[(O3_result.postcode==row.postcode) &(O3_result.datetime==row.datetime)].values)
    #print(O3_ap)
    PM10_ap = str(PM10_result.air_pollution[(PM10_result.postcode==row.postcode) &(PM10_result.datetime==row.datetime)].values)
    #print(PM10_ap)
    PM25_ap = str(PM25_result.air_pollution[(PM25_result.postcode==row.postcode) &(PM25_result.datetime==row.datetime)].values)
    #print(PM25_ap)
    #ap = [SO2_ap,NO2_ap,O3_ap,PM10_ap,PM25_ap]
    ap = [NO2_ap,O3_ap,PM10_ap,PM25_ap]

    if 'Very High' in ap :
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'Very High'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
    elif 'High' in ap:
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'High'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
    elif 'Moderate' in ap:
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'Moderate'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
    elif 'Low' in ap:
        print('in low')
        final_result = final_result.append(pd.DataFrame([row.postcode,row.datetime,'Low'], columns=['postcode','datetime','air_pollution']), ignore_index=True)
final_result  

Unnamed: 0,postcode,datetime,air_pollution


In [211]:
PM25_ap

"['Low' 'Low' 'Low']"

In [212]:
NO2_result.air_pollution.unique()

array(['Low'], dtype=object)

### Model Testing

In [214]:
print('Testing')
y_pred = clf.predict(X_test)
print(y_pred)
accuracy_score(y_test.values.ravel(),y_pred )*100

Testing


NameError: name 'clf' is not defined