In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
columns={
    '3.feed_1.SO2_PPM': 'SO2_PPM',
    '3.feed_1.H2S_PPM': 'H2S_PPM',
    '3.feed_1.SIGTHETA_DEG': 'SIGTHETA_DEG',
    '3.feed_1.SONICWD_DEG': 'SONICWD_DEG@',
    '3.feed_1.SONICWS_MPH': 'SONICWS_MPH',
    '3.feed_11067.CO_PPB..3.feed_43.CO_PPB': 'CO_PPB',
    '3.feed_11067.NO2_PPB..3.feed_43.NO2_PPB': 'NO2_PPB',
    '3.feed_11067.NOX_PPB..3.feed_43.NOX_PPB': 'NOX_PPB',
    '3.feed_11067.NO_PPB..3.feed_43.NO_PPB': 'NO_PPB',
    '3.feed_11067.PM25T_UG_M3..3.feed_43.PM25T_UG_M3': 'PM2_5_',
    '3.feed_11067.SIGTHETA_DEG..3.feed_43.SIGTHETA_DEG': 'SIGTHETA_DEG',
    '3.feed_11067.SONICWD_DEG..3.feed_43.SONICWD_DEG': 'SONICWD_DEG@',
    '3.feed_11067.SONICWS_MPH..3.feed_43.SONICWS_MPH': 'SONICWS_MPH',
    '3.feed_29.PM10_UG_M3': 'PM10_',
    '3.feed_3506.PM2_5': 'PM2_5_1',
    '3.feed_3506.OZONE': 'OZONE',
    '3.feed_24.PM10_UG_M3': 'PM_10',
    '3.feed_3508.PM2_5': 'PM_2_5_2',
    '3.feed_3.PM10B_UG_M3..3.feed_3.PM10_640_UG_M3': 'PM10_1',
    '3.feed_23.CO_PPM..3.feed_23.CO_PPB': 'CO_PPM_CO_PPB',
    '3.feed_28.H2S_PPM': 'H2S_PPM',
    '3.feed_28.SO2_PPM': 'SO2_PPM',
    '3.feed_28.SIGTHETA_DEG': 'SIGTHETA_DEG',
    '3.feed_28.SONICWD_DEG': 'SONICWD_DEG@',
    '3.feed_28.SONICWS_MPH': 'SONICWS_MPH',
    '3.feed_1.PM25B_UG_M3..3.feed_1.PM25T_UG_M3..3.feed_1.PM25_640_UG_M3': 'PM2_5_3',
    '3.feed_26.OZONE_PPM': 'OZONE_PPM',
    '3.feed_26.SONICWS_MPH': 'SONICWS_MPH',
    '3.feed_26.SONICWD_DEG': 'SONICWD_DEG@',
    '3.feed_26.SIGTHETA_DEG': 'SIGTHETA_DEG',
    '3.feed_3.SO2_PPM': 'SO2_PPM',
    '3.feed_3.SONICWD_DEG': 'SONICWD_DEG@',
    '3.feed_3.SONICWS_MPH': 'SONICWS_MPH',
    '3.feed_3.SIGTHETA_DEG': 'SIGTHETA_DEG',
    '3.feed_5975.PM2_5': 'PM2_5_4',
    '3.feed_23.PM10_UG_M3': 'PM10_UG_M3',
    '3.feed_27.NO_PPB': 'NO_PPB',
    '3.feed_27.NOY_PPB': 'NOY_PPB',
    '3.feed_27.CO_PPB': 'CO_PPB',
    '3.feed_27.SO2_PPB': 'SO2_PPB',
    '3.feed_29.PM25_UG_M3..3.feed_29.PM25T_UG_M3': 'PM2_5_5',
    '3.feed_26.PM25B_UG_M3..3.feed_26.PM25T_UG_M3..3.feed_59665.PM25_640_UG_M3': 'PM2_5_6',
}

In [3]:
def convertWindDirection(df):
    df_cp = df.copy(deep=True)
    for c in df.columns:
        if "SONICWD_DEG" in c or "@" in c:
            df_c = df[c]
            df_c.name = df_c.name.replace("@", "")
            df_c_cos = np.cos(np.deg2rad(df_c))
            df_c_sin = np.sin(np.deg2rad(df_c))
            df_c_cos.name += "cosine"
            df_c_sin.name += "sine"
            df_cp.drop([c], axis=1, inplace=True)
            df_cp[df_c_cos.name] = df_c_cos
            df_cp[df_c_sin.name] = df_c_sin
    return df_cp

In [4]:
def preprocess(df):

    # null treatment
    df.iloc[:, 1:] = df.iloc[:, 1:].replace(-1.0, np.nan)
    df.iloc[:, 1:] = df.iloc[:, 1:].fillna(df.iloc[:, 1:].mean())
    df['smell'] = df['smell'].fillna('no smell')

    # wind direction conversion
    df = convertWindDirection(df)

    # standardization
    # df_mean = df.iloc[:, 1:].mean()
    # df_std = df.iloc[:, 1:].std()
    # df.iloc[:, 1:] = (df.iloc[:, 1:] - df_mean) / df_std

    scaler = MinMaxScaler()
    df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])


    # # smell distribution
    # import matplotlib.pyplot as plt
    # import seaborn as sns
    # smell_counts = df['smell'].value_counts()
    # plt.figure(figsize=(10, 6))
    # sns.barplot(x=smell_counts.index, y=smell_counts.values)
    # plt.title('Distribution of "Smell" in Zipcode 15221')
    # plt.xlabel('Smell')
    # plt.ylabel('Distribution')
    # plt.xticks(rotation=45)  # Rotar las etiquetas del eje x para mejor legibilidad
    # plt.tight_layout()
    # plt.show()

    # one hot encoding
    df = pd.get_dummies(df, columns=['smell'])
    return df

## Model

In [5]:
import numpy as np
import pandas as pd
from collections import namedtuple, Counter

import copy
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_regression
from sklearn.preprocessing import MinMaxScaler

In [6]:
def get_val(row, target, target_val):
    return int(all(int(row[target[i]]) == int(target_val[i]) for i in range(len(target))))

def train_regression(df,conditional,conditional_values,target,target_val):
    new_lst=[]
    count=0
    for index,row in df.iterrows():
        new_lst.append(get_val(row,target,target_val))
        if new_lst[-1]==1:
            count+=1
    if len(conditional)==0:
        return count*1.0/df.shape[0]
    if len(list(set(new_lst)))==1:
        if new_lst[0]==1:
            return 1
        else:
            return 0
    
    if len(conditional)>0:
        X=df[conditional]
    else:
        X=df

    regr = RandomForestRegressor(random_state=0)
    regr.fit(X.values, new_lst)
    return regr

def get_prob_o_regression(df,conditional,conditional_values,target,target_val):
    new_lst=[]
    count=0
    for index,row in df.iterrows():
        new_lst.append(get_val(row,target,target_val))
        if new_lst[-1]==1:
            count+=1
    if len(conditional)==0:
        return count*1.0/df.shape[0]
    if len(list(set(new_lst)))==1:
        if new_lst[0]==1:   
            return 1
        else:
            return 0
        
    if len(conditional)>0:
        X=df[conditional]
    else:
        X=df
    regr = RandomForestRegressor(random_state=0)
    regr.fit(X.values, new_lst)
    return (regr.predict([conditional_values])[0])

def get_query_output(df, q_type, AT, prelst, prevallst, postlst, postvallst, Ac, c):
    if q_type == 'count':
        conditioning_set = prelst + Ac
        conditioning_val = prevallst + c
        print(f"conditioning set: {conditioning_set}, {conditioning_val}")
        print(f"post condition: {postlst}, {postvallst}")
        regr = train_regression(df, conditioning_set, conditioning_val, postlst, postvallst)
        pogivenck = regr.predict([conditioning_val])[0]

        pcgivenk = get_prob_o_regression(df, prelst, prevallst, Ac, c)
        total_prob = pogivenck * pcgivenk
        
        return total_prob


#### Prepare data ZIPCODE 15221 for the training

In [7]:
df = pd.read_csv('data/merged_data_15221.csv')


df.rename(columns=columns, inplace=True)
df.rename(columns={'smell_1': 'smell'}, inplace=True)
columns_to_evaluate = df.filter(regex='^(?!smell_)').columns.tolist()
df = df.dropna(subset=columns_to_evaluate, how='all')
df.drop(columns=['DateTime','smell_2','smell_3'], inplace=True)

df = preprocess(df)

In [8]:
df

Unnamed: 0,CO_PPB,NO2_PPB,NOX_PPB,NO_PPB,PM2_5_,SIGTHETA_DEG,SONICWS_MPH,PM2_5_4,SONICWD_DEGcosine,SONICWD_DEGsine,...,smell_gas,smell_industrial,smell_no smell,smell_sewage,smell_smog,smell_smoke,smell_sulfur,smell_tar,smell_trash,smell_woodsmoke
0,0.056196,0.290909,0.076155,0.021538,0.252525,0.361179,0.100000,0.152284,0.004866,0.569587,...,0,1,0,0,0,0,0,0,0,0
1,0.047740,0.298701,0.063550,0.004308,0.232323,0.171990,0.142857,0.141004,0.000000,0.500000,...,0,0,0,0,0,0,1,0,0,0
2,0.065618,0.529870,0.215336,0.127385,0.161616,0.369779,0.157143,0.101523,0.000305,0.517450,...,0,1,0,0,0,0,0,0,0,0
3,0.070298,0.587013,0.242122,0.145846,0.202020,0.128993,0.228571,0.124083,0.004866,0.430413,...,0,0,0,0,0,0,0,0,0,0
4,0.250503,0.503896,0.456933,0.417231,0.202020,0.528256,0.057143,0.124083,0.014853,0.379039,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,0.144867,0.382550,0.121339,0.052519,0.179789,0.292292,0.207039,0.110364,0.002151,0.453673,...,0,0,0,0,0,0,0,0,0,0
1656,0.144867,0.382550,0.121339,0.052519,0.179789,0.292292,0.207039,0.110364,0.002151,0.453673,...,0,0,0,0,0,0,0,0,0,0
1657,0.144867,0.382550,0.121339,0.052519,0.179789,0.292292,0.207039,0.110364,0.002151,0.453673,...,0,1,0,0,0,0,0,0,0,0
1658,0.144867,0.382550,0.121339,0.052519,0.179789,0.292292,0.207039,0.110364,0.002151,0.453673,...,0,1,0,0,0,0,0,0,0,0


In [9]:
scores = {}

columns_to_evaluate = ['CO_PPB']
# columns_to_evaluate = df.filter(regex='^(?!smell_)').columns.tolist()

target_columns = ['smell_egg']
# target_columns = df.filter(regex='^smell_').columns.tolist()

target_value = 1

for target_column in target_columns:
    scores[target_column] = {}
    print(f"Evaluating probabilities for {target_column}")
    for col in columns_to_evaluate:
        print(f"Evaluating {col} for {target_column}")
        values = list(set(df[col].values))
        scores[target_column][col] = []
        for v in values:
            result = get_query_output(df, 'count', '', [], [], [target_column], [target_value], [col], [v])
            scores[target_column][col].append({'value': v, 'result': result})
            print(f"Score for {col} = {v}: {result}")

Evaluating probabilities for smell_egg
Evaluating CO_PPB for smell_egg
conditioning set: ['CO_PPB'], [0.036783549451392616]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 0.036783549451392616: 0.6288614457831325
conditioning set: ['CO_PPB'], [0.2564259955497583]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 0.2564259955497583: 0.039927710843373494
conditioning set: ['CO_PPB'], [1.0]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 1.0: 0.00045488382099827864
conditioning set: ['CO_PPB'], [0.06578684876851071]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 0.06578684876851071: 0.07985542168674699
conditioning set: ['CO_PPB'], [0.0]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 0.0: 0.6188795180722891
conditioning set: ['CO_PPB'], [0.0947901480856288]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 0.0947901480856288: 0.1996385542168675
conditioning set: ['CO_PPB'], [0.07804803191897491]
post condition: ['smell_egg'], [1]
Score for CO_PPB = 0.0780

In [10]:
scores

{'smell_egg': {'CO_PPB': [{'value': 0.036783549451392616,
    'result': 0.6288614457831325},
   {'value': 0.2564259955497583, 'result': 0.039927710843373494},
   {'value': 1.0, 'result': 0.00045488382099827864},
   {'value': 0.06578684876851071, 'result': 0.07985542168674699},
   {'value': 0.0, 'result': 0.6188795180722891},
   {'value': 0.0947901480856288, 'result': 0.1996385542168675},
   {'value': 0.07804803191897491, 'result': 0.0},
   {'value': 0.2976904780173406, 'result': 0.0},
   {'value': 0.024522366300928414, 'result': 0.0},
   {'value': 0.5676820379037828, 'result': 0.49565024383247264},
   {'value': 0.1003299317118085, 'result': 0.17967469879518072},
   {'value': 0.10481086472799818, 'result': 0.0},
   {'value': 0.017800966776643903, 'result': 0.08983734939759036},
   {'value': 0.12485229801273691, 'result': 0.24954819277108434},
   {'value': 0.0713266323946904, 'result': 0.6987349397590361},
   {'value': 0.11364996547226272, 'result': 0.009981927710843374},
   {'value': 0.

In [12]:
correlation = df['CO_PPB'].corr(df['smell_egg'])
print(f"Correlation between H2S_PPM and smell_egg: {correlation}")

Correlation between H2S_PPM and smell_egg: -0.010537666834498223


In [13]:
with open('scores_15221.json', 'w') as f:
    json.dump(scores, f)

#### Prepare data ZIPCODE 15133 for the training

In [14]:
df = pd.read_csv('data/merged_data_15133.csv')

df.rename(columns=columns, inplace=True)
df.rename(columns={'smell_1': 'smell'}, inplace=True)
columns_to_evaluate = df.filter(regex='^(?!smell_)').columns.tolist()
df = df.dropna(subset=columns_to_evaluate, how='all')
df.drop(columns=['DateTime','smell_2','smell_3'], inplace=True)

df = preprocess(df)

In [15]:
df

Unnamed: 0,H2S_PPM,SO2_PPM,SIGTHETA_DEG,SONICWS_MPH,PM10_,PM2_5_5,PM_2_5_2,SONICWD_DEGcosine,SONICWD_DEGsine,smell_burning,...,smell_coke,smell_egg,smell_gas,smell_industrial,smell_no smell,smell_sewage,smell_smoke,smell_sulfur,smell_tar,smell_trash
0,0.12500,0.056338,0.202346,0.045249,0.198529,0.159091,0.155039,0.888573,0.184360,0,...,0,0,0,1,0,0,0,0,0,0
1,0.02500,0.014085,0.445748,0.194570,0.007353,0.022727,0.000000,0.242481,0.070289,0,...,0,0,0,0,1,0,0,0,0,0
2,0.45000,0.422535,0.080645,0.303167,0.360294,0.227273,0.263566,0.050603,0.279959,0,...,0,1,0,0,0,0,0,0,0,0
3,0.00000,0.014085,0.291789,0.402715,0.044118,0.075758,0.046512,0.560935,0.002512,0,...,0,0,0,1,0,0,0,0,0,0
4,0.00000,0.028169,0.167155,0.307692,0.102941,0.090909,0.062016,0.969846,0.670660,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.13964,0.109060,0.294124,0.186147,0.211713,0.186804,0.166017,0.005837,0.423152,0,...,0,0,0,0,1,0,0,0,0,0
230,0.13964,0.109060,0.294124,0.186147,0.211713,0.186804,0.166017,0.005837,0.423152,0,...,0,0,0,1,0,0,0,0,0,0
231,0.13964,0.109060,0.294124,0.186147,0.211713,0.186804,0.166017,0.005837,0.423152,0,...,0,0,0,1,0,0,0,0,0,0
232,0.13964,0.109060,0.294124,0.186147,0.211713,0.186804,0.166017,0.005837,0.423152,0,...,0,0,1,0,0,0,0,0,0,0


In [16]:
scores = {}

columns_to_evaluate = ['H2S_PPM']
target_columns = ['smell_egg']
target_value = 1

for target_column in target_columns:
    scores[target_column] = {}
    print(f"Evaluating probabilities for {target_column}")
    for col in columns_to_evaluate:
        print(f"Evaluating {col} for {target_column}")
        values = list(set(df[col].values))
        scores[target_column][col] = []
        for v in values:
            result = get_query_output(df, 'count', '', [], [], [target_column], [target_value], [col], [v])
            scores[target_column][col].append({'value': v, 'result': result})
            print(f"Score for {col} = {v}: {result}")

Evaluating probabilities for smell_egg
Evaluating H2S_PPM for smell_egg
conditioning set: ['H2S_PPM'], [0.125]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.125: 0.11554011147921403
conditioning set: ['H2S_PPM'], [0.44999999999999996]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.44999999999999996: 0.33307051282051275
conditioning set: ['H2S_PPM'], [0.0]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.0: 0.038729773405456205
conditioning set: ['H2S_PPM'], [0.325]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.325: 0.0
conditioning set: ['H2S_PPM'], [0.625]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.625: 0.4946864061864062
conditioning set: ['H2S_PPM'], [0.22499999999999998]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.22499999999999998: 0.0
conditioning set: ['H2S_PPM'], [0.7000000000000001]
post condition: ['smell_egg'], [1]
Score for H2S_PPM = 0.7000000000000001: 0.0
conditioning set: ['H2S_PPM'], [0.475]
post conditi

In [17]:
scores

{'smell_egg': {'H2S_PPM': [{'value': 0.125, 'result': 0.11554011147921403},
   {'value': 0.44999999999999996, 'result': 0.33307051282051275},
   {'value': 0.0, 'result': 0.038729773405456205},
   {'value': 0.325, 'result': 0.0},
   {'value': 0.625, 'result': 0.4946864061864062},
   {'value': 0.22499999999999998, 'result': 0.0},
   {'value': 0.7000000000000001, 'result': 0.0},
   {'value': 0.475, 'result': 0.45671445529778865},
   {'value': 0.2, 'result': 0.0},
   {'value': 0.25, 'result': 0.0},
   {'value': 1.0, 'result': 0.0},
   {'value': 0.375, 'result': 0.0},
   {'value': 0.35000000000000003, 'result': 0.0},
   {'value': 0.05, 'result': 0.0},
   {'value': 0.8500000000000001, 'result': 0.0},
   {'value': 0.7250000000000001, 'result': 0.0},
   {'value': 0.575, 'result': 0.040303215303215296},
   {'value': 0.95, 'result': 0.0},
   {'value': 0.8250000000000001, 'result': 0.0},
   {'value': 0.15, 'result': 0.14446116383616375},
   {'value': 0.1, 'result': 0.0007112332112332113},
   {'va

In [18]:
correlation = df['H2S_PPM'].corr(df['smell_egg'])
print(f"Correlation between H2S_PPM and smell_egg: {correlation}")

Correlation between H2S_PPM and smell_egg: 0.014338741059645095


In [19]:
with open('scores_15133.json', 'w') as f:
    json.dump(scores, f)