In [12]:
### Setup
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score, r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from matplotlib.ticker import PercentFormatter
# Plot defaults: 8x5 figures, white figure/axes backgrounds, black axes border
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
# Register the pandas converters with matplotlib
pd.plotting.register_matplotlib_converters()
# Display floats with two decimal places
pd.set_option("display.float_format", lambda value: "%.2f" % value)

In [None]:
### Get Data
df_train = pd.read_csv('data/Train.csv')

### Columns to keep
### Start with the first 14 columns of the file (selected by position) ...
col_keep = list(df_train.columns[:14])

### ... then append the selected satellite measurement columns, grouped by product
col_keep.extend([
    # AER
    'L3_AER_AI_absorbing_aerosol_index',
    # CLOUD
    'L3_CLOUD_cloud_base_height',
    'L3_CLOUD_cloud_fraction',
    'L3_CLOUD_cloud_optical_depth',
    # NO2
    'L3_NO2_NO2_column_number_density',
    'L3_NO2_absorbing_aerosol_index',
    'L3_NO2_cloud_fraction',
    # CO
    'L3_CO_CO_column_number_density',
    'L3_CO_cloud_height',
    # HCHO
    'L3_HCHO_tropospheric_HCHO_column_number_density',
    'L3_HCHO_cloud_fraction',
    # O3
    'L3_O3_O3_column_number_density',
    'L3_O3_cloud_fraction',
    # SO2
    'L3_SO2_SO2_column_number_density',
    'L3_SO2_absorbing_aerosol_index',
    'L3_SO2_cloud_fraction',
])

In [None]:
###
# plt.clf()
# for col in col_keep[18:]:
    
#    plotname = "./plots/hist_" + col + ".png"
#    print (plotname)
#    sns.histplot(data = df_train[col])
#    plt.savefig(plotname)
#    plt.clf()
###
    

In [None]:
### Define sets of columns for preprocessing
### Feature columns for the train/test split:
### entry 2 of col_keep followed by everything from position 8 onwards
col_X = [col_keep[2], *col_keep[8:]]

### Columns with meaningless 0 values
col_zero = [
    "L3_NO2_absorbing_aerosol_index",
    "L3_CO_CO_column_number_density",
    "L3_O3_O3_column_number_density",
    "L3_O3_cloud_fraction",
    "L3_SO2_absorbing_aerosol_index",
]

### Numeric columns that are imputed together to remove NaNs: every feature in col_X
### except its first entry. At this point the list still contains the col_zero columns.
col_nan = col_X[1:]

### Categorical column that gets encoded to integer values
col_cat = ["Place_ID"]

### The two columns handed to the wind-speed transformer (positions 12 and 13 of col_keep)
col_wind = col_keep[12:14]

In [None]:
# Show both column lists, one per line
print(col_zero, col_nan, sep="\n")

### Are all col_zeros in nan?
def x_in_y(x, y):
    """Count how many elements of ``x`` also occur in ``y``.

    Parameters
    ----------
    x : sized iterable
        Elements to look up. Every element is tested, so duplicates in ``x``
        are counted once per occurrence.
    y : container
        Anything that supports the ``in`` operator.

    Returns
    -------
    tuple of (int, int)
        ``(hits, len(x))`` where ``hits`` is the number of elements of ``x``
        that are contained in ``y``.
    """
    # Only membership in y is needed, so y does not have to support len()
    hits = sum(1 for item in x if item in y)
    return hits, len(x)

### How many of col_zero are in col_nan
print(x_in_y(col_zero, col_nan))

### Length of col_nan before the col_zero columns are taken out
print(len(col_nan))

### Remove col_zero values from col_nan.
### Filtering (instead of calling list.remove per column) keeps this cell safe to re-run:
### list.remove raises ValueError once a column has already been removed.
### The slice assignment updates the existing list object in place.
col_nan[:] = [col for col in col_nan if col not in col_zero]

### Have they changed?
print(len(col_nan))
print(x_in_y(col_zero, col_nan))


In [None]:
### Create a custom Transformer to include in the Pipeline for Transforming Wind_Speed
class WindSpeedTransformer(BaseEstimator, TransformerMixin):
    """Combine the u and v wind components into a single wind-speed feature.

    The combined value is ``sqrt(u**2 + v**2)``, computed row by row.

    Parameters
    ----------
    add_combined_windspeed : bool, default=True
        If True, ``transform`` returns only the combined wind-speed column.
        If False, ``transform`` returns its input unchanged.
    u_col : str, default="u_component_of_wind_10m_above_ground"
        Name of the column holding the u component.
    v_col : str, default="v_component_of_wind_10m_above_ground"
        Name of the column holding the v component.
    """

    def __init__(
        self,
        add_combined_windspeed=True,
        u_col="u_component_of_wind_10m_above_ground",
        v_col="v_component_of_wind_10m_above_ground",
    ):  # no *args or **kwargs
        self.add_combined_windspeed = add_combined_windspeed
        self.u_col = u_col
        self.v_col = v_col

    def fit(self, X, y=None):
        """Return ``self``; nothing is learned, so nothing carries over to test data."""
        return self

    def transform(self, X):
        """Return the combined wind speed as a 2D array, or ``X`` itself in pass-through mode.

        ``X`` must support lookup by column name (``X[self.u_col]``) when
        ``add_combined_windspeed`` is True.
        """
        # Pass-through mode must not touch the wind columns at all,
        # otherwise an input without them fails with a needless KeyError.
        if not self.add_combined_windspeed:
            return X

        combined_windspeed = np.sqrt(X[self.u_col] ** 2 + X[self.v_col] ** 2)
        # np.c_ turns the 1D result into a single-column 2D array
        return np.c_[combined_windspeed]

In [None]:
# Feature frame and target column
df_x = df_train[col_X]
y = df_train["target"]

# Shuffle and hold out 30 % of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=666,
)

In [None]:
### One small pipeline per column group (scaling steps are left commented out)

### Columns with meaningless zeros and NaNs
# col_zero
# 1 replace NaN with zero
# 2 replace all zeros with the median (zeros count as missing, so they do not enter the median)
# 3 scale the data (currently disabled)
zero_pipeline = Pipeline([
    ("col_zero_nan_imputer", SimpleImputer(strategy="constant", fill_value=0)),  # Replace NaNs with zero
    ("col_zero_zero_imputer", SimpleImputer(strategy="median", missing_values=0))  # Replace zeros with median
    # , ("col_zero_standard_scaler", StandardScaler())  # Scaling
])

### Columns without meaningless zeros but with NaNs;
### can also include the features without meaningless zeros and without NaNs (weather measurements)
# col_nan
# 1 replace NaN with median
# 2 scale the data (currently disabled)
nan_pipeline = Pipeline([
    ("col_nan_imputer", SimpleImputer(strategy="median"))  # Replace NaN with median
    # , ("col_nan_standard_scaler", StandardScaler())  # Scaling
])

### Columns for transforming wind speed
# col_wind
# 1 create new windspeed column from both: np.sqrt((u)**2 + (v)**2)
# 2 this pipeline outputs only the combined column; the raw u and v columns are not
#   dropped from the model input as long as they are also listed in col_nan
# 3 scale the data (currently disabled)
wind_pipeline = Pipeline([
    ("col_wind_windspeed_transformer", WindSpeedTransformer())  # Returns only the combined windspeed column
    # , ("col_wind_standard_scaler", StandardScaler())  # Scaling
])

### Columns with a categorical variable but no NaNs
# col_cat
# 1 transform objects into integer codes for tree-based models
# Categories that were not seen during fit are encoded as -1 instead of raising at predict time.
cat_pipeline = Pipeline([
    ("col_cat_OrdinalEncoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

In [None]:
### Construct the preprocessor from the single pipelines, one per column group
preprocessor = ColumnTransformer([
    ("col_zero_transformer", zero_pipeline, col_zero),
    ("col_nan_transformer", nan_pipeline, col_nan),
    ("wind_transformer", wind_pipeline, col_wind),
    ("cat_transformer", cat_pipeline, col_cat)
], remainder="drop")  # Columns not listed in any group above are dropped

### Preprocessing followed by a single decision tree regressor.
### random_state is fixed because the tree permutes features at every split, so ties between
### equally good splits would otherwise be resolved differently from run to run.
pipe_dectree = Pipeline([
    ("preprocessor", preprocessor),
    ("dec_tree_zero", DecisionTreeRegressor(criterion="squared_error", random_state=666))
])

### Fit the pipeline on the training split
pipe_dectree.fit(X_train, y_train)

In [13]:
### Predict data and get errors for the single model
y_train_predict = pipe_dectree.predict(X_train)

y_test_predict = pipe_dectree.predict(X_test)

### R^2 on the training split.
### r2_score expects (y_true, y_pred); the score is not symmetric, so the order matters.
r2_score(y_train, y_train_predict)

### Depth

### Number of leaves


1.0