In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import helpers.processing_helpers as ph
from sklearn.svm import LinearSVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Load the development set from the CSV that sits next to this notebook.
df_dev = pd.read_csv(filepath_or_buffer="./development.csv")

In [None]:
# One boolean per column: True when that column holds at least one missing value.
nan_stats = df_dev.isnull().any()

# Number of columns containing a NaN; the original author's note says there are no NaN values.
print(nan_stats.sum())

In [None]:
# Shuffle every row of the development set (frac=1 keeps all rows, without replacement).
# The seed pins the resulting row order: the train/validation splits below select rows by
# position, so an unseeded shuffle here would change every split and metric on each restart.
subset = df_dev.sample(frac=1, random_state=42)

In [None]:
# Targets: the two columns 'x' and 'y', copied so they are independent of `subset`.
y_train_valid = subset.loc[:, ['x', 'y']].copy()

# Features: every remaining column.
X_train_valid = subset.drop(['x', 'y'], axis="columns")

In [None]:
# Seeded hold-out split; no test_size is given, so scikit-learn's default proportion applies.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    random_state=42,
)

In [None]:
# Baseline forest on the full feature set: 100 trees, fixed seed.
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Validation score of the baseline forest via the project helper
# (by its name, a mean Euclidean distance — confirm in helpers.processing_helpers).
med_with_noise = ph.mean_euclid_dist(
    y_valid,
    reg.predict(X_valid),
)
print(med_with_noise)

In [None]:
# (column, importance) pairs of the fitted forest, most important first.
sorted(
    zip(X_train_valid.columns, reg.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

In [None]:
# Indices and feature prefixes whose columns are treated as noise and dropped.
noise_indexes = [0, 7, 12, 15, 16, 17]
features = ["pmax", "negpmax", "area", "tmax", "rms"]

# ph.get_column_names presumably expands (prefix, index) pairs into column labels —
# confirm in helpers.processing_helpers.
noise_removed = subset.drop(
    columns=ph.get_column_names(features, noise_indexes),
)

In [None]:
# Rebuild targets and features from the frame with the noise columns removed.
y_train_valid = noise_removed[['x', 'y']].copy()

X_train_valid = noise_removed.drop(columns=['x', 'y'])

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True, random_state=42)

# Append a pure-noise "RANDOM" column to both splits; the importance ranking below can then
# be read against a feature that is noise by construction.
# - .assign returns a new frame, so nothing is written into the frames handed back by
#   train_test_split (in-place column assignment there can raise SettingWithCopyWarning).
# - A single seeded generator feeds both splits: the training column is the same sequence a
#   fresh RandomState(42) yields, and the validation column continues that stream instead of
#   repeating the first values of the training column.
noise_rng = np.random.RandomState(42)
X_train = X_train.assign(RANDOM=noise_rng.randn(X_train.shape[0]))
X_valid = X_valid.assign(RANDOM=noise_rng.randn(X_valid.shape[0]))

reg = RandomForestRegressor(100, random_state=42, verbose=1)
reg.fit(X_train, y_train)

In [None]:
# Validation score of the forest refit without the noise columns
# (ph.mean_euclid_dist is a project helper — by its name a mean Euclidean distance; confirm).
med_without_noise = ph.mean_euclid_dist(
    y_valid,
    reg.predict(X_valid),
)
print(med_without_noise)

In [None]:
# (column, importance) pairs for the columns the forest was fit on, most important first.
sorted(
    zip(X_train.columns, reg.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

In [None]:
# MLP with logistic activations, a small initial learning rate and a long patience window.
# NOTE(review): X_train is whatever the preceding split cell left in scope; in this notebook
# that frame still carries the "RANDOM" probe column — confirm that is intended for the MLP.
mlp = MLPRegressor(
    activation="logistic",
    learning_rate_init=0.0001,
    max_iter=1500,
    n_iter_no_change=500,
    random_state=42,
    verbose=1,
)
mlp.fit(X_train, y_train)

In [None]:
# Validation score of the MLP, computed with the same project helper used for the forests.
mlp_med = ph.mean_euclid_dist(
    y_valid,
    mlp.predict(X_valid),
)
print(mlp_med)

In [None]:
# The indices in 0..17 that are NOT listed in noise_indexes ([0, 7, 12, 15, 16, 17]).
acc_idxs = [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14]

# Additionally drop the 'tmax' columns for those indices.
tmax_removed = noise_removed.drop(
    columns=ph.get_column_names(['tmax'], acc_idxs),
)

In [None]:
# Rebuild targets and features from the frame with the 'tmax' columns removed as well.
y_train_valid = tmax_removed[['x', 'y']].copy()

X_train_valid = tmax_removed.drop(columns=['x', 'y'])

X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, shuffle=True, random_state=42)

# Random-feature baseline for the built-in importances, see:
# https://forecastegy.com/posts/feature-importance-in-random-forests/#built-in-scikit-learn-method-with-a-random-feature
# - .assign returns a new frame, so nothing is written into the frames handed back by
#   train_test_split (in-place column assignment there can raise SettingWithCopyWarning).
# - A single seeded generator feeds both splits: the training column is the same sequence a
#   fresh RandomState(42) yields, and the validation column continues that stream instead of
#   repeating the first values of the training column.
noise_rng = np.random.RandomState(42)
X_train = X_train.assign(RANDOM=noise_rng.randn(X_train.shape[0]))
X_valid = X_valid.assign(RANDOM=noise_rng.randn(X_valid.shape[0]))

reg = RandomForestRegressor(100, random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Validation score of the forest fit without the noise and 'tmax' columns.
# Note: this rebinds med_without_noise, replacing the value from the earlier experiment.
med_without_noise = ph.mean_euclid_dist(
    y_valid,
    reg.predict(X_valid),
)
print(med_without_noise)

In [None]:
# (column, importance) pairs for the columns the forest was fit on, most important first.
sorted(
    zip(X_train.columns, reg.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

In [None]:
# From the noise-free frame, drop the 'tmax' and 'rms' columns of the retained indices;
# the variable name suggests only pmax / negpmax / area features remain (plus the targets).
pmax_negpmax_area = noise_removed.drop(
    columns=ph.get_column_names(['tmax', 'rms'], acc_idxs),
)

In [None]:
# Targets ('x', 'y') and features (everything else) from the reduced frame.
y_train_valid = pmax_negpmax_area.loc[:, ['x', 'y']].copy()

X_train_valid = pmax_negpmax_area.drop(['x', 'y'], axis="columns")

# Same seeded hold-out split as in the earlier experiments.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    random_state=42,
)

In [None]:
# Forest on the reduced feature set: 100 trees, fixed seed.
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Validation score of the forest fit on the reduced feature set.
# Note: this rebinds med_without_noise, replacing the value from the previous experiment.
med_without_noise = ph.mean_euclid_dist(
    y_valid,
    reg.predict(X_valid),
)
print(med_without_noise)

In [None]:
# (column, importance) pairs of the fitted forest, most important first.
sorted(
    zip(X_train_valid.columns, reg.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

In [None]:
# Targets come from the same reduced frame as before.
y_train_valid = pmax_negpmax_area.loc[:, ['x', 'y']].copy()

# Same index values as acc_idxs, kept as a tuple local to this cell.
area_idx = (1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14)

# Features: remove the targets, then the 'area' columns for those indices.
X_train_valid = (
    pmax_negpmax_area
    .drop(columns=['x', 'y'])
    .drop(columns=ph.get_column_names(['area'], area_idx))
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    shuffle=True,
    random_state=42,
)

In [None]:
# Same MLP configuration as the earlier run, but with tanh activations.
mlp = MLPRegressor(
    activation="tanh",
    learning_rate_init=0.0001,
    max_iter=1500,
    n_iter_no_change=500,
    random_state=42,
    verbose=1,
)
mlp.fit(X_train, y_train)

In [None]:
# Validation score of the tanh MLP; rebinds mlp_med from the earlier MLP run.
mlp_med = ph.mean_euclid_dist(
    y_valid,
    mlp.predict(X_valid),
)
print(mlp_med)