In [None]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
%%time
water_dataset = pd.read_pickle("../pickled_data/water_dataset.pkl")
water_dataset = water_dataset
training_ratio = 0.6
training_set, test_set = train_test_split(water_dataset[["dist_p", "en_p", "emission"]], train_size=training_ratio)

In [None]:
# Preview the training split (columns dist_p, en_p, emission) as the cell output.
training_set

In [None]:
# Preview the held-out split (columns dist_p, en_p, emission) as the cell output.
test_set

In [None]:
def emission_frequency(df, col="emission"):
    """Return the fraction of rows in ``df`` whose ``col`` value equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``col``.
    col : str, default "emission"
        Name of the column to inspect.

    Returns
    -------
    float
        Count of rows where ``df[col] == 1`` divided by ``len(df)``.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no rows.
    """
    is_hit = df[col] == 1
    # Summing a boolean mask counts its True entries.
    hit_count = int(is_hit.sum())
    return hit_count / len(df)

In [None]:
# Print the share of emission == 1 rows in the training split, then in the
# test split, to see whether the two partitions have similar class balance.
for split in (training_set, test_set):
    print(emission_frequency(split))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

def log_reg_features(df):
    """Build the feature matrix fed to the logistic-regression classifier.

    The returned array is the column-wise concatenation, in this order, of:

    1. the raw ``dist_p`` and ``en_p`` columns,
    2. the degree-3 ``PolynomialFeatures`` expansion of those two columns,
    3. ``exp(-en_p)``,
    4. ``exp(-dist_p)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``dist_p`` and ``en_p``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df``.
    """
    # NOTE(review): with scikit-learn's defaults the polynomial expansion
    # already contains the degree-1 terms, so dist_p / en_p presumably appear
    # twice in the output. The layout is kept as-is because a fitted model
    # depends on the exact column order.
    raw = df[["dist_p", "en_p"]]
    poly_terms = PolynomialFeatures(degree=3).fit_transform(raw)
    exp_neg_en = np.exp(-df[["en_p"]])
    exp_neg_dist = np.exp(-df[["dist_p"]])
    return np.concatenate([raw, poly_terms, exp_neg_en, exp_neg_dist], axis=1)

In [None]:
%%time
clf_logreg = LogisticRegression(class_weight="balanced")
clf_logreg.fit(log_reg_features(training_set), training_set["emission"])

In [None]:
# Seed for the random baseline so its metrics are repeatable across runs.
RANDOM_BASELINE_SEED = 0
baseline_rng = np.random.default_rng(RANDOM_BASELINE_SEED)

# Build a new frame with ``assign`` rather than writing columns into the
# object returned by train_test_split: in-place column assignment on that
# frame may raise pandas' SettingWithCopyWarning, and it silently changes a
# frame that an earlier cell already displayed.
test_set = test_set.assign(
    # Class predicted by the fitted logistic regression for each test row.
    log_prediction=clf_logreg.predict(log_reg_features(test_set)),
    # Baseline: independent Bernoulli draws whose success probability is the
    # emission frequency observed in the training split.
    random_prediction=baseline_rng.binomial(
        1, emission_frequency(training_set), len(test_set)
    ),
)

In [None]:
def print_metrics(col):
    """Print F1, recall and precision for one prediction column of ``test_set``.

    True labels are read from ``test_set["emission"]`` and predictions from
    ``test_set[col]``; ``test_set`` is the notebook-level DataFrame.

    Parameters
    ----------
    col : str
        Name of the column in ``test_set`` that holds the predicted labels.

    Returns
    -------
    None
        The three scores are written to stdout.
    """
    # Import the submodule explicitly. A bare ``import sklearn`` does not by
    # itself guarantee that ``sklearn.metrics`` is reachable as an attribute;
    # that only works when some other import happens to have loaded it.
    from sklearn.metrics import f1_score, precision_score, recall_score

    y_true = test_set["emission"]
    y_pred = test_set[col]
    print(f'F1: {f1_score(y_true, y_pred)}')
    print(f'Recall: {recall_score(y_true, y_pred)}')
    print(f'Precision: {precision_score(y_true, y_pred)}')

In [None]:
# Score the logistic-regression predictions against the true emission labels.
print_metrics("log_prediction")

In [None]:
# Same three metrics for the random baseline, as a reference point.
print_metrics("random_prediction")

In [None]:
# Fraction of test rows that the logistic regression labels as 1.
emission_frequency(test_set, "log_prediction")

In [None]:
# Fraction of test rows that the random baseline labels as 1.
emission_frequency(test_set, "random_prediction")

In [None]:
import pickle

# Serialise the fitted classifier to disk with pickle (the ".sav" suffix is
# only a naming choice; the file content is a pickle stream).
# open() does not create missing directories, so the target folder must exist.
# NOTE: unpickling executes code from the file; only load it from a trusted source.
filename = '../model_parameters/water/emission_prediction.sav'
with open(filename, 'wb') as f:
    pickle.dump(clf_logreg, f)