In [16]:
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.model_selection import train_test_split

In [17]:
%%time
# Load the pre-processed water dataset. Note: unpickling can execute arbitrary
# code, so only load pickle files that come from a trusted source.
water_dataset = pd.read_pickle("../pickled_data/water_dataset.pkl")

# Fraction of rows that goes into the training split; the rest is the test split.
training_ratio = 0.6
# Fixed seed so the train/test partition (and every number derived from it)
# is the same after a kernel restart.
split_seed = 42

# Only the two input columns and the label are carried into the splits.
training_set, test_set = train_test_split(
    water_dataset[["dist_p", "en_p", "emission"]],
    train_size=training_ratio,
    random_state=split_seed,
)

CPU times: user 642 ms, sys: 892 ms, total: 1.53 s
Wall time: 1.54 s


In [18]:
# Inspect the training split produced by train_test_split.
training_set

Unnamed: 0,dist_p,en_p,emission
313,397.0,2.2,1
23656,50.2,2.6,1
57047,99.9,3.3,1
12827,242.0,5.2,1
58741,120.0,1.5,1
...,...,...,...
29917,129.0,1.1,1
22710,540.0,2.5,1
61178,58.0,3.5,1
70268,454.0,5.0,1


In [19]:
# Inspect the held-out test split produced by train_test_split.
test_set

Unnamed: 0,dist_p,en_p,emission
58498,595.0,5.6,1
22745,14.9,2.7,1
2738,185.0,1.8,1
30969,43.6,0.1,1
10892,336.0,4.1,1
...,...,...,...
5511,96.4,3.1,1
57960,97.1,0.2,1
18020,48.8,1.0,1
58876,237.0,2.5,1


In [20]:
def emission_frequency(df, col="emission"):
    """Return the fraction of rows in ``df`` whose ``col`` value equals 1.

    Parameters
    ----------
    df : DataFrame-like object supporting ``df[col]``, ``df.loc[mask]`` and ``len``.
    col : name of the column to inspect (defaults to ``"emission"``).

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no rows.
    """
    is_positive = df[col] == 1
    n_positive = len(df.loc[is_positive])
    n_total = len(df)
    return n_positive / n_total

In [21]:
# Compare the share of label-1 rows in the training split and then the test split.
for split in (training_set, test_set):
    print(emission_frequency(split))

0.9980394465403654
0.9980859443819033


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

def log_reg_features(df):
    """Build the feature matrix used by the logistic-regression classifier.

    Columns, in order:
      1. the raw ``dist_p`` and ``en_p`` columns,
      2. the degree-3 ``PolynomialFeatures`` expansion of those two columns,
      3. ``exp(-en_p)``,
      4. ``exp(-dist_p)``.

    Returns the horizontally concatenated result as a 2-D NumPy array with
    one row per row of ``df``.

    NOTE(review): the polynomial expansion presumably already contains the
    linear terms, so the raw columns look duplicated. The layout is kept
    as-is because the classifier in this notebook is fit on exactly this
    column order.
    """
    raw_inputs = df[["dist_p", "en_p"]]
    poly_terms = PolynomialFeatures(degree=3).fit_transform(raw_inputs)
    decay_en = np.exp(-df[["en_p"]])
    decay_dist = np.exp(-df[["dist_p"]])
    return np.concatenate([raw_inputs, poly_terms, decay_en, decay_dist], axis=1)

In [23]:
%%time
# class_weight="balanced" is passed so the classifier reweights the classes
# rather than treating every row equally.
clf_logreg = LogisticRegression(class_weight="balanced")
clf_logreg.fit(
    X=log_reg_features(training_set),
    y=training_set["emission"],
)

CPU times: user 1min 16s, sys: 1min 2s, total: 2min 18s
Wall time: 11.8 s


In [24]:
# Class predictions of the fitted logistic model on the held-out rows.
test_set["log_prediction"] = clf_logreg.predict(log_reg_features(test_set))

# Baseline: predict 1 at random with the label-1 frequency observed in the
# training split. A dedicated, seeded generator keeps the baseline (and the
# metrics computed from it) identical across re-runs.
baseline_rng = np.random.default_rng(0)
test_set["random_prediction"] = baseline_rng.binomial(
    1, emission_frequency(training_set), len(test_set)
)

In [25]:
def print_metrics(col, df=None, target="emission"):
    """Print F1, recall and precision of a prediction column against the labels.

    Parameters
    ----------
    col : name of the column holding the predictions to score.
    df : frame containing both ``target`` and ``col``. When omitted, the
        notebook-level ``test_set`` is used, which matches the original
        single-argument behaviour.
    target : name of the column holding the true labels.

    The scores are the scikit-learn defaults for ``f1_score``,
    ``recall_score`` and ``precision_score``.
    """
    if df is None:
        # Resolved at call time so the function always sees the current test split.
        df = test_set
    y_true = df[target]
    y_pred = df[col]
    print(f'F1: {sklearn.metrics.f1_score(y_true, y_pred)}')
    print(f'Recall: {sklearn.metrics.recall_score(y_true, y_pred)}')
    print(f'Precision: {sklearn.metrics.precision_score(y_true, y_pred)}')

In [26]:
# Score the logistic-regression predictions on the test split.
print_metrics("log_prediction")

F1: 0.9930413501779587
Recall: 0.9873080457313815
Precision: 0.9988416302270949


In [27]:
# Score the random baseline on the test split, for comparison with the model.
print_metrics("random_prediction")

F1: 0.9980704919038234
Recall: 0.998055775189657
Precision: 0.9980852090520033


In [28]:
# Fraction of test rows for which the logistic model predicts 1.
emission_frequency(test_set, "log_prediction")

0.9865610857605268

In [29]:
# Fraction of test rows for which the random baseline predicts 1.
emission_frequency(test_set, "random_prediction")

0.998056510497872

In [30]:
import pickle

# Destination of the serialized classifier.
filename = '../model_parameters/water/emission_prediction.sav'

# Create the target directory if needed, so the save does not fail with
# FileNotFoundError on a fresh checkout after the expensive fit has run.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted estimator with pickle; the file must later be loaded
# only from a trusted location, since unpickling can execute arbitrary code.
with open(filename, 'wb') as f:
    pickle.dump(clf_logreg, f)