In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
%%time
water_dataset = pd.read_pickle("../pickled_data/water_dataset.pkl")
water_dataset = water_dataset
training_ratio = 0.6
training_set, test_set = train_test_split(water_dataset[["dist_p", "en_p", "emission"]], train_size=training_ratio)

CPU times: user 1.19 s, sys: 866 ms, total: 2.05 s
Wall time: 2.09 s


In [3]:
training_set

Unnamed: 0,dist_p,en_p,emission
79347,7.89,1.1,1
46413,295.00,5.5,1
58541,281.00,3.5,1
76116,619.00,2.6,1
18417,18.80,3.5,1
...,...,...,...
70302,144.00,2.2,1
63661,106.00,4.7,1
1503,90.70,3.6,1
31032,1000.00,4.9,0


In [4]:
test_set

Unnamed: 0,dist_p,en_p,emission
88477,372.0,0.9,1
89356,141.0,6.0,1
10745,11.1,1.7,1
80395,94.1,3.2,1
15603,156.0,1.7,1
...,...,...,...
31317,253.0,2.0,1
44700,84.9,0.7,1
68908,291.0,5.2,1
58871,17.2,2.0,1


In [5]:
def emission_frequency(df, col="emission"):
    return len(df[df[col] == 1])/len(df)

In [6]:
print(emission_frequency(training_set))
print(emission_frequency(test_set))

0.9980621973783071
0.9980518181395481


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

def log_reg_features(df):
    polyfeat = PolynomialFeatures(degree=4)
    return np.concatenate([df[["dist_p", "en_p"]], polyfeat.fit_transform(df[["dist_p", "en_p"]]), df[["en_p"]].apply(lambda x: np.exp(-x)), df[["dist_p"]].apply(lambda x: np.exp(-x))], axis=1)

In [8]:
%%time
clf_logreg = LogisticRegression(class_weight="balanced")
clf_logreg.fit(log_reg_features(training_set), training_set["emission"])

CPU times: user 26 s, sys: 15.1 s, total: 41.1 s
Wall time: 7.07 s


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
test_set["log_prediction"] = clf_logreg.predict(log_reg_features(test_set))
test_set["random_prediction"] = np.random.binomial(1, emission_frequency(training_set), len(test_set))

In [10]:
def print_metrics(col):
    print(f'F1: {sklearn.metrics.f1_score(test_set["emission"], test_set[col])}')
    print(f'Recall: {sklearn.metrics.recall_score(test_set["emission"], test_set[col])}')
    print(f'Precision: {sklearn.metrics.precision_score(test_set["emission"], test_set[col])}')

In [11]:
print_metrics("log_prediction")

F1: 0.0
Recall: 0.0
Precision: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
print_metrics("random_prediction")

F1: 0.9980792680190432
Recall: 0.9981061431740373
Precision: 0.9980523943112989


In [13]:
emission_frequency(test_set, "log_prediction")

0.0

In [14]:
emission_frequency(test_set, "random_prediction")

0.9981055669712576

In [16]:
import pickle

filename = '../model_parameters/water/emission_prediction.sav'
with open(filename, 'wb') as f:
    pickle.dump(clf_logreg, f)