In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
%%time
# NOTE(review): unpickling can execute arbitrary code; only load pickles
# produced by this project.
lung_dataset = pd.read_pickle("../pickled_data/lung_dataset.pkl")
training_ratio = 0.6
split_seed = 42  # fixed so the train/test split is identical on every run
split_columns = ["dist_p", "en_p", "emission"]

# Split by row position rather than by index label: dropping the sampled
# labels would also remove every other row that shares one of them, so a
# non-unique index would silently shrink the test set.
# NOTE(review): confirm whether lung_dataset's index is unique.
shuffled_rows = np.random.default_rng(split_seed).permutation(len(lung_dataset))
n_training = int(round(training_ratio * len(lung_dataset)))
training_set = lung_dataset.iloc[shuffled_rows[:n_training]][split_columns]
# Test rows are kept in their original order, as a label-based drop would leave them.
test_set = lung_dataset.iloc[np.sort(shuffled_rows[n_training:])][split_columns]

# The two splits must partition the dataset exactly.
assert len(training_set) + len(test_set) == len(lung_dataset)

CPU times: total: 3min 25s
Wall time: 3min 26s


In [3]:
# Preview the training split (columns dist_p, en_p, emission).
training_set

Unnamed: 0,dist_p,en_p,emission
348699,168.0,1.0,1
658821,1000.0,0.5,0
598193,354.0,1.0,1
74668,695.0,10.0,1
481001,735.0,0.2,1
...,...,...,...
891513,452.0,0.1,1
578526,927.0,0.5,1
531790,733.0,0.1,1
403575,139.0,0.2,1


In [4]:
# Preview the held-out test split (same columns as the training split).
test_set

Unnamed: 0,dist_p,en_p,emission
3779,309.0,0.1,1
10648,585.0,0.1,1
21331,274.0,0.1,1
23471,248.0,0.1,1
23751,587.0,0.1,1
...,...,...,...
178402,729.0,20.0,1
179570,905.0,20.0,1
181474,786.0,20.0,1
183324,6.3,20.0,1


In [5]:
def emission_frequency(df, col="emission"):
    """Return the fraction of rows in ``df`` whose ``col`` value equals 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``col``.
    col : str, default "emission"
        Name of the column to inspect.

    Returns
    -------
    float
        Share of rows with ``df[col] == 1``. ``nan`` is returned for a frame
        with no rows, where the frequency is undefined.
    """
    if len(df) == 0:
        return float("nan")
    # Sum the boolean mask instead of materialising a filtered copy of the frame.
    return int((df[col] == 1).sum()) / len(df)

In [6]:
# Compare the share of emission == 1 rows in the two splits (training first, then test).
for split in (training_set, test_set):
    print(emission_frequency(split))

0.9526090396180347
0.9542344239133446


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

def log_reg_features(df):
    """Build the feature matrix fed to the logistic regression.

    Columns, in order: a constant 1, the raw ``dist_p`` and ``en_p`` values,
    the degree-4 ``PolynomialFeatures`` expansion of (``dist_p``, ``en_p``),
    ``exp(-en_p)`` and ``exp(-dist_p)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``dist_p`` and ``en_p`` columns.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df``.
    """
    raw_inputs = df[["dist_p", "en_p"]]
    constant_col = np.ones((len(df), 1))
    # NOTE(review): with its default settings PolynomialFeatures already emits a
    # bias column and the degree-1 terms, so the three leading columns appear
    # to be duplicated in the expansion - confirm this is intended.
    poly_terms = PolynomialFeatures(degree=4).fit_transform(raw_inputs)
    exp_neg_en_p = np.exp(-df[["en_p"]])
    exp_neg_dist_p = np.exp(-df[["dist_p"]])
    return np.concatenate(
        [constant_col, raw_inputs, poly_terms, exp_neg_en_p, exp_neg_dist_p],
        axis=1,
    )

In [44]:
%%time
# Fit a class-balanced logistic regression on the engineered training features.
# NOTE(review): the features are not standardised and the solver settings are
# the defaults - check the fit for convergence warnings.
clf_logreg = LogisticRegression(class_weight="balanced")
clf_logreg.fit(
    X=log_reg_features(training_set),
    y=training_set["emission"],
)

CPU times: total: 1min 10s
Wall time: 25.8 s


In [45]:
# Model predictions for every test row.
test_set["log_prediction"] = clf_logreg.predict(log_reg_features(test_set))

# Baseline: guess 1 with the emission frequency observed in the training set.
# A seeded generator keeps the baseline (and its metrics) reproducible.
baseline_rng = np.random.default_rng(42)
test_set["random_prediction"] = baseline_rng.binomial(
    1, emission_frequency(training_set), len(test_set)
)

In [46]:
def print_metrics(col, df=None):
    """Print F1, recall and precision of a prediction column against ``emission``.

    Parameters
    ----------
    col : str
        Name of the column holding the predicted labels to score.
    df : pandas.DataFrame, optional
        Frame containing both the ``emission`` column (true labels) and
        ``col``. Defaults to the notebook-level ``test_set``.
    """
    # Import the metrics explicitly: a bare `import sklearn` does not by itself
    # guarantee that the `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import f1_score, precision_score, recall_score

    if df is None:
        df = test_set
    y_true = df["emission"]
    y_pred = df[col]
    print(f'F1: {f1_score(y_true, y_pred)}')
    print(f'Recall: {recall_score(y_true, y_pred)}')
    print(f'Precision: {precision_score(y_true, y_pred)}')

In [47]:
# Score the logistic-regression predictions against the true test labels.
print_metrics("log_prediction")

F1: 0.9755720740416849
Recall: 0.9716069095659747
Precision: 0.9795697351090297


In [25]:
# Score the random baseline predictions for comparison with the model.
print_metrics("random_prediction")

F1: 0.9533218078537911
Recall: 0.9525330236609086
Precision: 0.9541118994998256


In [48]:
# Share of test rows that the logistic regression labels as 1.
emission_frequency(test_set, "log_prediction")

0.9464775466105216

In [26]:
# Share of test rows that the random baseline labels as 1.
emission_frequency(test_set, "random_prediction")

0.9526553453195556