In [14]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.impute import KNNImputer

In [15]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

In [16]:
train_fts = train_set.columns

imputer_train = KNNImputer()
imputer_train.fit(train_set)
train_set = pd.DataFrame(imputer_train.transform(train_set))

train_set.columns = train_fts

In [17]:
X_test_set = test_set.drop(columns=['Target', 'Opportunity_ID'])
y_test_set = test_set.Target

X_test_fts = X_test_set.columns

imputer_test = KNNImputer()
imputer_test.fit(X_test_set)
X_test_set = pd.DataFrame(imputer_test.transform(X_test_set))

X_test_set.columns=X_test_fts

In [18]:
X = train_set.drop(columns=['Opportunity_ID', 'Target'])
y = train_set.Target

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 100)

In [20]:
model = LogisticRegression()

random_search = RandomizedSearchCV(model, space, n_iter=100, \
                                   scoring='neg_log_loss', n_jobs=-1, cv=5, random_state=1, verbose=3)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 286 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.4min finished


RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x120cadf40>,
                                        'penalty': ['none', 'l1', 'l2',
                                                    'elasticnet'],
                                        'solver': ['newton-cg', 'lbfgs',
                                                   'liblinear']},
                   random_state=1, scoring='neg_log_loss', verbose=3)

In [21]:
random_search.best_params_

{'C': 2.976522963488366, 'penalty': 'l1', 'solver': 'liblinear'}

In [35]:
lr = LogisticRegression(**random_search.best_params_, max_iter=1000)

In [36]:
lr.fit(X_train, y_train)

LogisticRegression(C=2.976522963488366, max_iter=1000, penalty='l1',
                   solver='liblinear')

In [37]:
log_loss(y_test, lr.predict_proba(X_test))

0.3625415339921851

In [38]:
pp = lr.predict_proba(X_test_set)
pp

array([[0.43715523, 0.56284477],
       [0.52668631, 0.47331369],
       [0.47606389, 0.52393611],
       ...,
       [0.54651793, 0.45348207],
       [0.34248837, 0.65751163],
       [0.91789348, 0.08210652]])

In [39]:
log_loss(y_test_set, pp)

0.5385597713684864

In [26]:
model.score(test.drop(columns=['Target', 'Opportunity_ID', 'Prediction']), test['Target'])