In [None]:
import os 
import joblib
## Data manipulation
import pandas as pd
import numpy as np

## Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

## Visualization
import plotly 
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# from datalearn19intro import (get_accounts, get_events, get_subscriptions, get_users)
# acc = get_accounts()
# events = get_events()
# subs = get_subscriptions()
# users = get_users()

In [None]:
## todo relative path
# Load the preprocessed dataset from the 'Data' folder under the current
# working directory.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
data = joblib.load(
    os.path.join(
        os.getcwd(),
        'Data',
        'preprocessed_data_500K.jblib',
    )
)

# Split into train and test sets

In [None]:
# Target: the 'lead_score' column. Features: every other column except
# 'lead_score' and 'account_id'.
y = data['lead_score']
X = data.loc[:, ~data.columns.isin(['lead_score', 'account_id'])]

# Smaller data to work with in the workshop: keep only the first
# `max_samples` rows of both the features and the target.
max_samples = 10000
X, y = X[:max_samples], y[:max_samples]

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each split, one per line.
print(
    f'X_train: {X_train.shape}',
    f'y_train: {y_train.shape}',
    f'X_test: {X_test.shape}',
    f'y_test: {y_test.shape}',
    sep='\n',
)

In [None]:
## check if classes ratio is the same in train and test
# Ratio = sum(labels) / sum(1 - labels) for each split.
# Assumes labels are encoded as 0/1 so this is positives / negatives -- TODO confirm.
ratio_train, ratio_test = (
    labels.sum() / (1 - labels).sum()
    for labels in (y_train, y_test)
)
print(ratio_train, ratio_test, sep=' , ')

# Preprocess

In [None]:
# Fit the standardizer on the training split only, then apply the same
# fitted transform to both splits so the test set does not influence the
# scaling statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Show the first ten training rows before and after scaling.
print('X_train[:10]\n', X_train[:10])
print('X_train_scaled[:10]\n', X_train_scaled[:10])

# Logistic Regression

In [None]:
# Alternative that re-weights classes (kept for reference):
# clf = LogisticRegression(multi_class='ovr', solver='sag', max_iter=10000, class_weight='balanced').fit(X_train_scaled, y_train)

# Build the classifier, then fit it on the standardized training features.
# NOTE: no random_state is passed, so the stochastic 'sag' solver may give
# slightly different coefficients from run to run.
clf = LogisticRegression(
    multi_class='ovr',
    solver='sag',
    max_iter=10000,
)
clf.fit(X_train_scaled, y_train)


In [None]:
# predict probabilities:
# clf was fit on the standardized training features (X_train_scaled), so the
# test set must go through the same scaler before scoring. Passing the raw
# X_test would feed the model inputs on a different scale than it was trained on.
proba = clf.predict_proba(X_test_scaled)
# Preview the predicted probabilities for the first ten test samples.
proba[:10]

In [None]:
# predict decision:
# clf was fit on the standardized training features (X_train_scaled), so
# predictions must be made on the standardized test set, not the raw X_test.
pred = clf.predict(X_test_scaled)
# Compare the first ten predicted labels with the first ten true labels.
print('pred\n======\n', pred[:10])
print('y_test\n======\n', y_test[:10])

## Metrics

In [None]:
# Report F1, recall and precision of the predictions against the true test
# labels, one metric per line. average='binary' scores the positive class only.
print(
    *(
        ' '.join((label, str(metric(y_test, pred, average='binary'))))
        for label, metric in (
            ('f1_score:', f1_score),
            ('recall:', recall_score),
            ('precision:', precision_score),
        )
    ),
    sep='\n',
)

In [None]:
## Feature Selection

In [2]:
# We want to enforce a sparse weight vector.
# An L1 penalty drives some coefficients to exactly zero; the features whose
# coefficients remain non-zero are treated as "selected".
clf_sparse = LogisticRegression(multi_class='ovr', solver='saga', penalty='l1', max_iter=10000).fit(X_train_scaled, y_train)

# np.nonzero on the 2-D coef_ array returns (row_indices, column_indices);
# the column indices identify the features with a non-zero weight.
nonzero_feats = np.nonzero(clf_sparse.coef_)
feature_names = list(X_train.columns)

# Use a set of plain ints for O(1) membership tests instead of scanning the
# NumPy array once per feature.
nonzero_cols = set(nonzero_feats[1].tolist())
selected = [f for i, f in enumerate(feature_names) if i in nonzero_cols]
not_selected = [f for i, f in enumerate(feature_names) if i not in nonzero_cols]

# The argument order must match the message: selected count first, then not-selected.
print('Feature Selection yielded %d selected features and %d not-selected features.' % (len(selected), len(not_selected)))

print('Selected features are:\n')
print(', '.join(selected))

NameError: name 'LogisticRegression' is not defined

# NN