In [1]:
import numpy as np

# Fixed 4 x 1 column vector of weights; classification_model multiplies its
# input by this vector to obtain the linear score.
vec2 = np.array([1.37239431, -1.16675093, -1.32467119, 6.59925245]).reshape(-1, 1)

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Parameters
    ----------
    x : scalar or array-like of numbers
        Input value(s); arrays are processed element-wise.

    Returns
    -------
    Value(s) in the closed interval [0, 1], one per element of ``x``.
    """
    # exp() is only ever called on non-positive arguments, so it cannot
    # overflow (the naive form emits a RuntimeWarning for large negative x):
    #   x >= 0  ->  1      / (1 + exp(-x))
    #   x <  0  ->  exp(x) / (1 + exp(x))
    numerator = np.exp(np.minimum(x, 0))
    denominator = 1 + np.exp(-np.abs(x))
    return numerator / denominator

def classification_model(row):
    """Classify ``row`` with the fixed weight vector ``vec2``.

    The linear score ``row @ vec2`` is squashed by ``sigmoid``, rounded to the
    nearest integer, and cast to bool. ``row`` must therefore be
    matrix-multipliable with the 4 x 1 ``vec2``.
    """
    probability = sigmoid(row @ vec2)
    return probability.round().astype(bool)

# Typical ML steps:

1. train model
2. test model (evaluate: is it good?)
3. make new predictions

Using the same data for steps 1 and 2 can produce misleading results! It is better to split the data into separate training and testing datasets.

# sklearn Estimators

1. `fit(X, y)`
2. `score(X, y)`
3. `predict(X)` (or `.coef_` and `.intercept_`)

In [2]:
import pandas as pd
# Load the example dataset from df.csv (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv("df.csv")
df.head()

Unnamed: 0,x1,x2,x3,one,y,z
0,8.973478,2.207377,9.061502,1,12.848586,True
1,8.356966,5.851814,1.047791,1,32.750062,True
2,3.70232,3.078078,7.41674,1,2.742693,False
3,8.535779,2.445276,5.936232,1,18.801234,True
4,9.286952,1.654095,3.647136,1,26.139734,True


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression   # regression model
from sklearn.linear_model import LogisticRegression # classification model

In [4]:
# Split the rows 50/50 into training and test sets. `stratify` keeps the
# proportion of True/False labels in z the same in both halves; a fixed
# `random_state` makes the split (and all downstream results) reproducible.
train, test = train_test_split(df, train_size=0.5, stratify=df["z"], random_state=0)
train.head()

Unnamed: 0,x1,x2,x3,one,y,z
13,5.053321,3.542724,4.550394,1,12.500436,True
66,8.277397,1.004268,3.464437,1,23.713679,True
74,6.808463,7.072885,9.215683,1,12.673961,False
23,8.259205,2.572212,8.426069,1,14.774285,True
33,6.912245,8.943233,5.636122,1,21.872859,False


In [5]:
# Label balance of z in the training half (the split was stratified on z).
train["z"].value_counts()

True     30
False    20
Name: z, dtype: int64

In [6]:
# Label balance of z in the test half; it should closely match the training half.
test["z"].value_counts()

True     31
False    19
Name: z, dtype: int64

In [7]:
# 1. train: fit a logistic-regression classifier that predicts z from the
# columns x1 through x3 of the training rows.
lr = LogisticRegression()
lr.fit(X=train.loc[:, "x1":"x3"], y=train["z"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Mean accuracy on the rows the model was fitted on. This is optimistic and
# not a real measure of performance, because the model has already seen them.
lr.score(train.loc[:, "x1":"x3"], train["z"])

0.98

In [9]:
# 2. test: mean accuracy on the held-out rows that were not used for fitting.
lr.score(test.loc[:, "x1":"x3"], test["z"])

0.9

In [10]:
# 3. predict: classify a new observation. The model was fitted on a DataFrame,
# so pass the new point as a DataFrame with the same column names; recent
# scikit-learn versions warn when feature names are missing at predict time.
lr.predict(pd.DataFrame([[100, 200, 10]], columns=["x1", "x2", "x3"]))

array([False])

In [11]:
# Fitted parameters: one coefficient per feature column (x1, x2, x3) and the intercept.
lr.coef_, lr.intercept_

(array([[ 1.42757118, -0.93253474, -0.93349587]]), array([2.94964951]))

# Practice

In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

features = 50
rows = 50
rng = np.random.default_rng(0)  # fixed seed so the generated data is reproducible

# One hidden signal `x`; every feature column is that signal plus independent
# Gaussian noise (standard deviation 5). The label y is whether the signal
# exceeds 5.
x = rng.uniform(0, 10, rows)
df = pd.DataFrame({f"x{i}": x + rng.normal(scale=5, size=x.size) for i in range(features)})
df["y"] = x > 5

# Hold out half of the rows for testing; random_state fixes the shuffle.
train, test = train_test_split(df, test_size=0.5, random_state=0)
train.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x41,x42,x43,x44,x45,x46,x47,x48,x49,y
31,10.315059,10.760968,8.786167,13.193551,9.423619,9.423556,7.093098,10.874292,3.93658,4.476895,...,6.693343,14.187943,8.323768,13.990539,13.082446,1.89123,8.633574,0.752379,14.713517,True
0,1.030302,5.116757,4.00138,-10.386396,11.300361,6.447294,6.067272,-1.675419,5.865653,4.103602,...,-3.468903,3.528641,-1.780201,0.655541,4.664122,2.645642,-7.749456,-1.044071,1.171304,False
36,-0.797863,8.332517,1.233429,6.350662,7.084271,13.789061,9.981648,14.360578,2.249592,3.89907,...,4.68877,7.5907,11.292887,7.750538,7.279855,-4.375554,12.805295,-3.837977,7.571411,True
6,4.721581,2.652721,-5.671712,5.789889,7.594718,5.518626,-1.304606,10.140444,2.043874,10.454265,...,10.861666,2.228563,15.582119,7.633287,3.472359,18.419822,1.010726,7.152993,-1.522687,True
1,0.30477,0.744419,7.281961,2.760371,6.682569,5.762155,-2.027797,10.393485,5.639179,8.879192,...,10.901712,-0.564909,0.252937,4.042727,2.378738,-0.521538,3.366625,1.692641,7.149033,False


In [13]:
# Fit on the training half using every column except the label y, then report
# the score on the rows seen during fitting and on the held-out rows.
lr = LogisticRegression()
lr.fit(train.drop(columns="y"), train["y"])
(
    lr.score(train.drop(columns="y"), train["y"]),
    lr.score(test.drop(columns="y"), test["y"]),
)

(1.0, 0.96)