In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Read the fruit measurements from the text file into a DataFrame.
fruits = pd.read_table('fruit_data_with_colors.txt')
target_names = ['apple', 'mandarin', 'orange', 'lemon']

# Two-feature view of the data (height and width) plus the class labels.
X_fruits_2d = fruits.loc[:, ['height', 'width']]
y_fruits_2d = fruits.loc[:, 'fruit_label']

# Collapse the labels into a binary target: True where the label is 1
# (apple), False for every other fruit.
y_fruits_apple = y_fruits_2d.eq(1)

# Split into train/test subsets as plain NumPy arrays; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_fruits_2d.to_numpy(),
    y_fruits_apple.to_numpy(),
    random_state=0,
)

In [None]:
# Logistic regression is similar to linear regression, but instead of producing a
# continuous value it estimates how likely a sample is to belong to a certain class.
# Before the final value is returned, the linear combination of the features is passed
# through the logistic (sigmoid) function, which squashes it into the range (0, 1).
# In this example, an estimate closer to 1 indicates an apple, while an estimate
# closer to 0 indicates a different kind of fruit.

# C is the inverse of the (L2) regularization strength: a larger C means weaker
# regularization.
clf = LogisticRegression(C=100).fit(X_train, y_train)

# Example fruit to classify; the feature order must match the training columns
# (height first, then width).
h = 6
w = 8

# The training target is boolean, so predict() yields a NumPy bool. Cast it to int
# before using it as a list index (indexing with np.bool_ is deprecated in NumPy).
predicted_is_apple = int(clf.predict([[h, w]])[0])
print("A fruit with height {} and width {} is predicted to be: {}".format(
    h,
    w,
    ['not an apple', 'an apple'][predicted_is_apple]
))

print("accuracy of logistic regression (training): {}".format(clf.score(X_train, y_train)))
print("accuracy of logistic regression (test): {}".format(clf.score(X_test, y_test)))