# Library human-learn
https://towardsdatascience.com/human-learn-rule-based-learning-as-an-alternative-to-machine-learning-baf1899ecb3a
human-learn is a Python package to create rule-based systems that are easy to construct and are compatible with scikit-learn.

You are given a labeled dataset and assigned to predict a new one. What would you do?

The first approach that you probably try is to train a machine learning model to find rules for labeling new data.
This is convenient, but it is challenging to know why the machine learning model comes up with a particular prediction. You also can’t incorporate your domain knowledge into the model.

Instead of depending on a machine learning model to make predictions, is there a way to set the rules for data labeling based on your knowledge

In [2]:
#pip install human-learn

In [6]:
#libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#data
dLink = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00357/'
https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip

In [7]:
# Get train and test data
train = pd.read_csv("E:/analytics/data/occupancy_data/datatraining.txt").drop(columns="date")
test = pd.read_csv("E:/analytics/data/occupancy_data/datatest.txt").drop(columns="date")
print(train.shape)
print(train.shape)

(8143, 6)
(8143, 6)


In [8]:
# Get X and y
target = "Occupancy"
train_X, train_y = train.drop(columns=target), train[target]
val_X, val_y = test.drop(columns=target), test[target]

In [9]:
train.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,23.18,27.272,426.0,721.25,0.004793,1
2,23.15,27.2675,429.5,714.0,0.004783,1
3,23.15,27.245,426.0,713.5,0.004779,1
4,23.15,27.2,426.0,708.25,0.004772,1
5,23.1,27.2,426.0,704.5,0.004757,1


In [10]:
# Train
forest_model = RandomForestClassifier(random_state=1)

# Preduct
forest_model.fit(train_X, train_y)
machine_preds = forest_model.predict(val_X)

# Evalute
print(classification_report(val_y, machine_preds))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1693
           1       0.95      0.92      0.93       972

    accuracy                           0.95      2665
   macro avg       0.95      0.95      0.95      2665
weighted avg       0.95      0.95      0.95      2665



In [11]:
#

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
feature = "Light"
px.box(data_frame=train, x=target, y=feature)

In [15]:
#

In [16]:
import numpy as np
from hulearn.classification import FunctionClassifier


def create_rule(data: pd.DataFrame, col: str, threshold: float=100):
    return np.array(data[col] > threshold).astype(int)


mod = FunctionClassifier(create_rule, col='Light')

In [17]:
mod.fit(train_X, train_y)
preds = mod.predict(val_X)
print(classification_report(val_y, preds))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98      1693
           1       0.93      1.00      0.96       972

    accuracy                           0.97      2665
   macro avg       0.96      0.98      0.97      2665
weighted avg       0.97      0.97      0.97      2665



# Improve Rules

In [18]:
from hulearn.experimental.interactive import parallel_coordinates

parallel_coordinates(train, label=target, height=200)

In [19]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(mod, cv=2, param_grid={"threshold": np.linspace(250, 750, 1000)})
grid.fit(train_X, train_y)

GridSearchCV(cv=2,
             estimator=FunctionClassifier(col='Light',
                                          func=<function create_rule at 0x000002181C9751F0>),
             param_grid={'threshold': array([250.        , 250.5005005 , 251.001001  , 251.5015015 ,
       252.002002  , 252.5025025 , 253.003003  , 253.5035035 ,
       254.004004  , 254.5045045 , 255.00500501, 255.50550551,
       256.00600601, 256.50650651, 257.00700701, 257.50750751,
       258.00800801, 258.50850851...
       734.48448448, 734.98498498, 735.48548549, 735.98598599,
       736.48648649, 736.98698699, 737.48748749, 737.98798799,
       738.48848849, 738.98898899, 739.48948949, 739.98998999,
       740.49049049, 740.99099099, 741.49149149, 741.99199199,
       742.49249249, 742.99299299, 743.49349349, 743.99399399,
       744.49449449, 744.99499499, 745.4954955 , 745.995996  ,
       746.4964965 , 746.996997  , 747.4974975 , 747.997998  ,
       748.4984985 , 748.998999  , 749.4994995 , 750.        ])}

In [20]:
best_threshold = grid.best_params_["threshold"]
best_threshold

364.61461461461465

In [21]:
human_preds = grid.predict(val_X)
print(classification_report(val_y, human_preds))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1693
           1       0.95      1.00      0.97       972

    accuracy                           0.98      2665
   macro avg       0.97      0.98      0.98      2665
weighted avg       0.98      0.98      0.98      2665



In [None]:
#

In [22]:
from hulearn.datasets import load_titanic

df = load_titanic(as_frame=True)
df.head()

Unnamed: 0,survived,pclass,name,sex,age,fare,sibsp,parch
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,1,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,1,0
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,0,0
