# HyperFrame

The aim of this project is to provide a high-dimensional analogue to the two-dimensional pandas DataFrame.

It lets users organise data in which the interaction of several factors is of interest.

The HyperFrame allows for the easy setting and saving of data for storage, and the fast, interactive creation of two-dimensional pandas DataFrames of any combination of two factors for data exploration.

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from hyperframe import HyperFrame
from sklearn.model_selection import train_test_split
from demo.helpers import metrics, X, y

In [2]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [3]:
# Fit a k-nearest-neighbours classifier (default settings) on the training split.
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# Initialisation

In [4]:
# Names of the HyperFrame's dimensions, in axis order.
dimension_labels = ["train_test", "species", "metric"]

# For every dimension, the labels of the positions along it.
index_labels = dict(
    train_test=["train", "test"],
    species=["setosa", "versicolor", "virginica"],
    metric=["precision", "recall", "f1"],
)

# Empty 2 x 3 x 3 container that the following cells fill with scores.
scores = HyperFrame(dimension_labels, index_labels)

# Setting data

In [5]:
# Score the classifier on the data it was trained on.
yhat = clf.predict(X_train)
# iset alternative 1: pass one positional label per dimension. Judging by the
# iget output further down, "" leaves that dimension unrestricted, so this
# fills the whole "train" slice (species x metric) -- assumes metrics()
# returns a matching 3 x 3 table; confirm against demo.helpers.
scores.iset(metrics(y_train, yhat), "train", "", "")

<hyperframe.HyperFrame at 0x7f38c19d9eb8>

In [6]:
# Score the same classifier on the held-out test split.
yhat = clf.predict(X_test)
# iset alternative 2: name only the dimension being fixed, by keyword; the
# dimensions not mentioned are presumably left unrestricted, as with "".
scores.iset(metrics(y_test, yhat), train_test="test")

<hyperframe.HyperFrame at 0x7f38c19d9eb8>

# Getting data

In [7]:
# iget alternative 1: positional labels. Fixing train_test to "train" and
# leaving the other two dimensions as "" yields a species x metric DataFrame.
scores.iget("train", "", "", return_type="pandas").round(2)

Unnamed: 0,precision,recall,f1
setosa,0.97,0.9,0.93
versicolor,0.74,0.91,0.82
virginica,0.93,0.76,0.84


In [8]:
# iget alternative 2: fix a single dimension by keyword; the two remaining
# dimensions (train_test x metric) form the resulting DataFrame.
scores.iget(species="versicolor", return_type="pandas").round(2)

Unnamed: 0,precision,recall,f1
train,0.74,0.91,0.82
test,0.56,0.67,0.61


In [9]:
# iget alternative 3: name the two dimensions to tabulate instead of fixing
# labels. The printed dict suggests every other dimension is pinned to its
# first label (here metric -> "precision") -- confirm against the iget0 docs.
scores.iget0("species", "train_test", return_type="pandas").round(2)

{'metric': 'precision'}


Unnamed: 0,setosa,versicolor,virginica
train,0.97,0.74,0.93
test,0.94,0.56,0.73


#### Initialising a second HyperFrame

In [10]:
# Same dimensions and labels as `scores`, so the two frames can be merged later.
scores_lr = HyperFrame(dimension_labels, index_labels)
# Logistic regression without a regularisation penalty.
# NOTE(review): penalty="none" is the spelling used by older scikit-learn
# releases; newer ones expect penalty=None -- confirm against the installed
# version before re-running.
clf = LogisticRegression(penalty="none", max_iter=1000)
clf.fit(X_train, y_train)

# Record the metrics on the training split ...
yhat = clf.predict(X_train)
scores_lr.iset(metrics(y_train, yhat), "train", "", "")

# ... and on the test split.
yhat = clf.predict(X_test)
scores_lr.iset(metrics(y_test, yhat), "test", "", "")

<hyperframe.HyperFrame at 0x7f38c18f01d0>

# Merging

In [11]:
# Both frames must share the same shape before they are merged.
print(f"scores shape: {scores.shape}")
print(f"scores_lr shape: {scores_lr.shape}")

scores shape: (2, 3, 3)
scores_lr shape: (2, 3, 3)


In [12]:
# Stack the two (2, 3, 3) frames along a new dimension "model"; the labels are
# listed in the same order as the frames (scores -> "knn",
# scores_lr -> "logistic regression").
scores_models = scores.merge(scores_lr, "model", ["knn", "logistic regression"])

In [13]:
# Test-split f1 scores: species (rows) against model (columns). The new
# "model" dimension is addressed by the fourth positional argument.
scores_models.iget("test", "", "f1", "", return_type="pandas").round(2)

Unnamed: 0,knn,logistic regression
setosa,0.89,0.89
versicolor,0.61,0.47
virginica,0.71,0.62


In [14]:
# f1 scores of the logistic-regression model only: train_test (rows) against
# species (columns).
scores_models.iget("", "", "f1", "logistic regression", return_type="pandas").round(2)

Unnamed: 0,setosa,versicolor,virginica
train,0.92,0.73,0.77
test,0.89,0.47,0.62


#### Initialising a third HyperFrame

In [15]:
# Third frame with the same dimensions and labels, this time for a random forest.
scores_rf = HyperFrame(dimension_labels, index_labels)
# NOTE(review): no random_state is passed, so the fitted forest (and the
# scores shown below) may differ between runs -- consider seeding it like the
# train/test split.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Record the metrics on the training split ...
yhat = clf.predict(X_train)
scores_rf.iset(metrics(y_train, yhat), "train", "", "")

# ... and on the test split.
yhat = clf.predict(X_test)
scores_rf.iset(metrics(y_test, yhat), "test", "", "")

<hyperframe.HyperFrame at 0x7f38c18f0898>

In [16]:
# Test-split scores of the random forest: species (rows) against metric (columns).
scores_rf.iget("test", "", "", return_type="pandas").round(2)

Unnamed: 0,precision,recall,f1
setosa,0.95,0.95,0.95
versicolor,0.54,0.47,0.5
virginica,0.61,0.69,0.65


# Expanding a HyperFrame

In [17]:
# The merged frame has one dimension more than the frame about to be added.
print(f"scores_models shape: {scores_models.shape}")
print(f"scores_rf shape: {scores_rf.shape}")

scores_models shape: (2, 3, 3, 2)
scores_rf shape: (2, 3, 3)


In [18]:
# Add the three-dimensional random-forest frame as a new label
# "random forest" on the existing "model" dimension of the four-dimensional frame.
scores_models = scores_models.expand(scores_rf, "model", "random forest")

In [19]:
# Same query as before the expansion; the table now has a third model column.
scores_models.iget("test", "", "f1", "", return_type="pandas").round(2)

Unnamed: 0,knn,logistic regression,random forest
setosa,0.89,0.89,0.95
versicolor,0.61,0.47,0.5
virginica,0.71,0.62,0.65


# Writing to file

In [20]:
# Persist the HyperFrame under the given path (no file extension is supplied here).
scores_models.write_file("./demo/scores_models")

# Reading from file

In [21]:
# Load the frame back from the same path. read_file is invoked on the existing
# instance here, and its return value rebinds scores_models.
scores_models = scores_models.read_file("./demo/scores_models")