In [1]:
import os
import numpy as np
import pandas as pd
from interface.apis import KGFarm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Single seed reused below for NumPy, the train/test split and the classifier.
RANDOM_STATE = 7
# KGFarm client; every dataset-loading, recommendation and transformation call
# in this notebook goes through this object.
kgfarm = KGFarm()
# Seed NumPy's legacy global random generator.
# NOTE(review): the seed is set after KGFarm() is constructed; if the constructor
# draws random numbers they are unseeded - confirm whether the order matters.
np.random.seed(RANDOM_STATE)

KGFarm is running in HUMAN-IN-THE-LOOP mode!


## Load dataset as a DataFrame

In [2]:
# Load the Titanic passenger table through the KGFarm client.
df = kgfarm.load_titanic_dataset()
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Separate independent variables (features or 'X') and dependent variable (target or 'y')

In [3]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(how='any')
# Target vector: the survival flag.
y = df['Survived']
# Feature matrix: every remaining column except the target.
X = df.drop(columns=['Survived'])
# Bare last expression so the notebook renders the feature frame.
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


## Use KGFarm for Data Transformation recommendations

In [4]:
# Ask KGFarm which transformations it recommends for the feature frame.
transformation_info = kgfarm.recommend_transformations(X=X)
# Display the recommendations; the following cells select rows of this table
# by position (iloc[0], iloc[1], iloc[2]) and apply them one at a time.
transformation_info

Unnamed: 0,Feature,Recommended_transformation,Transformation_type
0,"[Cabin, Embarked, Name, Sex, Ticket]",OrdinalEncoder,categorical
1,"[PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]",StandardScaler,scaling
2,"[Age, Fare, PassengerId, Pclass, SibSp]",Log,unary


## Applying transformations directly using KGFarm (no explicit code needed)
It is recommended to apply categorical transformations first, followed by scaling and then unary transformations.

In [5]:
# Apply the first recommendation (row 0 of transformation_info; in the recorded
# run this is the OrdinalEncoder entry). The call returns the transformed frame
# plus a second object, kept here as `encoder`.
# NOTE: X is rebound to the result, so re-running this cell passes the
# already-transformed frame back in.
X, encoder = kgfarm.apply_transformations(X=X, recommendation=transformation_info.iloc[0])
# Bare last expression so the notebook renders the transformed frame.
X

Applying OrdinalEncoder on ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,49.0,0.0,38.0,1,0,109.0,71.2833,72.0,0.0
3,4,1,70.0,0.0,35.0,1,0,31.0,53.1000,48.0,2.0
6,7,1,112.0,1.0,54.0,0,0,55.0,51.8625,117.0,2.0
10,11,3,148.0,0.0,4.0,1,1,120.0,16.7000,131.0,2.0
11,12,1,27.0,0.0,58.0,0,0,26.0,26.5500,43.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,18.0,0.0,47.0,1,1,33.0,52.5542,91.0,2.0
872,873,1,35.0,1.0,33.0,0,0,89.0,5.0000,29.0,2.0
879,880,1,140.0,0.0,56.0,0,1,38.0,83.1583,61.0,0.0
887,888,1,75.0,0.0,19.0,0,0,10.0,30.0000,25.0,2.0


In [6]:
# Apply the second recommendation (row 1 of transformation_info; in the recorded
# run this is the StandardScaler entry). The call returns the transformed frame
# plus a second object, kept here as `scaler`.
# NOTE: X is rebound to the result, so re-running this cell passes the
# already-transformed frame back in.
X, scaler = kgfarm.apply_transformations(X=X, recommendation=transformation_info.iloc[1])
# Bare last expression so the notebook renders the transformed frame.
X

Applying StandardScaler on ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,-1.840135,-0.372256,-0.795052,-1.039012,0.149065,0.833628,-0.631730,1.264647,-0.097180,0.087510,-1.339397
3,-1.832017,-0.372256,-0.397526,-1.039012,-0.043230,0.833628,-0.631730,-0.892052,-0.335997,-0.535411,0.755557
6,-1.819841,-0.372256,0.397526,0.962453,1.174636,-0.723044,-0.631730,-0.228452,-0.352250,1.255486,0.755557
10,-1.803606,3.520480,1.078999,-1.039012,-2.030273,0.833628,0.697081,1.568797,-0.814070,1.618857,0.755557
11,-1.799547,-0.372256,-1.211507,-1.039012,1.431029,-0.723044,-0.631730,-1.030302,-0.684702,-0.665187,0.755557
...,...,...,...,...,...,...,...,...,...,...,...
871,1.691045,-0.372256,-1.381875,-1.039012,0.725949,0.833628,0.697081,-0.836752,-0.343166,0.580655,0.755557
872,1.695104,-0.372256,-1.060069,0.962453,-0.171426,-0.723044,-0.631730,0.711647,-0.967737,-1.028557,0.755557
879,1.723516,-0.372256,0.927560,-1.039012,1.302832,-0.723044,0.697081,-0.698502,0.058785,-0.197996,-1.339397
887,1.755986,-0.372256,-0.302877,-1.039012,-1.068801,-0.723044,-0.631730,-1.472701,-0.639390,-1.132377,0.755557


In [7]:
# Apply the third recommendation (row 2 of transformation_info; in the recorded
# run this is the Log entry). The call returns the transformed frame plus a
# second object, kept here as `u_transformation`.
# NOTE(review): the input has already been standardized by the previous cell, so
# it contains negative values; how KGFarm's Log handles them is not visible
# here - confirm.
# NOTE: X is rebound to the result, so re-running this cell passes the
# already-transformed frame back in.
X, u_transformation = kgfarm.apply_transformations(X=X, recommendation=transformation_info.iloc[2])
# Bare last expression so the notebook renders the transformed frame.
X

Applying Log on ['Age', 'Fare', 'PassengerId', 'Pclass', 'SibSp']


  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,-9.210340,-9.210340,-0.795052,-1.039012,0.865781,0.442614,-0.631730,1.264647,-0.065791,0.087510,-1.339397
3,-4.801470,-9.210340,-0.397526,-1.039012,0.781417,0.442614,-0.631730,-0.892052,-0.360240,-0.535411,0.755557
6,-3.892508,-9.210340,0.397526,0.962453,1.224490,-9.210340,-0.631730,-0.228452,-0.383817,1.255486,0.755557
10,-3.306903,1.359138,1.078999,-1.039012,-1.621903,0.442614,0.697081,1.568797,-1.516696,1.618857,0.755557
11,-3.201815,-9.210340,-1.211507,-1.039012,1.297142,-9.210340,-0.631730,-1.030302,-1.053244,-0.665187,0.755557
...,...,...,...,...,...,...,...,...,...,...,...
871,1.261660,-9.210340,-1.381875,-1.039012,1.083074,0.442614,0.697081,-0.836752,-0.370570,0.580655,0.755557
872,1.262809,-9.210340,-1.060069,0.962453,0.720942,-9.210340,-0.631730,0.711647,-2.721601,-1.028557,0.755557
879,1.270814,-9.210340,0.927560,-1.039012,1.261476,-9.210340,0.697081,-0.698502,0.088278,-0.197996,-1.339397
887,1.279884,-9.210340,-0.302877,-1.039012,0.147553,-9.210340,-0.631730,-1.472701,-0.931109,-1.132377,0.755557


## Select subset of features for predicting target (if needed)
This step is recommended for datasets with more than 100 features.

In [8]:
# Ask KGFarm which features to keep, passing task='binary' together with the
# transformed features and the target.
features = kgfarm.recommend_features_to_be_selected(task='binary', X=X, y=y)
# Keep only the columns listed under 'Feature' in the recommendation
# (presumably a DataFrame with one row per selected feature - verify).
X = X[features['Feature']]

## Split data into Train and Test sets 

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): X was transformed on all rows in the cells above, before this
# split - confirm whether fitting the transformations on test rows is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

## Train & Evaluate model performance 

In [10]:
# Build and fit the random forest in one expression; scikit-learn's fit()
# returns the fitted estimator itself, so `model` is the trained classifier.
model = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
# score() gives mean accuracy on the held-out split; report it as a percentage.
print(f'Accuracy: {model.score(X_test, y_test)*100:.1f}%')

Accuracy: 78.4%
