In [None]:
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Arize-ai/client_python/blob/main/arize/examples/tutorials/Arize_HelloWorld_classification.ipynb)

In [None]:
# Download the tutorial CSV into the current working directory; it is read by
# pd.read_csv under the same file name in a later cell.
!wget https://storage.googleapis.com/arize-assets/tutorials/b_open_source_dataset.csv

In [None]:
import pandas as pd
import sklearn
from sklearn import metrics
import statsmodels.formula.api as smf
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Load the downloaded dataset; the file is semicolon-separated and the column
# names are inferred from its first row.
model_data = pd.read_csv(
    'b_open_source_dataset.csv',
    sep=";",
    header='infer',
)

In [None]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
model_data

In [None]:
# One-hot encode the listed non-numeric categorical features so they can be fed
# to XGBoost. pd.get_dummies returns a new frame in which each listed column is
# replaced by one indicator column per category.
data_new = pd.get_dummies(
    model_data,
    columns=[
        'job', 'marital', 'education',
        'default', 'housing', 'loan',
        'contact', 'month', 'poutcome',
    ],
)

In [None]:
#Since y is a class variable we will have to convert it into binary format. (Since 2 unique class values)
# Assign the encoded column back explicitly. Calling replace(..., inplace=True) on
# `data_new.y` acts on an intermediate Series, which pandas no longer guarantees to
# propagate to the parent frame (it is a no-op under Copy-on-Write).
# The encoding is derived from the untouched `model_data['y']`, so re-running this
# cell always yields the same result. A label other than 'yes'/'no' maps to NaN and
# makes astype(int) raise, rather than leaking silently into training.
data_new['y'] = model_data['y'].map({'yes': 1, 'no': 0}).astype(int)

In [None]:
# Separate the class variable (y) from the feature columns (X) and list the
# column names of each resulting frame.
data_y = data_new['y'].to_frame()
data_X = data_new.drop(columns=['y'])
print(data_X.columns)
print(data_y.columns)

In [None]:
# Hold out 30% of the records for testing, stratified on the class variable with a
# fixed seed, then print the (rows, cols) shape of each split on its own line.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    test_size=0.3,
    stratify=data_y,
    random_state=2,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
import time
from xgboost import XGBClassifier
# Create an XGBoost classifier with 500 estimators and a fixed random seed.
# NOTE(review): neither `model` nor `time` is referenced again in the visible
# cells; the classifier that is actually trained and evaluated is `clf`.
model = XGBClassifier(n_estimators=500, random_state=0)
# define the eval set and metric
# NOTE(review): no eval set or metric is defined in this cell.


In [None]:
#Create an XGB classifier with default hyperparameters; it is fitted on the
#training split (70% of the data set) in the following cell.
# NOTE(review): `svm` is imported here but not used in the visible cells.
from sklearn import svm
from xgboost import XGBClassifier
clf = XGBClassifier()
# Bare expression so the notebook displays the estimator's repr.
clf

In [None]:
# Train on the training split; y_train is a single-column DataFrame, so it is
# flattened to a 1-D label array before fitting.
clf.fit(X_train, y_train.to_numpy().ravel())


In [None]:
# Predict class labels for the held-out test features.
y_pred = clf.predict(X_test)


In [None]:
# Bare expression: display the predicted labels as this cell's output.
y_pred

In [None]:
# Final model assessment: accuracy on both splits, to expose over-fitting, plus a
# per-class precision/recall/F1 report on the test split.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# pred_test is reused further down when logging predictions, so its name is kept.
pred_test = clf.predict(X_test)
pred_train = clf.predict(X_train)
print('Train Accuracy: ', accuracy_score(y_train, pred_train))
# Label typo fixed ("Accuraccy" -> "Accuracy").
print('Test Accuracy: ', accuracy_score(y_test, pred_test))
print('Classification Report:')
print(classification_report(y_test, pred_test, digits=5))

In [None]:
!pip install arize

In [None]:
# Build the feature data to send. X_test is one-hot encoded, so take the same rows
# (by index) from the original pre-encoding frame, minus the class variable, to
# keep the logged features human readable.
readable_features = model_data.loc[X_test.index].drop(columns=['y'])

print(readable_features)

In [None]:
import os

from arize.api import Client
from arize.types import ModelTypes

# Read the credentials from the environment so real keys never need to be written
# into (and committed with) the notebook. The variable names ARIZE_ORG_KEY and
# ARIZE_API_KEY are this notebook's own convention; if they are unset, the original
# placeholder strings are used.
#ORGANIZATION KEY - SUPPLIED BY ARIZE
org_key = os.environ.get('ARIZE_ORG_KEY', 'ORG_KEY')
#API KEY - GENERATED IN ARIZE ACCOUNT OR SUPPLIED
api_key = os.environ.get('ARIZE_API_KEY', 'API_KEY')

arize_client = Client(organization_key=org_key, api_key=api_key)

In [None]:
import datetime

# Name under which this model is logged.
model_name = 'colab_model_class'

# A new version id is derived from the current local time on every run. This is
# optional - traffic can also be sent under a fixed, per-build version id.
datetime_rightnow = datetime.datetime.now()
model_version_id_now = 'test_' + format(datetime_rightnow, '%m_%d_%Y__%H_%M_%S')


In [None]:
# Stringify the 1/0 class predictions and the prediction ids before logging
# (per the original note, the SDK is expected to handle this conversion later).
pred = pd.DataFrame(list(map(str, pred_test)))
ids = pd.DataFrame(list(map(str, X_test.index)))

# Send features, ids and predicted labels in bulk; the returned futures are
# resolved in the next cell.
tfuture = arize_client.log_bulk_predictions(
    model_id=model_name,
    model_version=model_version_id_now,
    model_type=ModelTypes.CATEGORICAL,
    prediction_ids=ids,
    prediction_labels=pred,
    features=readable_features,
)

In [None]:
# Resolve every future returned by the bulk call, not just the first one, so a
# failure in any request surfaces here; the list of results is the cell output.
[future.result() for future in tfuture]

In [None]:
# y_test is a DataFrame of 1/0 labels; cast it to str for classification
# (per the original note, the SDK is expected to handle this conversion later).
actuals_df = y_test.astype(str)

# Log the ground-truth labels under the same prediction ids used for the predictions.
tfuture = arize_client.log_bulk_actuals(
    model_id=model_name,
    model_type=ModelTypes.CATEGORICAL,
    prediction_ids=ids,
    actual_labels=actuals_df,
)