In [None]:
!pip install plotly
!pip install dataprep
!pip install h2o

import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

import h2o
from dataprep.eda import create_report
from h2o.automl import H2OAutoML

In [None]:
#read data
# The CSV location can be overridden through the CERVICAL_CANCER_CSV
# environment variable, so the notebook also runs on machines where the
# default absolute path below does not exist.
DATA_PATH = os.environ.get("CERVICAL_CANCER_CSV",
                           "C:\\Users\\Lelin\\Downloads\\cervical_cancer.csv")
cancer_df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first ten rows to sanity-check the load.
cancer_df.head(n=10)

# Exploratory Data Analysis

In [None]:
#info
# Print the frame summary (columns, non-null counts and dtypes) for the raw data.
cancer_df.info()

In [None]:
#statistics
# Summary statistics of the raw frame, taken before the '?' placeholders are
# replaced and before the columns are passed through pd.to_numeric further down.
# NOTE(review): columns still holding '?' strings are presumably object dtype at
# this point and therefore left out of this table — confirm.
cancer_df.describe()

In [None]:
# Swap every literal '?' cell for NaN so pandas treats it as a missing value,
# then display the resulting frame.
cancer_df = cancer_df.replace(to_replace='?', value=np.nan)
cancer_df

In [None]:
# Count the missing entries in each column.
cancer_df.isna().sum()

In [None]:
# Heatmap of the missing-value mask: one row per record, one column per
# feature, with the row tick labels hidden.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cancer_df.isna(), yticklabels=False, ax=ax)

In [None]:
# 'STDs: Time since first diagnosis' and 'STDs: Time since last diagnosis' have
# more than 80% missing values, and 'Dx:Cancer' raises concerns about the
# validity of the predictions, so all three columns are dropped.
unwanted_columns = [
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'Dx:Cancer',
]
cancer_df = cancer_df.drop(columns=unwanted_columns)
cancer_df.head()

In [None]:
# Remove the 'Hinselmann', 'Schiller' and 'Citology' columns as well.
# NOTE(review): presumably dropped because they are alternative screening
# outcomes rather than risk factors — confirm.
screening_columns = ['Hinselmann', 'Schiller', 'Citology']
cancer_df = cancer_df.drop(columns=screening_columns)
cancer_df.head()

In [None]:
# Pass every column through pd.to_numeric, one column at a time, then
# re-check the resulting dtypes.
cancer_df = cancer_df.apply(lambda column: pd.to_numeric(column))
cancer_df.info()

In [None]:
# Summary statistics again, now that the columns have been passed through pd.to_numeric.
cancer_df.describe()

In [None]:
# Per-column means; the next step rounds these and uses them to fill missing values.
cancer_df.mean()

In [None]:
# Fill each column's missing entries with that column's mean rounded to zero
# decimals.
# NOTE(review): the means are computed on the full data set, i.e. before the
# train/test split performed later in the notebook.
column_fill_values = cancer_df.mean().round()
cancer_df = cancer_df.fillna(column_fill_values)
cancer_df.head()

# Data Visualization

In [None]:
# Pairwise Pearson correlation between all columns of the cleaned frame.
corr = cancer_df.corr(method='pearson')
corr

In [None]:
# Build dataprep's report for the cleaned frame and show it as the cell output.
report = create_report(cancer_df, title='My Report')
report

In [None]:
def plot_counts_by_biopsy(data, column, target='Biopsy'):
    """Draw a count plot of ``column`` with the bars split by ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``column`` and ``target``.
    column : str
        Column whose distinct values go on the x axis.
    target : str, default 'Biopsy'
        Column used for the hue grouping.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the plot was drawn on.
    """
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.countplot(x=data[column], hue=data[target], ax=ax)
    # A title lets each figure stand alone when the notebook is skimmed.
    ax.set_title(f'{column} by {target}')
    return ax


# Features to chart against the biopsy outcome, one figure per entry.
COUNTPLOT_COLUMNS = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs: Number of diagnosis',
]

for column in COUNTPLOT_COLUMNS:
    plot_counts_by_biopsy(cancer_df, column)
    # Render each figure before the next one is created.
    plt.show()

# Train H2O AutoML

In [None]:
# Initialise the H2O session; the H2O frames and the AutoML run below depend on it.
h2o.init()

In [None]:
#convert pandas df into h2o frame
# The cleaned, imputed pandas frame is the input to the H2O workflow.
h2o_df = h2o.H2OFrame(cancer_df)

#preview
h2o_df

In [None]:
#statistics
# Column summary as reported by H2O for the converted frame.
h2o_df.describe()

In [None]:
# Partition the H2O frame with a 0.75 ratio for the first part; the fixed seed
# keeps the split repeatable. The first part is the training set, the
# remainder the test set.
splits = h2o_df.split_frame(ratios=[0.75], seed=121)
train, test = splits

In [None]:
# Response column name, and the predictor list: every training column except
# the response.
y = 'Biopsy'
X = list(train.columns)
X.remove(y)

In [None]:
# For binary classification the response must be a factor (categorical)
# column, so convert it in both splits.
for h2o_split in (train, test):
    h2o_split[y] = h2o_split[y].asfactor()

In [None]:
# Display the predictor column names that are passed to AutoML.
X

In [None]:
# Display the response column name.
y

In [None]:
# AutoML configuration: class balancing on, AUC as the stopping metric (a
# classification metric), a fixed seed, at most 20 models, and stacked
# ensembles excluded.
automl_settings = {
    'balance_classes': True,
    'stopping_metric': 'AUC',
    'seed': 121,
    'max_models': 20,
    'exclude_algos': ["StackedEnsemble"],
}
aml = H2OAutoML(**automl_settings)

# Fit on the training split only; the test split is kept for evaluation.
aml.train(x=X, y=y, training_frame=train)

# Leaderboard Exploration

In [None]:
#leaderboard
# Show every row of the AutoML leaderboard: head() is asked for lb.nrows rows.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

In [None]:
#compare all models
# Run the AutoML-level explain() on the held-out test frame.
exm = aml.explain(test)

In [None]:
#explain a single H2O model (e.g. leader model from AutoML)
# Note: this rebinds `exm`, replacing the value assigned by the previous cell.
exm = aml.leader.explain(test)

In [None]:
#make prediction
# Score the held-out test frame with the leader model.
preds = aml.leader.predict(test)

In [None]:
#combine the prediction with the test dataset
# Column-bind the prediction columns onto the test frame.
df = test.cbind(preds)

#preview
df

In [None]:
# Convert the combined H2O frame into a local (non-H2O) object.
# NOTE(review): `df` is rebound from the H2O frame to the converted result, so
# this cell presumably fails if re-run without re-running the previous one — confirm.
df = h2o.as_list(df)

In [None]:
# Keep only the actual label and the predicted label, side by side.
# Note: `preds` is rebound here; from this point on it is this two-column table.
preds = df.loc[:, ['Biopsy', 'predict']]
preds

In [None]:
# Write the full scored test set and the label/prediction table to CSV files
# in the working directory.
for table, csv_name in ((df, 'df.csv'), (preds, 'preds.csv')):
    table.to_csv(csv_name)

# Use LIME to explain H2O Models

In [None]:
!pip install lime
import lime
import lime.lime_tabular

In [None]:
# LIME feature list: every training column except the response `y`.
# Selecting by name (rather than slicing off the last column) keeps this
# correct even if the response is not the final column of the frame.
feature_names = [column for column in train.columns if column != y]
feature_names

In [None]:
# Pull the feature columns of the training split into pandas, then into the
# plain NumPy matrix handed to the LIME explainer.
train_features_df = train[feature_names].as_data_frame()
train_features_numpy = train_features_df.to_numpy()
train_features_numpy

In [None]:
#define LIME explainer
# LIME draws random perturbations around the explained instance; fixing
# random_state (same seed as the split and the AutoML run) makes the
# explanations repeatable from one run to the next.
explainer = lime.lime_tabular.LimeTabularExplainer(train_features_numpy,
                                                   feature_names=feature_names,
                                                   mode='classification',
                                                   random_state=121)

In [None]:
#https://sefiks.com/2019/09/19/explaining-h2o-models-with-lime/

def findPrediction(instance):
    """Prediction callback handed to LIME.

    Wraps the rows of ``instance`` in an H2OFrame, scores them with the AutoML
    leader model and returns every column of the prediction frame except the
    first one.

    Parameters
    ----------
    instance : numpy.ndarray
        2-D array of samples with one column per entry of ``feature_names``.

    Returns
    -------
    numpy.ndarray
        The prediction frame's columns from the second one onwards.
    """
    samples_df = pd.DataFrame(data=instance, columns=feature_names)
    samples_hf = h2o.H2OFrame(samples_df)
    scored = aml.leader.predict(samples_hf).as_data_frame()

    # The prediction frame is expected to hold the predicted class in its
    # first column followed by the class probabilities; LIME only needs the
    # probabilities, so the first column is skipped.
    return scored.iloc[:, 1:].to_numpy()

In [None]:
def explain_test_row(test_frame, row_idx, html_path):
    """Explain the leader model's prediction for one test row with LIME.

    The explanation is rendered in the notebook and also written to
    ``html_path``.

    Parameters
    ----------
    test_frame : pandas.DataFrame
        Test set as a pandas frame; must contain every column listed in
        ``feature_names``.
    row_idx : int
        Positional index of the row to explain.
    html_path : str
        File the HTML explanation is saved to.

    Returns
    -------
    tuple
        ``(instance, explanation)``: the 1-D feature array that was explained
        and the LIME explanation object.
    """
    # Pick the feature columns by name so the response column is excluded
    # wherever it sits in the frame.
    instance = test_frame[feature_names].iloc[row_idx].values
    explanation = explainer.explain_instance(instance,
                                             findPrediction,
                                             num_features=len(feature_names))
    explanation.show_in_notebook(show_table=True, show_all=True)
    explanation.save_to_file(html_path)
    return instance, explanation


# Convert the test split to pandas once and reuse it for every explanation.
test_df = test.as_data_frame()

# (row position in the test set, output file) for each explanation.
LIME_CASES = [(100, 'lime.html'), (120, 'lime1.html'), (150, 'lime2.html')]

for idx, html_path in LIME_CASES:
    # `test_numpy` and `exp` are kept as notebook-level names holding the most
    # recently explained row and its explanation.
    test_numpy, exp = explain_test_row(test_df, idx, html_path)