In [1]:
from scipy.io import arff
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the ARFF file: `data` is the record array of observations, `meta` its attribute metadata
data, meta = arff.loadarff('./dataset/primary-tumor.arff')
n_observations, n_features = len(data), len(data[0])
print("Number of observations: ", n_observations, "\nNumber of features: ", n_features)

Number of observations:  339 
Number of features:  18


# Introduction

The tumor dataset contains 339 observations, each with 18 features, all of which are categorical. I treat the binary class feature as the response variable and use the remaining 17 features as explanatory variables to predict whether the class is positive or negative. The first task is to clean the dataset. For missing values, I take two distinct approaches: (1) removing rows with any missing value and (2) patching the missing values in the dataset. For the patching approach, following a common data-cleansing convention, I replace the missing values with the mode of the same column or, where many values are missing, with values drawn according to the observed category frequencies. After data cleansing, I build a logistic regression model to predict the response from the explanatory variables. The performance of the model is evaluated with cross-validation (CV), and the results from the two data-cleansing approaches are compared against each other.

In [3]:
# Distinct raw values of two example columns (displayed as a tuple of arrays)
tuple(np.unique(data[column]) for column in ('degree-of-diffe', 'age'))

(array([b'?', b'fairly', b'poorly', b'well'], dtype='|S6'),
 array([b'30-59', b'<30', b'>=60'], dtype='|S5'))

In [4]:
# Wrap the ARFF record array in a DataFrame, requesting string dtype for every column.
# The cells below compare entries against the str '?' (the missing-value marker),
# so the values need to be str here rather than the bytes seen in the raw array.
# NOTE(review): how bytes are coerced by `dtype=str` may depend on the pandas version — confirm
# that the columns hold plain strings such as '?' and not "b'?'".
df = pd.DataFrame(data, dtype=str)

In [5]:
# Report, column by column, how many entries carry the '?' missing-value marker
for name in df.columns:
    n_missing = (df[name] == '?').sum()
    print(f"colname: {name}, num. of missing value:", n_missing)

colname: age, num. of missing value: 0
colname: sex, num. of missing value: 1
colname: histologic-type, num. of missing value: 67
colname: degree-of-diffe, num. of missing value: 155
colname: bone, num. of missing value: 0
colname: bone-marrow, num. of missing value: 0
colname: lung, num. of missing value: 0
colname: pleura, num. of missing value: 0
colname: peritoneum, num. of missing value: 0
colname: liver, num. of missing value: 0
colname: brain, num. of missing value: 0
colname: skin, num. of missing value: 1
colname: neck, num. of missing value: 0
colname: supraclavicular, num. of missing value: 0
colname: axillar, num. of missing value: 1
colname: mediastinum, num. of missing value: 0
colname: abdominal, num. of missing value: 0
colname: binaryClass, num. of missing value: 0


# Identify the rows with missing values

In [6]:
# Rows that have a '?' in at least one of the columns that will be patched.
# `degree-of-diffe` is deliberately not checked here: that column is dropped later.
columns_to_check = ['sex', 'skin', 'axillar', 'histologic-type']
row_has_missing = (df[columns_to_check] == '?').any(axis=1).to_numpy()
bad_rows = np.unique(df.index[row_has_missing].tolist())
print(f"Number of bad rows: {len(bad_rows)}")

Number of bad rows: 69


# Patch the missing values

The missing values occur in the columns `sex`, `histologic-type`, `degree-of-diffe`, `skin`, and `axillar`. For `sex`, `skin`, and `axillar`, there is only one missing value in each column, so I patch it with the mode of that column. On the other hand, `degree-of-diffe` has too many missing values (155, nearly half of the observations), so this column is removed from the analysis in the next steps. Lastly, for `histologic-type`, I fill in the missing entries by sampling from the observed categories with probabilities equal to their relative frequencies among the non-missing values.

In [7]:
# Patch the single-gap columns with their most frequent observed value.
# The mode is computed over the non-missing entries only, so that the '?'
# placeholder itself can never be selected as the replacement value.
for col in ['sex', 'skin', 'axillar']:
    observed = df.loc[df[col] != '?', col]
    df[col] = df[col].replace('?', observed.mode()[0])

In [8]:
# Category labels of `histologic-type` and how often each occurs (including the '?' marker)
val, counts = np.unique(df['histologic-type'], return_counts=True)
print(val, counts)
# Keep only the counts of real categories. Select by label rather than by position:
# slicing off the first entry is only right while '?' is present and sorts first,
# and would silently discard a real category if this cell is re-run after imputation.
counts = counts[val != '?']

['?' 'adeno' 'anaplastic' 'epidermoid'] [ 67 220   8  44]


In [9]:
# Impute missing `histologic-type` entries by sampling from the observed categories,
# each drawn with probability equal to its relative frequency among known values.
np.random.seed(0)  # fixed seed so the imputation is reproducible
missing_mask = df['histologic-type'] == '?'
known_types = val[val != '?']  # real categories only, aligned with `counts`
fills = np.random.choice(
    known_types,
    size=int(missing_mask.sum()),  # one draw per missing entry instead of a hard-coded count
    p=counts / counts.sum(),
)
fill_val, fill_counts = np.unique(fills, return_counts=True)
print("Filled value and occurrence:")
for v, c in zip(fill_val, fill_counts):
    print(v, c)
# Write the draws back with a single .loc assignment (in row order). Chained indexing
# such as df[col][idx] = ... is not guaranteed to modify `df` and does nothing under
# pandas Copy-on-Write.
df.loc[missing_mask, 'histologic-type'] = fills

Filled value and occurrence:
adeno 57
anaplastic 2
epidermoid 8


In [10]:
# `degree-of-diffe` is excluded from the analysis, so remove that column from the frame
df = df.drop('degree-of-diffe', axis=1)

In [11]:
# Numeric encoding for the model:
#   design   - one-hot (0/1) indicator columns for every explanatory variable
#   response - the binary class label encoded as N -> 0, P -> 1
explanatory = df.drop(columns='binaryClass')
design = pd.get_dummies(explanatory, dtype=int)
response = df['binaryClass'].map({'N': 0, 'P': 1})

In [12]:
# Same encoding as above, restricted to the rows that had no missing value originally
df_without_bad_rows = df.drop(bad_rows)
design_drop = pd.get_dummies(df_without_bad_rows.drop(columns='binaryClass'), dtype=int)
response_drop = df_without_bad_rows['binaryClass'].map({'N': 0, 'P': 1})

# Build the logistic regression model

Since the response variable is a binary class, it is convenient to build a logistic
regression model to predict the response from the explanatory variables. The performance of the model is measured using 5-fold CV.
The two ways of handling missing values are also compared and discussed: removing the rows with missing values (after dropping the column `degree-of-diffe`) versus using imputation to fill in the missing values.

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [14]:
# 5-fold cross-validated mean accuracy of the same logistic regression setup on both
# cleaned datasets (rows removed vs. missing values patched)
clf = LogisticRegression(random_state=0, solver='lbfgs')
accuracy_drop = cross_val_score(clf, design_drop, response_drop, cv=5).mean()
accuracy_patch = cross_val_score(clf, design, response, cv=5).mean()
print("By removing rows with any missing value, the mean accuracy is: ", accuracy_drop)
print("By patching the missing values, the mean accuracy is: ", accuracy_patch)

By removing rows with any missing value, the mean accuracy is:  0.8703703703703705
By patching the missing values, the mean accuracy is:  0.8435908691834945


In [15]:
# Fit on the full patched dataset and display the learned coefficients
# (one per one-hot column of `design`); `fit` returns the estimator itself
clf.fit(design, response).coef_

array([[ 0.0636175 , -0.07612299,  0.01276981, -0.31280611,  0.31307042,
        -1.40393297,  0.58507355,  0.81912374, -0.25740756,  0.25767187,
        -0.1327541 ,  0.13301842,  0.41347863, -0.41321432, -0.30357582,
         0.30384013,  0.18991419, -0.18964988, -0.20165245,  0.20191677,
        -0.39657215,  0.39683646, -0.82501741,  0.82528172,  1.16425717,
        -1.16399286, -0.44311131,  0.44337562,  0.57452116, -0.57425685,
        -1.25150799,  1.2517723 ,  0.50728716, -0.50702285]])

# Results and discussion

From the analysis above, we can see that the 16 explanatory variables used in the model (the original 17 minus the dropped `degree-of-diffe` column) predict the binary class response fairly well. The 5-fold cross-validation shows a mean accuracy of 0.8436 for the approach that patches the missing values and 0.8704 for the approach that removes the rows with missing values. The row-removal approach has slightly better accuracy, but the two scores are computed on different sets of observations, so they are not strictly comparable. The row-removal model may also generalize less well to new data, because the number of observations is reduced by 69 after removing the rows with missing values.