In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler,OneHotEncoder

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [None]:
### Importing dataset
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 336 in the UCI ML repository.
chronic_kidney_disease = fetch_ucirepo(id=336)

# Features as delivered by the repository client.
X = chronic_kidney_disease.data.features

# Targets, with the stray "ckd\t" label normalised to "ckd".
# The cleaned result is bound to `y` instead of replacing in place, so the frame
# held by `chronic_kidney_disease` is left untouched and re-running this cell
# always starts from the same raw labels.
y = chronic_kidney_disease.data.targets.replace("ckd\t", "ckd")

# metadata
print(chronic_kidney_disease.metadata)

In [None]:
# variable information: print the `variables` attribute of the fetched dataset
print(chronic_kidney_disease.variables) 

In [None]:
# (rows, columns) of the feature table
print(X.shape)

In [None]:
# Column dtypes of the feature table, checked before the columns in `cat` are encoded
print(X.dtypes)

In [None]:
# Columns that are converted from labels to integer category codes.
cat = ['rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane']

# Work on an independent copy so the frame held by the fetched dataset object
# is not modified (and no chained-assignment warning is triggered).
X = X.copy()

for col in cat:
    codes = pd.Series(pd.Categorical(X[col]).codes, index=X.index, dtype="float64")
    # pd.Categorical encodes a missing value as the sentinel code -1. Turn that
    # back into NaN so missing entries stay missing (and can be imputed later)
    # instead of being treated as an extra category.
    X[col] = codes.where(codes >= 0)

X.head()

### 1.
We need to classify whether the patient has chronic kidney disease.

In [None]:
### Q2
# Standardise every column of X with a StandardScaler fitted on X itself.
scaler = StandardScaler()
scaled_X = scaler.fit(X).transform(X)

### Q2.
Standardized the numeric values of X to reduce the effect of differing means and variances across features on the classification.

In [None]:
### Q3
# Shape of the standardised array returned by scaler.fit_transform
print(scaled_X.shape)

In [None]:
# Wrap the standardised values in a DataFrame that reuses X's column names and
# row index, so later index-aligned operations line up with X and y.
scaled_X = pd.DataFrame(scaled_X, columns=X.columns, index=X.index)
print(scaled_X.head())

In [None]:
# One 20-bin histogram per standardised column, each drawn on its own figure.
for column in scaled_X.columns:
    fig, ax = plt.subplots()
    ax.hist(scaled_X[column], bins=20)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of {column}')
    plt.show()

### Q3
- Some covariates are skewed to the left (sod, hemo, pcv), and some are skewed to the right (bgr, bu, sc, pot, wbcc).
- None of the categorical variables has a balanced distribution of outcomes.
- Some covariates (bp, sg, al, su) take only a few distinct values, which means they are effectively categorical.

In [None]:
### Q4
# Pairwise correlations between the standardised columns, drawn as an
# annotated heatmap on an explicitly created 10x8 figure.
correlation_matrix = scaled_X.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap for numeric variables")
plt.show()

### Q4.
- 'pot' and 'wbcc' do not seem to be related to the other covariates, while 'sg' and 'pcv' seem to be heavily correlated with other variables.
- There seem to be more negative correlations than positive correlations.
- 'hemo', 'pcv', 'sg', 'al' and 'rbcc' seem to be heavily correlated with each other.

In [None]:
### Q5
# Impute on the standardised scale: each missing entry is replaced by the mean
# of its own column. Building the result directly from scaled_X keeps every
# column as float and avoids copying X and overwriting it column by column.
revised_X = scaled_X.fillna(scaled_X.mean())

print(revised_X.shape)
print(revised_X.dtypes)

In [None]:
### Q6
# Flag a row as an outlier when any of its imputed, standardised values lies
# more than `threshold` standard deviations from the column mean. The absolute
# value is required so that extreme low values are caught as well as extreme
# high ones.
z_scores = stats.zscore(revised_X)
threshold = 3
outliers = (np.abs(z_scores) > threshold).any(axis=1)
scaled_X_removed = scaled_X[~outliers]

# Fill the remaining NaN with the column means of the retained rows.
scaled_X_removed = scaled_X_removed.fillna(scaled_X_removed.mean())

scaled_X_removed.shape

In [None]:
### Q7
# Project the cleaned features onto two principal components; the projection
# is used only as plotting coordinates.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_X_removed)

# Cluster on the full feature set. fit_predict fits the estimator and returns
# the labels in one call, so no separate fit is needed beforehand (a prior fit
# would be thrown away when fit_predict refits).
kmeans = KMeans(n_clusters=2, random_state=142857)
clusters = kmeans.fit_predict(scaled_X_removed)

# Colour each observation by its cluster in the 2-D PCA projection.
plt.figure(figsize=(8, 6))
plt.scatter(principal_components[:, 0], principal_components[:, 1], c=clusters, cmap='viridis')
plt.title('K-means Clustering')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
### Q8
# Drop the same outlier rows from the target that were dropped from scaled_X,
# so features and labels stay row-aligned.
y_removed = y[~outliers]

In [None]:
# Hold out 30% of the cleaned rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X_removed,
    y_removed,
    test_size=0.3,
    random_state=1,
)

### Q9
- One can use a classification tree, since it is the most natural choice.
- The other method for classification would be perception, or using a coin flip; an accuracy of about 50-60% would be expected from these.

In [None]:
# Classification tree limited to depth 10, with a fixed seed.
cs_dt = DecisionTreeClassifier(max_depth=10, random_state=0)

In [None]:
# Fit the tree on the training split only.
cs_dt.fit(X_train, y_train)

In [None]:
# Q10
# Predict labels for the held-out test rows and show the first five.
pred = cs_dt.predict(X_test)
pred[:5]

In [None]:
# Class counts in the training target.
y_train.value_counts()

In [None]:
# Draw the top levels of the fitted tree.
# class_names must follow the order of the fitted estimator's classes_, so the
# names are taken from cs_dt.classes_ directly rather than hard-coded; a
# hand-written list in a different order would mislabel every node.
plot_tree(
    cs_dt,
    max_depth=2,
    feature_names=X_train.columns.tolist(),
    class_names=[str(label) for label in cs_dt.classes_],
)

In [None]:
# Fix the label order explicitly so that row/column 0 is 'notckd' ("No") and
# row/column 1 is 'ckd' ("Yes"). Without `labels`, confusion_matrix orders the
# classes by sorted label value, which puts 'ckd' first and would attach the
# "No"/"Yes" headings to the wrong classes.
class_order = ['notckd', 'ckd']
cm = pd.DataFrame(
    confusion_matrix(y_test, pred, labels=class_order),
    index=['No', 'Yes'],
    columns=['No', 'Yes'],
)
cm.index.name = 'True'
cm.columns.name = 'Predicted'
cm

In [None]:
# Evaluate the fitted tree on the held-out test split.
cs_dt.score(X_test, y_test)

In [None]:
# Per-class report comparing the test labels with the tree's predictions.
print(classification_report(y_test, pred))

### Q10
- From the classification tree, we get an accuracy of over 97%.
- By comparison, it would be hard to reach a similar score using perception or a coin flip.

In [None]:
### Q11
# Updating classification tree...
# Shallower tree (depth 5), fitted on the training split only. Using
# X_train/y_train keeps this model on the same cleaned, imputed data as the
# first tree, keeps the test rows out of training, and matches the
# X_train.columns names used to label its feature importances.
cs_dt_best = DecisionTreeClassifier(
    max_depth=5,
    random_state=0,
)
cs_dt_best.fit(X_train, y_train)

In [None]:
# Feature importances of the depth-5 tree, one value per input column.
fea_imp = cs_dt_best.feature_importances_

In [None]:
# Reverse the ascending argsort to get indices from most to least important,
# then reorder the feature names and importance values to match.
sorted_indices = np.argsort(fea_imp)[::-1]
sorted_feature_names = X_train.columns[sorted_indices]
sorted_importances = fea_imp[sorted_indices]

In [None]:
# Horizontal bar chart of the importances, most important feature first.
fig, ax = plt.subplots()
sns.barplot(x=sorted_importances, y=sorted_feature_names, ax=ax)
plt.show()

### Q11-Q12
- The importance of the variates varies heavily, so including only certain variates may be a better way to classify.
- A classification tree seems to be a good way of classifying the observations.

In [None]:
# Q13
# Importances are read from cs_dt_best, the depth-5 tree fitted for Q11; this
# cell must refer to a model that exists in the notebook so it runs on a fresh
# kernel.
feature_importances = cs_dt_best.feature_importances_
indices = np.argsort(feature_importances)[::-1]  # most important first
feature_names = X_train.columns

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), feature_importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), feature_names[indices], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

# The two most important features.
feature_names[indices][:2]

### Q13
- 'hemo' and 'sg' are two most important features. Since they are directly related to health, it is natural that they are important.
- The fact that age is not very important implies that CKD is acquired rather than congenital.

In [None]:
# Q14
## We can try random forest classifier to improve classification using classification tree
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# np.ravel flattens the target to 1-D (a no-op if it is already 1-D), which is
# the shape the estimator expects for a single-output problem and avoids a
# column-vector conversion warning.
random_forest_classifier.fit(X_train, np.ravel(y_train))

# Score on the held-out test split.
accuracy = random_forest_classifier.score(X_test, y_test)
print(accuracy)

### Q14
The accuracy of the classification improved from 0.97 to 1.0.

### Q16
https://github.com/FoolyM/3DA3HW6

### Helper's Name
Jihwan Kim(400182249)