In [None]:
import numpy as np
import pandas as pd
import shap
import seaborn as sns
import warnings
# Seed NumPy's global random state so NumPy-driven randomness repeats between runs.
np.random.seed(10)
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager as fm, rcParams
# Make 'Arial Unicode MS' the only sans-serif font matplotlib will try.
# NOTE(review): presumably chosen so non-ASCII labels render — confirm the font is installed.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Relative path of the CSV that is loaded with pd.read_csv in the next cell.
file = './ESCC/dataset/GSE53625.csv'

In [None]:
# Read the CSV named by `file`, report its dimensions, and preview the top rows.
raw_data = pd.read_csv(filepath_or_buffer=file)
print(raw_data.shape)
raw_data.head(5)

In [None]:
# Drop the first column (by position) and the 'OS.time' column (by name);
# whatever remains is the modelling table.
train_data = raw_data.iloc[:, 1:].drop(columns='OS.time')
print(train_data.shape)
train_data.head(5)

In [None]:
# 'OS' is the label column; every other column is a predictor.
y = train_data['OS']
X = train_data.drop(columns='OS')

# Shapes of predictors and label, followed by the label's class counts.
print(X.shape, y.shape, sep='\n')
print(y.value_counts())

In [None]:
# Min-max rescale every predictor column, learning the ranges from X itself.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Keep the 300 columns of X_scaled that score highest under chi2 against y.
# NOTE(review): the selector is fitted on every row, before the train/test
# split performed further down, so held-out labels take part in the
# selection — confirm this is intended.
selector = SelectKBest(score_func=chi2, k=300)
selector.fit(X_scaled, y)
X_selected = selector.transform(X_scaled)

In [None]:
# Integer positions of the columns that `selector` retained ...
selected_indices = selector.get_support(indices=True)
# ... translated to their column names in X (the frame the selector's input was derived from).
selected_feature_names = X.columns.take(selected_indices)
selected_feature_names

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_bin_y, test_bin_y = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, sep='\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded 100-tree random forest, trained on the training split.
RF = RandomForestClassifier(random_state=2022, n_estimators=100)
RF.fit(train_x, train_bin_y)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the forest on the held-out rows and print the classification report.
rf_y_pred = RF.predict(test_x)
print(classification_report(y_true=test_bin_y, y_pred=rf_y_pred))

In [None]:
from tpot import TPOTClassifier

# Split X_selected again with the same arguments as the earlier split,
# this time binding the parts to X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

# Run a TPOT search (5 generations, population of 20) on the training part.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)

# Score the fitted TPOT model on the held-out part and print it.
accuracy = tpot.score(X_test, y_test)
print("accuracy:", accuracy)

In [None]:
# Importance assigned by the fitted forest to each of its input columns.
feature_importances = RF.feature_importances_

# Pair every selected feature name with its importance, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': selected_feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Names of the k highest-ranked features.
k = 150
selected_features = feature_importance_df['Feature'].head(k).values
selected_features

In [None]:
# Narrow X to the features chosen by the importance ranking.
# NOTE(review): despite the `_15` suffix, the width equals len(selected_features)
# (k = 150 in the preceding cell) — consider renaming.
X_15 = X.loc[:, selected_features]
X_15.shape

In [None]:
# Fresh min-max scaler, fitted on and applied to the reduced frame X_15.
new_scaler = MinMaxScaler()
new_scaler.fit(X_15)
X15_scaled = new_scaler.transform(X_15)

In [None]:
# Second chi2 filter: keep the 50 best-scoring columns of X15_scaled.
new_selector = SelectKBest(score_func=chi2, k=50)
new_selector.fit(X15_scaled, y)
X15_selected = new_selector.transform(X15_scaled)

In [None]:
# Integer positions of the columns kept by new_selector. The selector was
# fitted on X15_scaled, so these positions count along X_15's columns.
selected_indices = new_selector.get_support(indices=True)

# Translate positions to names through X_15.columns. Indexing the full
# X.columns with these positions would return unrelated feature names,
# because X has a different column set and order than X_15.
selected_feature_names = X_15.columns[selected_indices]

In [None]:
# 80/20 split of the second-stage feature matrix, with the same seed as before.
train_x, test_x, train_bin_y, test_bin_y = train_test_split(
    X15_selected,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, sep='\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Rebind RF to a new seeded 100-tree forest trained on the second-stage training split.
RF = RandomForestClassifier(random_state=2022, n_estimators=100)
RF.fit(train_x, train_bin_y)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the second-stage forest on its held-out rows.
rf_y_pred = RF.predict(test_x)
print(classification_report(y_true=test_bin_y, y_pred=rf_y_pred))

In [None]:
# Importances reported by the most recently fitted forest.
feature_importances = RF.feature_importances_

# Table of (feature name, importance) sorted from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': selected_feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Names of the k highest-ranked features.
k = 30
selected_features = feature_importance_df['Feature'].head(k).values
selected_features

In [None]:
# Narrow X to the features named in selected_features.
X_30 = X.loc[:, selected_features]
X_30.shape

In [None]:
# Rebind new_scaler to a fresh min-max scaler fitted on and applied to X_30.
new_scaler = MinMaxScaler()
new_scaler.fit(X_30)
X30_scaled = new_scaler.transform(X_30)

In [None]:
# Third chi2 filter: keep the 20 best-scoring columns of X30_scaled.
new_selector = SelectKBest(score_func=chi2, k=20)
new_selector.fit(X30_scaled, y)
X30_selected = new_selector.transform(X30_scaled)

In [None]:
# 80/20 split of the third-stage feature matrix, with the same seed as before.
train_x, test_x, train_bin_y, test_bin_y = train_test_split(
    X30_selected,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, sep='\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Rebind RF to a new seeded 100-tree forest trained on the third-stage training split.
RF = RandomForestClassifier(random_state=2022, n_estimators=100)
RF.fit(train_x, train_bin_y)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the third-stage forest on its held-out rows.
rf_y_pred = RF.predict(test_x)
print(classification_report(y_true=test_bin_y, y_pred=rf_y_pred))