In [None]:
import numpy as np
import pandas as pd
import shap
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(seed=10)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager as fm, rcParams
# Use 'Arial Unicode MS' as the only sans-serif candidate for all figures below.
plt.rcParams.update({'font.sans-serif': ['Arial Unicode MS']})

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
import os

# Location of the expression spreadsheet. Set the ESCC_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original absolute path is used, so existing setups keep working.
path = os.environ.get('ESCC_DATA_PATH', '/ESCC/dataset/GSE161533.xls')

In [None]:
train_data = pd.read_excel(path)
train_data.head()

In [None]:
train_data.shape

In [None]:
train_data = train_data.iloc[:,1:]
train_data.head(2)

In [None]:
# 'OS' is the label column; every other column is used as a feature.
y = train_data['OS']
X = train_data.drop(columns='OS')
print(X.shape, y.shape, sep='\n')

In [None]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(type(X_scaled))
X_scaled.shape

In [None]:
selector = SelectKBest(chi2, k=50)
X_selected = selector.fit_transform(X_scaled, y)
X_selected.shape

In [None]:
selected_indices = selector.get_support(indices=True)

selected_feature_names = X.columns[selected_indices]

print("Selected Features:")
print(selected_feature_names)

In [None]:
chi2_scores = selector.scores_

# DataFrame
feature_scores_df = pd.DataFrame({'Feature': X.columns, 'Chi2 Score': chi2_scores})

# sort
feature_scores_df = feature_scores_df.sort_values(by='Chi2 Score', ascending=False)

# top
top_50_features = feature_scores_df.head(50)

# plot
plt.figure(figsize=(12, 8))
sns.barplot(x='Chi2 Score', y='Feature', data=top_50_features, palette='viridis')
plt.title('Top 50 Features selected by Chi-squared Test')
plt.xlabel('Chi2 Score')
plt.ylabel('Feature')
plt.show()

In [None]:
# split
train_x, test_x, train_bin_y, test_bin_y = train_test_split(X_selected, y, test_size=0.2, random_state=42)
print(train_x.shape)
print(test_x.shape)

### RF

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees on the chi2-selected training features. max_depth is not
# set, so the library default applies; random_state pins the result.
RF = RandomForestClassifier(random_state=2022, n_estimators=100)
RF.fit(X=train_x, y=train_bin_y)

RandomForestClassifier(random_state=2022)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict the held-out rows and print the per-class report.
rf_y_pred = RF.predict(test_x)
print(classification_report(y_true=test_bin_y, y_pred=rf_y_pred))

In [None]:
# Importance of each input feature as reported by the fitted forest.
feature_importances = RF.feature_importances_

# Pair every selected feature name with its importance, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': selected_feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importance from Random Forest')
# Explicit labels and plt.show() match the chi2 plot cell and keep the cell
# from ending on a bare Text(...) repr.
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
# Names of the first 15 rows of the importance table (sorted by descending importance).
RF_list15 = feature_importance_df['Feature'].head(15).tolist()
print(RF_list15)

In [None]:
# Reduced design matrix: only the 15 forest-ranked columns, plus the same 'OS' label.
new_y = train_data['OS']
new_X = train_data[RF_list15]
print(new_X.shape, new_y.shape, sep='\n')

In [44]:
new_scaler = MinMaxScaler()
new_X_scaled = new_scaler.fit_transform(new_X)

In [None]:
train_x, test_x, train_bin_y, test_bin_y = train_test_split(new_X_scaled, new_y, test_size=0.2, random_state=42)
print(train_x.shape)
print(test_x.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow forest (depth-2 trees, Gini splits) on the 15-feature training set.
RF = RandomForestClassifier(max_depth=2, criterion="gini", n_estimators=100, random_state=2022)
RF.fit(X=train_x, y=train_bin_y)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the 15-feature forest on the held-out rows.
rf_y_pred = RF.predict(test_x)
print(classification_report(y_true=test_bin_y, y_pred=rf_y_pred))