In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier ,RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso , LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from yellowbrick.regressor import PredictionError, ResidualsPlot
import random

In [24]:
def get_fullpath (dir) :
  """Return the full path of every entry in `dir`, ordered by entry name.

  Parameters
  ----------
  dir : str
      Directory to list.

  Returns
  -------
  list of str
      `os.path.join(dir, name)` for each name from `os.listdir(dir)`,
      sorted by name.
  """
  return [os.path.join(dir , entry) for entry in sorted(os.listdir(dir))]

In [25]:
def _samples_by_feature (df , features) :
  """Return one row per sample column of `df`, restricted to `features`.

  Rows of `df` whose index label is in `features` are kept (in file order),
  the first remaining column is skipped, and the result is transposed so
  that each sample column becomes one row.
  """
  selected = df.loc[df.index.isin(features)]
  return selected.iloc[:, 1:].to_numpy().T


def get_dataset (file_pvalue ,file0 , file1, p_threshold = 0.04) :
  """Build a feature matrix and label vector from two class workbooks.

  Parameters
  ----------
  file_pvalue : str
      Excel file read with its first column as index; must contain a
      `p_value` column. Index labels whose p-value is below `p_threshold`
      are the features that are kept.
  file0, file1 : str
      Excel files read with their second column as index. As the code
      indexes them, each row is a feature and each column after the first
      remaining one is a sample. Samples from `file0` are labelled 0 and
      samples from `file1` are labelled 1.
  p_threshold : float, default 0.04
      Exclusive upper bound on `p_value` for a feature to be selected.

  Returns
  -------
  (x, y) : tuple of numpy.ndarray
      `x` has one row per sample (class 0 first, then class 1) and one
      column per selected feature; `y` is the matching 1-D label vector.
      `x` has zero columns when no feature passes the threshold.
  """
  df_pvalue = pd.read_excel(file_pvalue , index_col = 0)
  # Vectorised threshold test; NaN p-values compare False and are dropped.
  keep = (df_pvalue['p_value'] < p_threshold).to_numpy()
  features = df_pvalue.index[keep]

  # Both class files go through the same selection code path.
  class_blocks = [
      _samples_by_feature(pd.read_excel(path , index_col = 1), features)
      for path in (file0 , file1)
  ]
  x = np.vstack(class_blocks)
  y = np.concatenate([
      np.full(len(block), label, dtype = int)
      for label, block in enumerate(class_blocks)
  ])
  return (x,y)

In [26]:
def randomforest_test(x,y, times=100 ,test_radio = 0.2, estimator=None, random_state=None) :
  """Repeatedly split, fit and score a classifier on random hold-out sets.

  Parameters
  ----------
  x : array-like, one row per sample
  y : array-like of labels (flattened to 1-D)
  times : int, default 100
      Number of independent train/test repetitions.
  test_radio : float, default 0.2
      Fraction of samples held out in each repetition.
  estimator : classifier with fit/predict/score, optional
      Model to evaluate. When omitted, a notebook-level variable named
      `clf` is used if it exists (this is what earlier versions of this
      function relied on); otherwise a default RandomForestClassifier.
  random_state : int, optional
      When given, repetition k uses seed `random_state + k` for its split,
      making the sequence of splits reproducible.

  Returns
  -------
  (result, confusion_matrix_list)
      `result` is the list of hold-out accuracies; the second item is the
      list of flattened confusion matrices, one per repetition, always
      sized n_classes ** 2 (tn, fp, fn, tp for binary 0/1 labels).
  """
  if estimator is None :
    # Backward compatibility with callers that configure a global `clf`.
    estimator = globals().get('clf')
    if estimator is None :
      estimator = RandomForestClassifier()

  x = np.asarray(x)
  y = np.asarray(y).ravel()
  # Fix the label set so the confusion matrix keeps its shape even when a
  # hold-out split happens to contain a single class.
  class_labels = np.unique(y)

  result = []
  confusion_matrix_list = []
  for run in range(times) :
    seed = None if random_state is None else random_state + run
    # train_test_split shuffles on its own. The former random.shuffle calls
    # on the 2-D array duplicated rows (it swaps row views) and modified the
    # caller's data, so they are gone.
    x_train , x_test , y_train , y_test = train_test_split(
        x, y, test_size = test_radio, random_state = seed)
    estimator.fit(x_train, y_train)
    result.append(estimator.score(x_test , y_test))
    y_pred = estimator.predict(x_test)
    matrix = confusion_matrix(y_test , y_pred, labels = class_labels).ravel()
    confusion_matrix_list.append(matrix)

  return (result , confusion_matrix_list)

In [31]:
def Lasso_plus_Randomforest_test_v2 (x,y ,times = 100 , test_radio = 0.2,
                                     selection_rounds = 100, alpha = 0.1,
                                     random_state = None) :
  """Pick a Lasso feature subset, then evaluate a random forest on it.

  Phase 1 runs `selection_rounds` random splits. In each, a Lasso model is
  fitted on the training part, the features with non-zero coefficients are
  kept, and a random forest trained on those features is scored on the
  held-out part. The feature subset with the highest score wins (first one
  on ties).

  Phase 2 runs `times` fresh random splits of the data restricted to the
  winning subset and records accuracy and confusion matrix for each.

  NOTE(review): the winning subset is chosen using hold-out scores from the
  same data that phase 2 re-splits, so phase-2 accuracies are optimistic.

  Parameters
  ----------
  x : array-like, one row per sample
  y : array-like of labels (flattened to 1-D)
  times : int, default 100
      Number of phase-2 repetitions.
  test_radio : float, default 0.2
      Hold-out fraction for every split.
  selection_rounds : int, default 100
      Number of phase-1 rounds.
  alpha : float, default 0.1
      Lasso regularisation strength.
  random_state : int, optional
      When given, every split uses a distinct seed derived from it.

  Returns
  -------
  (result, features_selected_acc, confusion_matrix_list, mask, vector_list)
      result                : phase-2 accuracies
      features_selected_acc : phase-1 accuracies (rounds with at least one
                              selected feature only)
      confusion_matrix_list : flattened phase-2 confusion matrices
      mask                  : selected column indices, one list per round
      vector_list           : Lasso coefficient vectors, one per round

  Raises
  ------
  ValueError
      If no phase-1 round selected any feature.
  """
  x = np.asarray(x)
  y = np.asarray(y).ravel()
  # Fixed label set keeps every confusion matrix the same size.
  class_labels = np.unique(y)

  features_selected_acc = []
  vector_list = []
  mask = []
  best_acc = None
  best_features = None

  for round_idx in range(selection_rounds) :
    seed = None if random_state is None else random_state + round_idx
    # train_test_split already shuffles. The previous in-place
    # random.shuffle on the 2-D array duplicated rows and broke the
    # pairing between samples and labels.
    x_train , x_test , y_train , y_test = train_test_split(
        x, y, test_size = test_radio, random_state = seed)
    lasso = Lasso(alpha = alpha)
    lasso.fit(x_train, y_train)
    vector = lasso.coef_
    vector_list.append(vector)
    feature_list = [int(j) for j in np.flatnonzero(vector)]
    mask.append(feature_list)
    if not feature_list :
      continue
    clf = RandomForestClassifier()
    clf.fit(x_train[:, feature_list], y_train)
    acc = clf.score(x_test[:, feature_list], y_test)
    features_selected_acc.append(acc)
    # Remember the subset itself. `mask` also holds rounds with no features,
    # so positions in `features_selected_acc` do not line up with it.
    if best_acc is None or acc > best_acc :
      best_acc = acc
      best_features = feature_list

  if best_features is None :
    raise ValueError(
        'Lasso selected no feature in any of the %d selection rounds'
        % selection_rounds)

  x_new = x[:, best_features]
  result = []
  confusion_matrix_list = []
  for run in range(times) :
    # Offset the seed so phase 2 does not replay the phase-1 splits.
    seed = None if random_state is None else random_state + selection_rounds + run
    x_train , x_test , y_train , y_test = train_test_split(
        x_new, y, test_size = test_radio, random_state = seed)
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    result.append(clf.score(x_test , y_test))
    y_pred = clf.predict(x_test)
    matrix = confusion_matrix(y_test , y_pred, labels = class_labels).ravel()
    confusion_matrix_list.append(matrix)
  return (result ,features_selected_acc, confusion_matrix_list , mask , vector_list)

In [27]:
# Input folders (absolute paths under /content/drive, i.e. a mounted drive).
# Later cells list each folder with get_fullpath and pair the files by their
# sorted position: entry i of dir_pvalue / dir_0 / dir_1 is passed together to
# get_dataset, which labels dir_0 samples 0 and dir_1 samples 1.
# NOTE(review): pairing by position presumes the three folders hold matching
# files in the same sort order -- confirm before adding or renaming files.
dir_pvalue = r'/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/pvalue_of_1year'
dir_0 = r'/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year'
dir_1 = r'/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/more1year'

In [18]:
# Inspection only: show `parameters` and `file_0`, one per line.
# NOTE(review): both names are assigned in cells that appear further down in
# this notebook, so at this position the cell fails on a fresh
# Restart & Run All.
print(parameters, file_0, sep='\n')


['rad_MR_2D_extraction', 'rad_exampleCT', 'rad_exampleMR_3mm', 'rad_exampleMR_5mm', 'rad_exampleMR_NoResampling', 'rad_exampleVoxel', 'rad_params_yaml', 'rad_example_allShape']
['/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_MR_2D_extraction.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_exampleCT.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_exampleMR_3mm.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_exampleMR_5mm.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_exampleMR_NoResampling.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_exampleVoxel.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_example_allShape.xlsx', '/content/drive/Shareddrives/食道癌/file_arranged_for_pvalue/less1year/rad_params_yaml.xlsx']


In [28]:
# Sorted full paths of every file in the p-value, class-0 and class-1 folders.
file_pvalue , file_0 , file_1 = map(get_fullpath , (dir_pvalue , dir_0 , dir_1))

In [None]:
# Baseline experiment: for every input file triple, build the dataset from the
# p-value-filtered features, run the repeated random-forest hold-out test and
# write accuracies plus confusion-matrix cells to one workbook per input.
save_path = r'/content/drive/MyDrive/analysis_result'
# Create the output folder up front so a long run cannot fail at save time.
os.makedirs(save_path , exist_ok = True)
# Output workbook names, derived from the sorted class-0 file names.
parameters = [name.replace('.xlsx','_result.xlsx') for name in sorted(os.listdir(dir_0))]
# Iterate over as many triples as actually exist instead of a fixed count.
n_inputs = min(len(parameters), len(file_pvalue), len(file_0), len(file_1))
for i in range(n_inputs) :
  # randomforest_test picks up this notebook-level `clf`.
  clf = RandomForestClassifier(n_estimators = 200)
  x , y = get_dataset(file_pvalue[i],file_0[i],file_1[i])
  print(x.shape)
  print(y.shape)
  if x.shape[1] == 0 :
    # No feature passed the p-value filter; nothing to train on.
    continue
  test_result , matrix_list = randomforest_test(x,y)
  df_acc = pd.DataFrame({'accuracy' : test_result})
  df_confusion = pd.DataFrame(matrix_list , columns = ['tn', 'fp', 'fn', 'tp'])
  df_result = pd.concat([df_acc , df_confusion],axis = 1)
  save_fullpath = os.path.join(save_path , parameters[i])
  df_result.to_excel(save_fullpath)


In [32]:
# Lasso + random forest experiment: for every input file triple, build the
# dataset, let Lasso_plus_Randomforest_test_v2 choose a feature subset and
# evaluate it, then write accuracies plus confusion-matrix cells to one
# workbook per input.
save_path = r'/content/drive/MyDrive/analysis_result'
# Create the output folder up front so a long run cannot fail at save time.
os.makedirs(save_path , exist_ok = True)
# Output workbook names, derived from the sorted class-0 file names.
parameters = [name.replace('.xlsx','_ls&rf_result.xlsx') for name in sorted(os.listdir(dir_0))]
# Not read by this pipeline (the test function builds its own forests); kept
# so the notebook-level name `clf` stays defined as before.
clf = RandomForestClassifier(n_estimators = 200)
# Iterate over as many triples as actually exist instead of a fixed count.
n_inputs = min(len(parameters), len(file_pvalue), len(file_0), len(file_1))
for i in range(n_inputs) :
  x , y = get_dataset(file_pvalue[i],file_0[i],file_1[i])
  print(x.shape)
  print(y.shape)
  if x.shape[1] == 0 :
    # No feature passed the p-value filter; nothing to train on.
    continue
  test_result ,features_selected_acc, matrix_list , mask , vector_list = Lasso_plus_Randomforest_test_v2(x,y)
  df_acc = pd.DataFrame({'accuracy' : test_result})
  df_confusion = pd.DataFrame(matrix_list , columns = ['tn', 'fp', 'fn', 'tp'])
  df_result = pd.concat([df_acc , df_confusion],axis = 1)
  save_fullpath = os.path.join(save_path , parameters[i])
  df_result.to_excel(save_fullpath)

(173, 285)
(173,)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


(173, 251)
(173,)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


(173, 280)
(173,)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


(166, 165)
(166,)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


(173, 155)
(173,)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


(173, 1)
(173,)
(173, 10)
(173,)


  positive)
  positive)
  positive)
  positive)
  positive)


(173, 23)
(173,)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
