In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from sklearn.model_selection import train_test_split

In [7]:
radiomic_features_df = pd.read_table('Data\clinical_radiomics.tsv')

In [8]:
#Clean up
radiomic_features_df = radiomic_features_df[radiomic_features_df.OS_cens == '0']
radiomic_features_df["OS_result"] = radiomic_features_df["OS"] > 2.15 if 'long' else 'short'
radiomic_features_df = radiomic_features_df.drop(columns=["Image","Mask","type","CT_drugs","Treatment", "intent", "Metastasis_location", 
                                                          "diagnostics_Versions_PyRadiomics","diagnostics_Versions_Numpy",
                                                          "diagnostics_Versions_SimpleITK","diagnostics_Versions_PyWavelet",
                                                          "diagnostics_Versions_Python","diagnostics_Configuration_Settings","diagnostics_Configuration_EnabledImageTypes",
                                                          "diagnostics_Image.original_Hash", "diagnostics_Image.original_Dimensionality", "diagnostics_Image.original_Spacing",
                                                          "diagnostics_Image.original_Minimum", "OS_cens","diagnostics_Image.original_Minimum", "OS", "diagnostics_Mask.original_Hash",
                                                          "diagnostics_Mask.original_Spacing", "diagnostics_Mask.original_BoundingBox",	"diagnostics_Mask.original_VoxelNum", "N",
                                                          "diagnostics_Mask.original_Size",  "diagnostics_Image.original_Size", "diagnostics_Mask.original_CenterOfMassIndex",	"diagnostics_Mask.original_CenterOfMass",
                                                          "CT_effect", "zubrod_score", "T", "HT", "PLT", "RBC", "WBC", "ID", "MFS","MFS_cens","CT_cycles", "histopathology", "location",
                                                           "MIP","RT_fractional_dose", "HB"])


In [9]:
X = radiomic_features_df.loc[:,radiomic_features_df.columns != "OS_result"]  #independent columns
y = radiomic_features_df.loc[:,radiomic_features_df.columns == "OS_result"]    #target column i.e price range

df1 = pd.get_dummies(X['sex'])
X = pd.concat([X, df1], axis=1).reindex(X.index)
X.drop('sex', axis=1, inplace=True)

y = np.ravel(y)
bestfeatures = SelectKBest(score_func=mutual_info_classif, k=15)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

In [10]:
features = featureScores.nlargest(15,'Score')['Specs']

five_first_features = X[features [:5]] #first 5 features
five_ten_features = X[features [:10]] #first 10 features
fifteen_features = X[features] #15 features

In [11]:
print("\n5 features")
X_train,X_test,y_train,y_test = train_test_split(five_first_features,y,test_size = 0.25, random_state = 33)
model = LogisticRegression(random_state=0,max_iter=4000,solver="liblinear")
model.fit(X_train,y_train)
predicted_probs = model.predict_proba(X_test)
print(predicted_probs[0:3])
print(model.score(X_test,y_test))


5 features
[[0.99616897 0.00383103]
 [0.75452269 0.24547731]
 [0.55609094 0.44390906]]
0.9473684210526315


In [12]:
print("\n10 features")
X_train,X_test,y_train,y_test = train_test_split(five_ten_features,y,test_size = 0.25, random_state = 33)
model = LogisticRegression(random_state=0,max_iter=4000,solver="liblinear")
model.fit(X_train,y_train)
predicted_probs = model.predict_proba(X_test)
print(predicted_probs[0:3])
print(model.score(X_test,y_test))


10 features
[[0.99616897 0.00383103]
 [0.75452269 0.24547731]
 [0.55609094 0.44390906]]
0.9473684210526315


In [13]:
print("\n15 features")
X_train,X_test,y_train,y_test = train_test_split(fifteen_features,y,test_size = 0.25, random_state = 33)
model = LogisticRegression(random_state=0,max_iter=4000,solver="liblinear")
model.fit(X_train,y_train)
predicted_probs = model.predict_proba(X_test)
print(predicted_probs[0:3])
print(model.score(X_test,y_test))


15 features
[[0.99616897 0.00383103]
 [0.75452269 0.24547731]
 [0.55609094 0.44390906]]
0.9473684210526315
