In [None]:
pip install numpy

In [None]:
pip install pandas 

In [None]:
pip install gplearn

In [None]:
pip install matplotlib

In [None]:
import numpy as np
import pandas as pd
from gplearn.genetic import SymbolicRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the survival dataset (latin1-decoded, parsed in a single pass).
survival_csv_path = '/database/'
dataset_survival = pd.read_csv(survival_csv_path, encoding="latin1", low_memory=False)

# Discard the first column (selected by position), then preview ten rows.
first_column_label = dataset_survival.columns[0]
dataset_survival = dataset_survival.drop(columns=first_column_label)
dataset_survival.head(10)

In [None]:
# Names of every object-dtype column; these are the ones that get dummy-encoded.
object_columns = dataset_survival.select_dtypes(include='object')
dummy_cols = object_columns.columns.tolist()

In [None]:
# Dummy-encode the object columns as integer indicators, dropping the first
# level of each encoded column, and preview the result.
train_data = pd.get_dummies(
    dataset_survival,
    columns=dummy_cols,
    drop_first=True,
    dtype=int,
)
train_data.head(10)

In [None]:
# Bind the loaded dataset to the working name `df` as a DataFrame.
df = pd.DataFrame(data=dataset_survival)

In [None]:
# Count the missing values in each row of df.
df.isna().sum(axis="columns")

In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Replace every missing value in df with the constant 0
# (swap in another constant here if 0 is not appropriate).
df = df.fillna(value=0)

In [None]:
# Sum the per-column missing-value counts into a single total and report it.
nulls_per_column = df.isna().sum()
total_nulls = nulls_per_column.sum()
print(f"Total null values in the DataFrame: {total_nulls}")

In [None]:
# Target is the DAYS column; the features are every other column of train_data.
target_name = 'DAYS'
y = train_data[target_name]
X = train_data.drop(columns=target_name)

In [None]:
# Initialize the DecisionTreeRegressor with a fixed random_state so that
# repeated fits on the same data give the same tree.
dtree = DecisionTreeRegressor(random_state=42)

In [None]:
# Train the decision tree on the encoded features X against the target y.
dtree.fit(X=X, y=y)

In [None]:
# Get feature importances from the fitted tree (one value per column of X,
# in the same order as X.columns).
importances = dtree.feature_importances_

In [None]:
# Pair each feature name with its importance score in a two-column table.
feature_importance_df = pd.DataFrame(
    data={'Feature': X.columns, 'Importance': importances}
)

In [None]:
# Sort the DataFrame by importance in descending order and keep the first 10 rows.
# NOTE: despite its name, top_5_features holds ten features because of head(10).
top_5_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(10)

# Display the selected features
print(top_5_features)

In [None]:
# Report whether the column is present in df. The message names the column
# actually tested instead of a literal 'column_name' placeholder.
column_to_check = 'REC_PRETX_DIAL'
if column_to_check in df.columns:
    print(f"Column '{column_to_check}' exists in the DataFrame.")
else:
    print(f"Column '{column_to_check}' does not exist in the DataFrame.")


In [None]:
# Feature columns used for modelling, and the target column (kept as a
# one-element list so selections with it stay two-dimensional).
col_name = [
    'REC_CMV_STAT',
    'DON_HIST_HYPERTEN',
    'DON_RACE',
    'CAN_LAST_SRTR_PEAK_PRA',
    'REC_AGE_AT_TX_CAT',
    'REC_MED_COND.5',
    'CAN_PEPTIC_ULCER',
    'DON_HIST_DIAB',
    'REC_MED_COND.7',
    'REC_PRETX_DIAL',
]
target_col = ['DAYS']

# DataFrame selections of the features and of the single target column.
X = df[col_name]
y = df[target_col]

# NumPy versions of the same two selections.
x_num = X.to_numpy()
y_num = y.to_numpy()

In [None]:
import matplotlib.pyplot as plt

# Plot a histogram of the column(s) listed in target_col
df[target_col].hist(bins=50, figsize=(10, 5))  # Adjust bins and figsize as needed

# Label the axes and title the plot; the title now matches the x-axis unit
# (it previously said "Weeks" while the axis is labelled in days).
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.title('Histogram of Survival Time in Days')

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Filled kernel-density estimate of DAYS. sns.kdeplot(fill=True) replaces the
# deprecated sns.distplot(hist=False, kde_kws={'shade': True}) call and matches
# the kdeplot usage in the outlier-filtered plot of this notebook.
sns.kdeplot(df['DAYS'], fill=True, linewidth=3)

# Alternatively, using pandas to create a density plot
# df['DAYS'].plot(kind='density', linewidth=2)

# Set labels and title; the title now matches the x-axis unit (days).
plt.xlabel('Days')
plt.ylabel('Density')
plt.title('Density Plot of Survival Time in Days')

# Display the plot
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate Q1, Q3 and IQR of DAYS
Q1 = df['DAYS'].quantile(0.25)
Q3 = df['DAYS'].quantile(0.75)
IQR = Q3 - Q1

# Define the outlier fences at 1.5 * IQR beyond the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose DAYS value lies within the fences (both ends inclusive)
within_fences = df['DAYS'].between(lower_bound, upper_bound)
filtered_df = df[within_fences]

# Plot the filtered data; the title now matches the x-axis unit (days).
sns.kdeplot(filtered_df['DAYS'], fill=True)
plt.title('Density Plot of Survival Time in Days without Outliers')
plt.xlabel('Days')
plt.ylabel('Density')
plt.show()

In [None]:
# Alternative feature list, currently disabled:
# col_name = ['GL','REC_BMI','REC_AGE_AT_TX', 'DON_BMI', 'REC_BMI', 'DON_AGE', 'DON_RACE', 'CAN_RACE']
col_name = [
    'REC_CMV_STAT',
    'DON_HIST_HYPERTEN',
    'DON_RACE',
    'CAN_LAST_SRTR_PEAK_PRA',
    'REC_AGE_AT_TX_CAT',
    'REC_MED_COND.5',
    'CAN_PEPTIC_ULCER',
    'DON_HIST_DIAB',
    'REC_MED_COND.7',
    'REC_PRETX_DIAL',
]

# Derive the target in years from DAYS (365.0 days per year) and switch the
# target column over to it.
df['YEARS'] = df['DAYS'].div(365.0)
target_col = ['YEARS']

# Feature frame and single-column target frame (y now holds only 'YEARS').
X = df.loc[:, col_name]
y = df.loc[:, target_col]

In [None]:
# One-hot encode the feature frame X with pandas' default settings.
X_encoded = pd.get_dummies(data=X)

In [None]:
# Hold out 20% of the rows for testing; random_state=42 keeps the split repeatable.
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each of the four pieces.
tuple(part.shape for part in split_parts)

In [None]:
import pandas as pd

import zlib


def hash_non_numeric(x):
    """Map a non-numeric string to a stable integer code; pass anything else through.

    Python's built-in ``hash()`` is salted per interpreter process for ``str``
    (unless ``PYTHONHASHSEED`` is fixed), so it yields different codes after
    every kernel restart. CRC32 of the UTF-8 bytes is used instead: the same
    string always maps to the same value, and the result (0 .. 2**32 - 1) is
    exactly representable as a float64.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    object
        ``zlib.crc32`` of the UTF-8 encoded text when ``x`` is a ``str`` for
        which ``str.isnumeric()`` is False; otherwise ``x`` unchanged.

    Notes
    -----
    ``str.isnumeric()`` is False for strings such as ``"1.5"`` or ``"-3"``
    (the ``.`` and ``-`` are not numeric characters), so those are hashed too.
    """
    if isinstance(x, str) and not x.isnumeric():
        return zlib.crc32(x.encode("utf-8"))
    return x

# Columns involved in modelling: the features followed by the target.
model_columns = col_name + target_col

# Replace non-numeric strings in those columns with their hash codes.
for column in model_columns:
    df[column] = df[column].apply(hash_non_numeric)

# Coerce the feature columns to numeric dtype; any value that still cannot be
# parsed as a number becomes NaN.
for column in col_name:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Drop rows that have a NaN in any modelling column, then preview the result.
df = df.dropna(subset=model_columns)
df.head(5)



In [None]:
# NumPy arrays of the feature columns and of the single target column, taken
# from the current df.
features_frame = df[col_name]
target_frame = df[target_col]
x_num = features_frame.to_numpy()
y_num = target_frame.to_numpy()

In [None]:
from gplearn.fitness import make_fitness

def _mape(y, y_pred, w):
   
    diffs = np.abs(np.divide((np.maximum(0.001, y) - np.maximum(0.001, y_pred)),
                             np.maximum(0.001, y)))
    return 100. * np.average(diffs, weights=w)

def _mse(y, y_pred, w):
    """Return the mean squared error of y_pred against y, weighted per sample by w."""
    weighted_error = mean_squared_error(y, y_pred, sample_weight=w)
    return weighted_error
    

# Wrap the raw metric functions as gplearn fitness objects; for both of them
# a smaller value is better.
mape = make_fitness(function=_mape, greater_is_better=False)
mse = make_fitness(function=_mse, greater_is_better=False)

# Configure the symbolic regressor (it is not fitted in this cell).
est_gp = SymbolicRegressor(
    population_size=50,
    generations=1000,
    stopping_criteria=4,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    n_jobs=5,
    parsimony_coefficient=0.01,
    random_state=0,
    metric=mse,
)

In [None]:
# Fit the model.
# y_num is built by selecting with a list of column names, so it has shape
# (n_samples, 1). Passing a flattened 1-D view gives the regressor the target
# shape it expects, instead of relying on scikit-learn's input validation to
# flatten a column vector (which emits a DataConversionWarning).
op = est_gp.fit(x_num, y_num.ravel())
op

In [None]:
# Check if the model is fitted.
# gplearn only assigns `_program` inside fit(), so on an unfitted estimator the
# attribute is missing rather than None; getattr with a default avoids an
# AttributeError and lets the message branch run.
if getattr(est_gp, '_program', None) is None:
    print("Model is not fitted. Please fit the model before visualization.")
else:
    # Predict on x_num, the same array the model was fitted on
    y_pred = est_gp.predict(x_num)
    print(y_pred)


In [None]:
# Take the first two rows of df and separate their target values from their
# feature values, printing both.
sample_rows = df.head(2)
ac_predict = sample_rows[target_col]
print('acp', ac_predict)
first_two_rows = sample_rows[col_name]
print(first_two_rows)

In [None]:
# Run the fitted regressor on the two sample feature rows and show the output.
predictions = est_gp.predict(X=first_two_rows)
predictions

In [None]:
# Evaluate the model using mean squared error.
# The score gets its own name so it does not overwrite `mse`, the gplearn
# fitness object passed as `metric=` when est_gp is constructed; rebinding
# `mse` to a float would break a re-run of that cell.
# NOTE: y_pred comes from x_num, the data the model was fitted on, so this is
# an in-sample error rather than a held-out estimate.
mse_value = mean_squared_error(y_num, y_pred)
print('Mean Squared Error:', mse_value)