# RF Deployment on Validated Data only

In [4]:
import pandas as pd
import pickle

In [5]:
from sklearn.model_selection import train_test_split

# Validated (labeled) compounds; the first CSV column becomes the index
validated_data = pd.read_csv('Data/validated_data_only.csv', index_col=0)

# Feature names chosen by RFE, taken from the 'Features' column of the file
rfe_feature_table = pd.read_csv('Data/RFE_64_selected_features.csv')
selected_features = rfe_feature_table['Features'].values

# Feature matrix restricted to the selected features, and a copy of the 'Hit' labels
y = validated_data['Hit'].copy()
X = validated_data[selected_features]

In [6]:
# Discovery compounds to score; the first CSV column becomes the index
discovery_data = pd.read_csv(
    'Data/Discovery_Data.csv',
    index_col=0,
)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# imblearn modules
from imblearn.pipeline import Pipeline as Pipeline
from imblearn.over_sampling import RandomOverSampler

# Single seed shared by the over-sampler and the forest. Both steps are
# stochastic; without a seed every re-run of this cell fits a different model
# and therefore yields different predictions.
RANDOM_STATE = 0

# Hard-coded hyperparameters for the random forest
best_params = {'ccp_alpha': 0.006370568744783853,
 'class_weight': 'balanced_subsample',
 'criterion': 'entropy',
 'max_depth': 15,
 'max_features': 0.1,
 'max_leaf_nodes': 597,
 'min_impurity_decrease': 0.0,
 'min_samples_split': 8,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 288,
 'random_state': RANDOM_STATE,
 }


# Pipeline: standardize features -> randomly over-sample -> random forest.
# The imblearn Pipeline is used so the sampler can sit between the scaler and the model.
rf_pipeline = Pipeline([
    # ("scaler", MinMaxScaler()),
    ("scaler", StandardScaler()),
    ("sampler", RandomOverSampler(sampling_strategy='auto', random_state=RANDOM_STATE)),
    ("model",RandomForestClassifier(**best_params,n_jobs=-1)),
]
)

# Fit on the full validated data set (no hold-out split is made here)
rf_pipeline.fit(X,y)

In [None]:
# Persisting the fitted pipeline to disk is disabled in this cell.
# NOTE(review): presumably left commented out so that a re-run does not overwrite
# an earlier pickle — confirm before re-enabling.
# with open('Fitted RF Model on Validated Data.pickle','wb') as f:
#     pickle.dump(rf_pipeline, f)

"""
July 23, 2024: I reran the code, so the predictions are slightly different now due to the random nature of Random Forest; 

Use the pickle from the first time it was deployed
"""

'\nJuly 23: I reran the code, so the predictions are slightly different now due to the random nature of Random Forest; \n\nUse the pickle from the first time it was deployed\n'

# Deployment

In [10]:
# Score the discovery compounds with the fitted pipeline, using only the selected features
discovery_features = discovery_data[selected_features]

discovery_y_pred = rf_pipeline.predict(discovery_features)                    # class labels
discovery_y_proba = rf_pipeline.predict_proba(discovery_features)             # per-class probabilities
discovery_y_log_proba = rf_pipeline.predict_log_proba(discovery_features)     # per-class log-probabilities

In [11]:
# Wrap the raw prediction arrays in Series indexed like the discovery data.
# Column 1 of the probability arrays is kept (presumably the Hit == True class — confirm
# against rf_pipeline.classes_).
discovery_index = discovery_data.index.values

discovery_y_pred = pd.Series(discovery_y_pred, name='y_pred', index=discovery_index)
discovery_y_proba = pd.Series(discovery_y_proba[:, 1], name='y_proba', index=discovery_index)
# The log-probability series also carries the name 'y_proba'; later cells refer to it by that name.
discovery_y_log_proba = pd.Series(discovery_y_log_proba[:, 1], name='y_proba', index=discovery_index)

In [12]:
# Side-by-side table of predicted label and predicted probability per compound
prediction_columns = [discovery_y_pred, discovery_y_proba]
predictions = pd.concat(prediction_columns, axis=1)

### Examining the Predictions

In [13]:
# Rank compounds from highest to lowest predicted probability, then show the table
predictions = predictions.sort_values('y_proba', ascending=False)
display(predictions)

Unnamed: 0,y_pred,y_proba
Decursinol angelate,True,0.887183
Beclometasone,True,0.871596
Caudatin,True,0.856374
Brevilin A,True,0.855375
Microhelenin C,True,0.855375
...,...,...
Cyclo(-RGDfK) TFA,False,0.085186
Pyrotinib dimaleate,False,0.080404
Pyridostatin TFA,False,0.071923
Cyclo(RGDyK),False,0.069211


In [14]:
# Same table as `predictions`, but with the negated log-probability in the 'y_proba' column
neg_log_proba = -discovery_y_log_proba
neg_log_predictions = pd.concat([discovery_y_pred, neg_log_proba], axis=1)
neg_log_predictions = neg_log_predictions.sort_values(by='y_proba', ascending=False)
neg_log_predictions

Unnamed: 0,y_pred,y_proba
Nemifitide acetate(173240-15-8 free base),False,2.842204
Cyclo(RGDyK),False,2.670597
Pyridostatin TFA,False,2.632161
Pyrotinib dimaleate,False,2.520690
Cyclo (-RGDfK),False,2.462923
...,...,...
Brevilin A,True,0.156215
Microhelenin C,True,0.156215
Caudatin,True,0.155048
Beclometasone,True,0.137429


In [15]:
import plotly.express as px

# One bar per discovery compound (in the sorted order of `predictions`),
# colored by the predicted class
fig = px.bar(
    predictions,
    y='y_proba',
    color='y_pred',
    color_discrete_sequence=['orange', '#42b6f5'],
    title="Predicted Probabilities",
    labels={'y_proba': 'Probabilities', 'y_pred': '>50% proba'},
)
fig.show()

In [16]:
import plotly.express as px

# One bar per discovery compound showing the negated log-probability.
# The 'y_proba' column of `neg_log_predictions` holds -log(probability), so the
# axis is labeled accordingly rather than as a plain probability.
fig = px.bar(
    neg_log_predictions,
    y='y_proba',
    color='y_pred',
    color_discrete_sequence=['orange', '#42b6f5'],
    title="Predicted -Log Probabilities",
    labels={'y_proba': '-Log Probabilities', 'y_pred': '>50% proba'},
)
fig.show()

In [17]:
import plotly.express as px

# Distribution of the negated log-probabilities, split by predicted class.
# `x` is given explicitly so the histogram is built from the 'y_proba' column
# (which holds -log(probability) in this frame) and the axis label below is applied.
fig = px.histogram(
    neg_log_predictions,
    x='y_proba',
    color='y_pred',
    color_discrete_sequence=['orange', '#42b6f5'],
    title="Predicted -Log Probabilities",
    labels={'y_proba': '-Log Probabilities', 'y_pred': '>50% (normal) proba'},
)
fig.show()

In [None]:
# Saving the predictions
# (both exports are disabled; uncomment to write the `predictions` table to disk)
# NOTE(review): only the Excel export targets the Results/ folder — confirm whether
# the CSV export should be written there as well.

# predictions.to_excel('Results/Predicted Probabilities from RF Trained on Full Validated Data Only.xlsx')
# predictions.to_csv('Predicted Probabilities from RF Trained on Full Validated Data Only.csv')

# UMAP of the predictions (on new data) + old labeled hits

Prepare data for UMAP

In [18]:
# Scale the discovery data for UMAP
# NOTE(review): the discovery and validated sets are standardized with two separately
# fitted scalers before being combined — confirm this is intended.
discovery_data = discovery_data[selected_features]  # keep only the selected features
discovery_scaled_values = StandardScaler().fit_transform(discovery_data)
discovery_data_scaled = pd.DataFrame(
    discovery_scaled_values,
    index=discovery_data.index.values,
    columns=discovery_data.columns,
)

# Get and scale the labeled validated data for UMAP
X_validated = validated_data[selected_features]  # validated_data was loaded in an earlier cell
validated_scaled_values = StandardScaler().fit_transform(X_validated)
X_validated_scaled = pd.DataFrame(
    validated_scaled_values,
    index=X_validated.index.values,
    columns=X_validated.columns,
)

Combine the datasets so they can be mapped onto the UMAP projection together

In [19]:
# Stack the scaled discovery rows on top of the scaled validated rows
frames_to_stack = [discovery_data_scaled, X_validated_scaled]
combined_data = pd.concat(frames_to_stack, axis=0)

Apply UMAP

In [20]:
import plotly.express as px
import plotly.graph_objs as go
import umap

# Initialize UMAP
n_neighbors=13
# random_state is fixed so the embedding is reproducible. UMAP forces n_jobs to 1
# (and emits a warning) whenever a seed is set, so n_jobs=1 is requested explicitly
# to get the same single-threaded run without the warning.
umap_model = umap.UMAP(n_neighbors=n_neighbors, n_components=2, n_jobs=1, random_state=0)

# UMAP on combined data: fit one 2-D embedding for discovery + validated rows together
combined_data_2d = umap_model.fit_transform(combined_data[selected_features])
# Columns 0 and 1 hold the two UMAP coordinates; rows keep the combined index
combined_data_2d = pd.DataFrame(combined_data_2d,index=combined_data.index)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Add the new prediction labels and the old validated labels

In [30]:
# Attach label columns to the 2-D embedding; pandas aligns each Series on the index,
# so rows without a matching entry are left as NaN.
label_columns = {
    'Predictions': discovery_y_pred,           # predicted class for the discovery rows
    "Probabilities": predictions["y_proba"],   # predicted probability for the discovery rows
    'Hit': validated_data['Hit'],              # Hit label of the validated rows
}
for column_name, column_values in label_columns.items():
    combined_data_2d[column_name] = column_values


In [31]:
### Split the combined frame into groups so each one is drawn as its own colored trace

# new (discovery) rows: the only rows that carry a prediction
has_prediction = combined_data_2d['Predictions'].notna()
c_new_data = combined_data_2d.loc[has_prediction]

# new rows predicted to be hits
c_new_data_positives = combined_data_2d.loc[combined_data_2d['Predictions'] == True]

# old (validated) rows labeled as hits
c_old_positives = combined_data_2d.loc[combined_data_2d['Hit'] == True]

# old (validated) rows labeled as non-hits
c_old_negatives = combined_data_2d.loc[combined_data_2d['Hit'] == False]

# (rows, legend name, marker style) for each trace, listed in drawing order
trace_specs = [
    (c_new_data, 'New data',
     dict(color='steelblue', size=5, opacity=0.4)),
    (c_new_data_positives, 'Predicted hits',
     dict(color='pink', size=5, opacity=0.4)),
    (c_old_positives, 'Old labeled positives',
     dict(color='gold', size=5, line=dict(width=1, color='DarkSlateGrey'))),
    (c_old_negatives, 'Old validated negatives',
     dict(color='black', size=5)),
]

# Plot with Plotly: one scatter trace per group, hover text is the row index
fig = go.Figure()
for group_rows, legend_name, marker_style in trace_specs:
    fig.add_trace(go.Scatter(
        x=group_rows[0],
        y=group_rows[1],
        mode='markers',
        marker=marker_style,
        name=legend_name,
        text=group_rows.index.values
    ))

fig.update_layout(
    title='RF Trained on Validated Data Only<br>(UMAP Projection of New data + old validated data)<br>',
    xaxis_title='UMAP 1',
    yaxis_title='UMAP 2',
    showlegend=True,
    width=1000,
    height=900
)

fig.show()


### Looking at the top 100 predicted hits on the map only

In [39]:
# The 100 compounds with the highest predicted probability
predictions.nlargest(n=100, columns='y_proba')

Unnamed: 0,y_pred,y_proba
Decursinol angelate,True,0.887183
Beclometasone,True,0.871596
Caudatin,True,0.856374
Brevilin A,True,0.855375
Microhelenin C,True,0.855375
...,...,...
Methylnaltrexone bromide,True,0.723020
Nylestriol,True,0.721827
SJ000291942,True,0.721336
Rosmanol,True,0.718968


In [41]:
### Separating the different groups into separate dataframes for plotting different colors

# Number of top-ranked new compounds to highlight as predicted hits
TOP_N = 100

# Lowest predicted probability among the TOP_N highest-scoring compounds.
# Derived from `predictions` instead of a hard-coded cutoff so the selection
# stays correct when the model (and hence the probabilities) changes.
top_n_threshold = predictions['y_proba'].nlargest(TOP_N).min()

# extracting the new data from the combined df
c_new_data = combined_data_2d[~combined_data_2d['Predictions'].isna()] # only the new data has predictions on them

# extracting the new data from the combined df: only the TOP_N highest probabilities
# (rows without a probability compare as False and are excluded; ties at the
# cutoff are all kept)
c_new_data_positives = combined_data_2d[combined_data_2d['Probabilities'] >= top_n_threshold]

# extracting the old data from the combined df (positives)
c_old_positives = combined_data_2d[combined_data_2d['Hit']==True]

# extracting the old data from the combined df (validated negatives)
c_old_negatives = combined_data_2d[combined_data_2d['Hit']==False]

# (rows, legend name, marker style) for each trace, listed in drawing order
trace_specs = [
    (c_new_data, 'New data',
     dict(color='steelblue', size=5, opacity=0.4)),
    (c_new_data_positives, 'Predicted hits',
     dict(color='pink', size=5, opacity=0.6)),
    (c_old_positives, 'Old labeled positives',
     dict(color='gold', size=5, line=dict(width=1, color='DarkSlateGrey'))),
    (c_old_negatives, 'Old validated negatives',
     dict(color='black', size=5)),
]

# Plot with Plotly: one scatter trace per group, hover text is the row index
fig = go.Figure()
for group_rows, legend_name, marker_style in trace_specs:
    fig.add_trace(go.Scatter(
        x=group_rows[0],
        y=group_rows[1],
        mode='markers',
        marker=marker_style,
        name=legend_name,
        text=group_rows.index.values
    ))


fig.update_layout(
    title='RF Trained on Validated Data Only<br>(UMAP Projection of New data + old validated data)<br>',
    xaxis_title='UMAP 1',
    yaxis_title='UMAP 2',
    showlegend=True,
    width=1000,
    height=900
)

fig.show()
