In [1]:
# Import librairies

import warnings

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler

warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [9]:
# Remote location of the INSEE dataset (file insee_clean.csv)
insee_url = 'https://medical-deserts-project.s3.eu-north-1.amazonaws.com/insee_clean.csv'

# Parse the comma-separated, UTF-8 encoded file into a DataFrame and show its dimensions
insee_df = pd.read_csv(insee_url, encoding='utf-8', sep=',')
insee_df.shape

(38590, 90)

In [10]:
# Drop the two age-capped APL variants; only the uncapped APL column is kept
insee_df = insee_df.drop(
    columns=[
        "APL aux médecins généralistes de 65 ans et moins",
        "APL aux médecins généralistes de 62 ans et moins",
    ]
)

# pop() removes the column and the assignment re-inserts it, which moves it to the last position
apl_values = insee_df.pop("APL aux médecins généralistes (sans borne d'âge)")
insee_df["APL aux médecins généralistes (sans borne d'âge)"] = apl_values

# Give the column a short name
insee_df = insee_df.rename(columns={"APL aux médecins généralistes (sans borne d'âge)": "APL"})
insee_df.shape

(38590, 88)

In [11]:
# Remove rows that are exact duplicates (the first occurrence is kept), then check the new size
insee_df = insee_df.drop_duplicates(keep='first')
insee_df.shape

(34760, 88)

In [12]:
# The correlation matrix is computed on the float64 / int64 columns only
numeric_columns = insee_df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlations, rounded to two decimals for the heatmap annotations
corr_matrix = numeric_columns.corr().round(2)

# Annotated heatmap with one labelled row and column per numeric variable
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
)

# Oversized canvas so that the annotations of this large matrix do not overlap
fig.update_layout(width=3200, height=2400)
fig.show()

In [7]:
#to_drop = ["Nb Entreprises Secteur Services", "Nb Entreprises Secteur Commerce", "Nb Ménages", "Nb Résidences Principales", "Nb Occupants Résidence Principale", "Nb Création Commerces", "Nb Création Enteprises", "PIB Régionnal", "Nb de Commerce", "Nb Santé, action sociale", "Population en 2014 (princ)", "Pop 60-74 ans en 2014 (princ)", "Pop 75-89 ans en 2014 (princ)", "Nb Logement Secondaire et Occasionnel"]
# Remove columns to be dropped
#insee_df = insee_df.drop(columns=to_drop)

In [13]:
# Columns removed before re-computing the correlations
to_drop = [
    "Nb Entreprises Secteur Services",
    "Nb Entreprises Secteur Commerce",
    "Nb Ménages",
    "Nb Résidences Principales",
    "Nb Occupants Résidence Principale",
    "Nb Création Commerces",
    "Nb Création Enteprises",
    "PIB Régionnal",
    "Nb de Commerce",
    "Nb Santé, action sociale",
    "Population en 2014 (princ)",
    "Pop 60-74 ans en 2014 (princ)",
    "Pop 75-89 ans en 2014 (princ)",
    "Nb Logement Secondaire et Occasionnel",
]

# Numeric columns of the dataset, minus the columns listed above (insee_df itself is not modified here)
numeric_columns_filtered = (
    insee_df
    .select_dtypes(include=['float64', 'int64'])
    .drop(to_drop, axis=1)
)

# Correlations of the remaining columns, rounded to two decimals
corr_matrix_filtered = numeric_columns_filtered.corr().round(2)

print("New correlation matrix:")
print(corr_matrix_filtered)

# Annotated heatmap of the reduced matrix
fig = ff.create_annotated_heatmap(
    z=corr_matrix_filtered.values,
    x=corr_matrix_filtered.columns.tolist(),
    y=corr_matrix_filtered.index.tolist(),
)

fig.update_layout(width=2800, height=1500)
fig.show()

New correlation matrix:
                                                Dynamique Entrepreneuriale  \
Dynamique Entrepreneuriale                                            1.00   
Dynamique Entrepreneuriale Service et Commerce                        0.99   
Synergie Médicale COMMUNE                                             0.94   
Nb Omnipraticiens BV                                                 -0.03   
Nb Infirmiers Libéraux BV                                            -0.02   
...                                                                    ...   
Pop 15 ans ou plus Ouvriers en 2014 (compl)                           0.90   
Pop 15 ans ou plus Retraités  en 2014 (compl)                         0.95   
Pop 15 ans ou plus Autres en 2014 (compl)                             0.94   
taux chômage(15-64 ans)                                               0.13   
APL                                                                   0.12   

                                       

In [14]:
# Apply the removal of the to_drop columns to the working dataset itself
insee_df = insee_df.drop(to_drop, axis=1)

In [15]:
# Sanity check: dimensions of the dataset after the to_drop columns were removed
insee_df.shape

(34760, 74)

In [16]:
# Target: the APL column; features: every other column of the dataset
y = insee_df["APL"]
X = insee_df.drop(columns="APL")


In [17]:
# Classify every feature from the name of its dtype: anything whose dtype name
# contains 'float' or 'int' is numeric, everything else is treated as categorical
is_numeric_dtype = [('float' in str(dtype)) or ('int' in str(dtype)) for dtype in X.dtypes]

numeric_features = [column for column, numeric in zip(X.columns, is_numeric_dtype) if numeric]
categorical_features = [column for column, numeric in zip(X.columns, is_numeric_dtype) if not numeric]

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Nb pharmaciens Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'Reg Moyenne Salaires Employé H

In [18]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible.
# (train_test_split is imported once, in the import cell at the top of the notebook.)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


In [20]:
# Categorical columns: one-hot encoding. The first level of each column is dropped,
# and levels unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))])

# Numeric columns: standardisation
numeric_transformer = StandardScaler()

# Route each group of columns (detected from the dtypes of X) to its transformer
preprocessor = ColumnTransformer(
        transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [21]:
# Fit the preprocessor on the training features only, then transform them.
# NOTE(review): X_train / X_test are overwritten by their transformed versions here,
# so re-running this cell without re-running the train/test split first feeds
# already-transformed data to the preprocessor.
X_train = preprocessor.fit_transform(X_train)
# Apply the already-fitted preprocessor to the test features (no second fit)
X_test = preprocessor.transform(X_test)

# Display the transformed training matrix
X_train

array([[-0.02231907,  0.0064759 , -0.00263147, ...,  0.        ,
         1.        ,  0.        ],
       [-0.19154912, -0.15064951, -0.09054271, ...,  0.        ,
         0.        ,  1.        ],
       [-0.202126  , -0.148864  , -0.25284038, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.14426677, -0.1274378 ,  0.95762974, ...,  0.        ,
         1.        ,  0.        ],
       [-0.16775114, -0.13100884,  0.18671581, ...,  0.        ,
         1.        ,  0.        ],
       [-0.18626068, -0.13993642, -0.3069396 , ...,  0.        ,
         1.        ,  0.        ]])

In [22]:
# Baseline random forest with default hyper-parameters.
# The seed is fixed so that the scores below are identical from one run to the next.
rf = RandomForestRegressor(random_state=0)

In [23]:
# Train the baseline forest on the preprocessed training features
rf.fit(X_train, y_train)

In [24]:
# R^2 of the baseline forest on the data it was trained on and on the held-out data
baseline_train_r2 = rf.score(X_train, y_train)
baseline_test_r2 = rf.score(X_test, y_test)
print("R2 score on training set : ", baseline_train_r2)
print("R2 score on test set : ", baseline_test_r2)

R2 score on training set :  0.912866883506766
R2 score on test set :  0.40418689908739636


In [None]:
#R2 score on training set :  0.9243368812005699
#R2 score on test set :  0.45301035320888927

In [43]:
#import joblib
#joblib.dump(rf, "/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RF.pkl")
#joblib.load("/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RF.pkl")

['/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RF.pkl']

In [31]:
# Hyper-parameter search for the random forest.
# The seed is fixed so that the search gives the same result from one run to the next.
rf = RandomForestRegressor(random_state=0)

print("Grid search...")

# Grid of values to be tested: 3 depths x 3 forest sizes = 9 candidates
params = {
     'max_depth': [14, 16, 18],
     'min_samples_split': [2],
     'n_estimators': [60, 70, 80],
     'min_samples_leaf': [2] }

# cv: the number of folds used for cross-validation
gridsearch = GridSearchCV(rf, param_grid=params, cv=3, verbose=2)
gridsearch.fit(X_train, y_train)
print("...Done.")

print("Best hyperparameters : ", gridsearch.best_params_)
# No `scoring` is given, so the estimator's own score is used: for a regressor that is
# R^2, not an accuracy. best_score_ is its mean over the validation folds.
print("Best validation R2 score : ", gridsearch.best_score_)



Grid search...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=60; total time=  27.8s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=60; total time=  26.6s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=60; total time=  27.7s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=  31.7s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=  31.9s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=  32.0s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=80; total time=  36.1s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=80; total time=  35.9s
[CV] END max_depth=14, min_samples_leaf=2, min_samples_split=2, n_estimators=80; total time=  35.5s
[CV] END max_depth=16, mi

In [None]:
#Best hyperparameters :  {'max_depth': 14, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 80}
#Best validation accuracy :  0.3914783302489721
# Perform grid search


In [32]:
# R^2 of the best model found by the grid search, on the training and on the test data
tuned_train_r2 = gridsearch.score(X_train, y_train)
tuned_test_r2 = gridsearch.score(X_test, y_test)
print("R2 score on training set : ", tuned_train_r2)
print("R2 score on test set : ", tuned_test_r2)

R2 score on training set :  0.8038804532618741
R2 score on test set :  0.40011368458260765


In [None]:
# 
#R2 score on training set :  0.6867990401991656
#R2 score on test set :  0.40804501003395977

In [42]:
#import joblib
#joblib.dump(gridsearch, "/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RFGS.pkl")

['/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RFGS.pkl']

In [33]:
# Predict with the grid search's best model on the training data
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Same on the held-out data
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# Mean absolute errors, plus the spread of the test target as a scale of comparison
train_mae = mean_absolute_error(y_train, Y_train_pred)
test_mae = mean_absolute_error(y_test, Y_test_pred)
test_target_std = y_test.std()

print("Mean Absolute Error on training set : ", train_mae)
print()
print("Mean Absolute Error on test set : ", test_mae)
print("Standard-deviation on test set : ", test_target_std)

Predictions on training set...
...Done.
[4.34127475 3.1777289  2.58261859 ... 2.57657811 3.62463124 3.29204212]

Predictions on test set...
...Done.
[2.71324374 3.48549072 2.46462752 ... 3.04651763 3.5475291  3.9747504 ]

Mean Absolute Error on training set :  0.42683408076862006

Mean Absolute Error on test set :  0.7366971623255404
Standard-deviation on test set :  1.2669167974941518


The mean absolute error on the test set is about 0.74, for a target whose standard deviation is about 1.27: this is NOT a good predictive model.

In [34]:
# Rebuild the names of the columns produced by the fitted ColumnTransformer
column_names = []
for transformer_name, fitted_transformer, input_columns in preprocessor.transformers_:
    if transformer_name != 'num':
        # categorical branch: the encoder creates new columns, so ask it for their names
        produced_names = fitted_transformer.get_feature_names_out()
    else:
        # numeric branch: columns are kept one-to-one, so the input names still apply
        produced_names = input_columns
    column_names.extend(produced_names)

print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Dynamique Entrepreneuriale', 'Dynamique Entrepreneuriale Service et Commerce', 'Synergie Médicale COMMUNE', 'Nb Omnipraticiens BV', 'Nb Infirmiers Libéraux BV', 'Nb dentistes Libéraux BV', 'Nb pharmaciens Libéraux BV', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb propriétaire', 'Nb Logement', 'Nb Résidences Secondaires', 'Nb Log Vacants', 'Nb Entreprises Secteur Construction', 'Nb Entreprises Secteur Industrie', 'Nb Création Industrielles', 'Nb Création Construction', 'Nb Création Services', 'Moyenne Revenus Fiscaux Départementaux', 'Moyenne Revenus Fiscaux Régionaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Cadre Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'R

In [35]:
# Create a pandas DataFrame
feature_importance = pd.DataFrame(index = column_names, data = gridsearch.best_estimator_.feature_importances_, columns=["feature_importances"])
feature_importance = feature_importance.sort_values(by = 'feature_importances')

In [36]:
# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(height=2000, width=2500, showlegend = False, margin = {'l': 50}, ) # to avoid cropping of column names
fig.show()

According to the forward selection algorithm, the following features should be kept: 
['Dynamique Entrepreneuriale', 'Densité Médicale BV', 'Score équipement de santé BV', 'Indice Démographique', 'Nb Log Vacants', 'Moyenne Revenus Fiscaux Départementaux', 'Dep Moyenne Salaires Horaires', 'Dep Moyenne Salaires Prof Intermédiaire Horaires', 'Dep Moyenne Salaires Employé Horaires', 'Dep Moyenne Salaires Ouvrié Horaires', 'Reg Moyenne Salaires Horaires', 'Reg Moyenne Salaires Cadre Horaires', 'Reg Moyenne Salaires Prof Intermédiaire Horaires', 'Reg Moyenne Salaires Employé Horaires', 'Valeur ajoutée régionale', 'Nb Hotel', 'Capacité Hotel', 'Nb Camping', 'Taux Propriété', 'Dynamique Démographique INSEE', 'Capacité Fisc', 'Moyenne Revnus fiscaux', 'Nb Education, santé, action sociale', 'Score Croissance Entrepreneuriale', 'latitude', 'longitude', 'Pop 15-29 ans en 2014 (princ)', 'Pop 15 ans ou plus Agriculteurs exploitants en 2014 (compl)', 'Pop 15 ans ou plus Ouvriers en 2014 (compl)', 'Po

In [None]:
# Columns retained for the reduced correlation analysis
# NOTE(review): the correlation output earlier in this notebook shows a column named
# "Pop 15 ans ou plus Retraités  en 2014 (compl)"; the "Retraités" entry below does not
# match it, and some entries name columns that were dropped earlier — confirm the list.
to_keep = ["APL", "Dynamique Entrepreneuriale Service et Commerce", "latitude", "Densité Médicale BV", "Taux Propriété", "longitude", "Synergie Médicale COMMUNE", "taux chômage(15-64 ans)", "Nb Résidences Secondaires", \
            "Pop 60-74 ans en 2014", "Capacité Fisc", "Pop 15 ans ou plus Prof. intermédiaires en 2014 (compl)", "Pop 15 ans ou plus Employés en 2014 (compl)", "Nb Omnipraticiens BV", "Nb Log Vacants", "Pop 15-29 ans en 2014 (princ)", \
            "Reg Moyenne Salaires Employé Horaires", "Pop 15 ans ou Retraités en 2014 (compl)", "Pop 15 ans ou plus Agriculteurs exploitants en 2014 (compl)", "Nb Infirmiers Libéraux BV", "Pop 15 ans ou plus Cadres, Prof. intel. sup. en 2014 (compl)", \
            "Pop 75-89 ans en 2014 (princ)", "Pop 45-59 ans en 2014 (princ)", "Capacité Hotel", "Pop 15 ans ou plus Ouvriers en 2014 (compl)", "Pop 0-14 ans en 2014 (compl)", "Nb de Services aux particuliers", "Pop 30-44 ans en 2014 (princ)", \
            "Moyenne Revnus fiscaux", "Dep Moyenne Salaires Cadre Horaires", "Nb propriétaire", "Pop 15 ans ou plus Autres en 2014 (compl)", "Dep Moyenne Salaires Employé Horaires", "Pop 15 ans ou plus Artisans, Comm., Chef entr. en 2014 (compl)", \
            "Nb Entreprises Secteur Construction", "Dep Moyenne Salaires Prof Intermédiaire Horaires"]

# isin() silently ignores requested names that do not exist in the frame,
# so report them instead of letting a typo drop a column unnoticed
missing_from_dataset = [column for column in to_keep if column not in insee_df.columns]
if missing_from_dataset:
    print("Columns listed in to_keep but missing from insee_df: ", missing_from_dataset)

# Keep only the requested columns that exist (the original column order is preserved)
insee_df = insee_df.loc[:, insee_df.columns.isin(to_keep)]


#Select only numerical columns for the correlation matrix
numeric_columns = insee_df.select_dtypes(include=['float64', 'int64'])

# Calculate the correlation matrix
corr_matrix = numeric_columns.corr().round(2)

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                   x=corr_matrix.columns.tolist(),
                                   y=corr_matrix.index.tolist())

fig.update_layout(height=1200, width=1800)
fig.show()

In [14]:
#import joblib
#rfgs = joblib.load("/Users/wenhajindomeni/Desktop/JEDHA/FULLSTACK/FINAL_PROJECT/Medical_deserts/RFGS.pkl")

In [25]:
# Forest used inside the sequential feature selection.
# The seed is fixed so that the selected features are the same from one run to the next.
model = RandomForestRegressor(max_depth = 14, min_samples_leaf = 2, min_samples_split= 2, n_estimators = 80, random_state = 0)

In [26]:
# Forward sequential selection of 10 columns of the preprocessed training matrix
feature_selector =  SequentialFeatureSelector(model, n_features_to_select = 10)
feature_selector.fit(X_train, y_train)

# The selector was fitted on the *preprocessed* matrix, so its support mask has one entry
# per transformed column (one-hot columns included), not one per column of X.
# Name the transformed columns first, then map each selected one back to its source column.
transformed_names = preprocessor.get_feature_names_out()
selected_transformed_names = transformed_names[feature_selector.support_]

selected_source_columns = set()
for transformed_name in selected_transformed_names:
    if transformed_name.startswith("num__"):
        # numeric columns are named "num__<column>"
        selected_source_columns.add(transformed_name[len("num__"):])
    elif transformed_name.startswith("cat__"):
        # one-hot columns are named "cat__<column>_<category>"; the longest matching
        # column name wins when one column name is a prefix of another
        encoded_name = transformed_name[len("cat__"):]
        candidates = [col for col in categorical_features if encoded_name.startswith(f"{col}_")]
        source_column = max(candidates, key=len, default=None)
        if source_column is not None:
            selected_source_columns.add(source_column)

# Selected source columns, in the column order of X
features_list = X.columns
best_features = features_list[features_list.isin(selected_source_columns)]
print("According to the forward selection algorithm, the following features should be kept: ")
print(best_features.tolist())

In [None]:
# Restrict the feature frame to the columns retained by the forward selection
X_best = X.loc[:, best_features]

In [None]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X_best, y, test_size=0.2, random_state=0)
print("...Done.")
print()

# Preprocessing
# NOTE(review): StandardScaler only accepts numeric input; this assumes every column
# of X_best is numeric — confirm.
print("Preprocessing X_train...")
print(X_train.head())
print()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
print("...Done!")
print(X_train[0:5,:]) # X_train is now a numpy array

print("Preprocessing X_test...")
print(X_test.head())
print()
X_test = scaler.transform(X_test) # don't fit again !
print("...Done!")
print(X_test[0:5,:]) # X_test is now a numpy array

# Train model (seed fixed so that the scores below are reproducible)
print("Train model...")
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, Y_train)
print("...Done.")

# Print R^2 scores
print("R2 score on training set : ", regressor.score(X_train, Y_train))
print("R2 score on test set : ", regressor.score(X_test, Y_test))

In [None]:
# Names of the columns produced by the fitted preprocessor.
# The cells below rely on these names carrying a "num__" or "cat__" prefix.
column_names = preprocessor.get_feature_names_out()

In [None]:
# Positions of the columns retained by the feature selector
selected_feature_indices = feature_selector.get_support(indices=True)

# Look up the transformed-column name found at each retained position
selected_column_names = []
for position in selected_feature_indices:
    selected_column_names.append(column_names[position])

# Print the names of the selected features
print("The names of the selected features are:", selected_column_names)

In [None]:
# Map every selected transformed column back to the source column it comes from,
# keeping each source column once, in order of first appearance
final_features = []

for element in selected_column_names:
    if element.startswith("num__"):
        # "num__<column>": the source column is whatever follows the prefix
        source_column = element[len("num__"):]
    elif element.startswith("cat__"):
        # "cat__<column>_<category>": match against the known categorical columns,
        # because cutting at the last underscore is wrong whenever the category
        # value itself contains an underscore
        encoded_name = element[len("cat__"):]
        candidates = [col for col in categorical_features if encoded_name.startswith(f"{col}_")]
        if candidates:
            source_column = max(candidates, key=len)
        else:
            # fallback: cut at the last underscore (previous behaviour)
            source_column = encoded_name.rpartition('_')[0] or encoded_name
    else:
        # names without a known prefix are ignored
        continue

    if source_column not in final_features:
        final_features.append(source_column)

# Print the filtered list
print("Filtered List:", final_features)

In [None]:
# Selected features plus the target. The target column was renamed to "APL" at the
# beginning of the notebook, so its original long name no longer exists in insee_df.
# .copy() makes insee_best an independent frame that can be modified safely afterwards.
insee_best = insee_df[final_features + ["APL"]].copy()

In [None]:
# Size of the reduced dataset
row_count, column_count = insee_best.shape
print('Number of rows :', row_count)
print('Number of columns :', column_count)
print()

# Preview of the first rows
print('First rows of the dataset :')
display(insee_best.head())
print()

# Descriptive statistics over all columns, whatever their dtype
print('Basics statistics :')
summary_stats_all = insee_best.describe(include='all')
display(summary_stats_all)
print()

# Share of missing values per column, in percent, rounded to two decimals
missing_percentages = insee_best.isna().mean().mul(100).round(2)
print('Percentage of missing values: ')
print(missing_percentages)

In [None]:
# Give the target column its short name (no effect if it is already called "APL").
# Reassigning instead of using inplace=True avoids modifying a frame derived from insee_df in place.
insee_best = insee_best.rename(columns={"APL aux médecins généralistes (sans borne d'âge)": "APL"})

In [None]:
# Target Y is the APL column; features X are all the other columns of insee_best
target_variable = "APL"

Y = insee_best[target_variable]
X = insee_best.drop(columns=target_variable)

X.head()

In [None]:
# Divide dataset Train set & Test set

print("Dividing into train and test sets...")
X_train_best, X_test_best, Y_train_best, Y_test_best = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")

# The original preprocessor was configured with every column of the full dataset and
# cannot be applied to this reduced frame (most of its columns are absent here).
# Build a dedicated one, restricted to the columns that are actually present.
best_numeric_features = [col for col in numeric_features if col in X.columns]
best_categorical_features = [col for col in categorical_features if col in X.columns]

preprocessor_best = ColumnTransformer(
        transformers=[
        ('num', StandardScaler(), best_numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), best_categorical_features)])

# Fit and transform X_train
X_train_best = preprocessor_best.fit_transform(X_train_best)

# Apply on X_test
X_test_best  = preprocessor_best.transform(X_test_best)

# Visualize X_train
X_train_best

In [None]:
# Random forest with fixed hyper-parameters (no search is run here), trained on the
# selected features only. The seed is fixed so that the scores are reproducible.
rf_bestfeatures = RandomForestRegressor(max_depth = 14, min_samples_leaf = 2, min_samples_split= 2, n_estimators = 80, random_state = 0)

rf_bestfeatures.fit(X_train_best, Y_train_best)

# Print R^2 scores
print("R2 score on training set : ", rf_bestfeatures.score(X_train_best, Y_train_best))
print("R2 score on test set : ",     rf_bestfeatures.score(X_test_best, Y_test_best))

In [217]:
# Instantiate PCA with 3 components
pca = PCA(n_components=3)

# Fit the PCA on the preprocessed training features and project them
X_opt_train = pca.fit_transform(X_train_best)

# Project the test features with the same, already-fitted PCA
X_opt_test = pca.transform(X_test_best)

In [None]:
# Individual principal components of the training projection
PC1, PC2, PC3 = (X_opt_train[:, component] for component in range(3))

# Same projection as a DataFrame with named columns
PC = pd.DataFrame(X_opt_train, columns=["PC1", "PC2", "PC3"])
# First rows of the projection
PC.head()

In [None]:
# explained_variance_ratio_ holds fractions (values between 0 and 1), one per component
print("Explained variance ratio per PC: {}".format(pca.explained_variance_ratio_))
# The '%' presentation type multiplies by 100, so the printed figure is a real percentage
print("Total explained variance ratio: {:.2%}".format(pca.explained_variance_ratio_.sum()))

In [None]:
# Train a forest with the same hyper-parameters as rf_bestfeatures on the PCA projection.
# A separate estimator is used so that the model already fitted on the selected features
# is not overwritten by this experiment.
rf_pca = RandomForestRegressor(**rf_bestfeatures.get_params())
rf_pca.fit(X_opt_train, Y_train_best)
# Print R^2 scores
print("R2 score on training set fit on PCA: ", rf_pca.score(X_opt_train, Y_train_best))
print("R2 score on test set fit on PCA: ",     rf_pca.score(X_opt_test, Y_test_best))

In [None]:
# plotly.express (px) and plotly.graph_objects (go) come from the import cell at the
# top of the notebook, so they are not imported again here.

# Use plotly express to plot train data
fig = px.scatter_3d(PC, x="PC1", y="PC2", z="PC3")

# Add trace with test data 
fig.add_trace(go.Scatter3d(x=X_opt_test[:, 0], 
                           y=X_opt_test[:, 1], 
                           z=X_opt_test[:, 2],
                           mode="markers",
                           name="test"
                          ))

# Render on notebook
fig.show()

In [None]:
# Training projection on the three principal components, coloured by the APL target
df_plot = pd.DataFrame(X_opt_train, columns=['PC1', 'PC2', 'PC3'])
df_plot["APL"] = list(Y_train_best)

fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3', color="APL")
fig.show()