In [1]:
### Exercice 1: Get the DataSet and convert it from CSV to pd.DataFrame:
import pandas as pd

def convert_csv_to_df() -> pd.DataFrame:
    # 1) Specify the path to the CSV file:
    file_path = "/Users/romainkuhne/Documents/pandas_interview_training/myenv/Pandas_interview_prep/heart_disease_health_indicators_BRFSS2015.csv"
    # 2) Load the CSV into a dataFrame
    df = pd.read_csv(file_path)
    # 3) Return the converted DataFrame:
    return df

df = convert_csv_to_df()
print(df.head())


   HeartDiseaseorAttack  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0                   0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1                   0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2                   0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3                   0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4                   0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   Diabetes  PhysActivity  Fruits  ...  AnyHealthcare  NoDocbcCost  GenHlth  \
0       0.0           0.0     0.0  ...            1.0          0.0      5.0   
1       0.0           1.0     0.0  ...            0.0          1.0      3.0   
2       0.0           0.0     1.0  ...            1.0          1.0      5.0   
3       0.0           1.0     1.0  ...            1.0          0.0      2.0   
4       0.0           1.0     1.0  ...            1.0          0.0      2.0   

   MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  I

In [9]:
# Exercice 2: Utilise Random Forest to Determine which features is the most important for the occurence of Heart Disease:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score

# 2.1) Define the feature (Y) & label set (X):
X = df.drop(["HeartDiseaseorAttack", "Stroke"], axis=1) # Remove 2 labels = {HeartDiseaseOccurence, Stroke}:
Y_hd = df["HeartDiseaseorAttack"] # Label set only includes the occurence of heart disease {0:no occurence, 1:occurence}:


# 2.2) Create a helper function that performs Grid Search to fine tune hyper parameters:
async def perform_grid_search(featureSet: pd.DataFrame, labelSet: pd.Series):
    # 2.2.1) Instantiate the model:
    model = RandomForestClassifier(random_state=42)
    # 2.2.2) Define the hypeparameter grid:
    param_grid = {
        'n_estimators': [50,100,200],
        'max_depth': [5, 10, 15],
        'criterion': ['gini', 'entropy']
    }
    # 2.2.3) Define the Stratified K-fold Cross Dalidation:
    stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # 2.2.4) Configure Grid Search with Stratified Cross Validation:
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=stratified_kfold, scoring='f1')
    # 2.2.5) Fit the Grid Search to find the best hyperparameters:
    # - iterates through all combinations of hyperparameters in param_grid.
    # A) Splits the data using the stratified k-fold method (cv=stratified_kfold):
    # B) Trains the model (RandomForestClassifier) on the training portion of each fold:
    # C) Evaluates the F1 score on the validation portion of each fold:
    grid_search.fit(featureSet, labelSet)
    # 2.2.5) Get the best model after performing grid search that max F1 score:
    best_model = grid_search.best_estimator_
    # Return the best model
    return best_model



# 2.3) Create a function that will use random forest to determine the participation of each features:
# Feature importance in Random Forest represents how much each feature contributes to making accurate predictions:
# Computed by how much a feature improves the model's split criteria (e.g., Gini impurity or entropy) across all trees in the forest:
# The contributions of each feature are summed up for all trees.
# These sums are normalized to compute the relative importance of each feature.
async def compute_feature_importance_hd(featureSet: pd.DataFrame, labelSet: pd.Series) -> pd.DataFrame:
    # 2.3.1) Call helper function to perform Stratified Grid Search W/ Cross Validation:
    best_model = await perform_grid_search(featureSet, labelSet)
    # 2.3.2) Train the model on the entire feature set
    best_model.fit(featureSet, labelSet)
    # 2.3.3) Evaluates how each feature contributes to the model’s decision-making process.
    # For each split in the Random Forest trees:
    # It checks how much splitting on a particular feature reduces the chosen impurity metric (either Gini impurity or entropy
    feature_importances = best_model.feature_importances_
    # 2.3.4) Create a DataFrame of feature importances:
    feature_importance_df = pd.DataFrame({
        'Feature': featureSet.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)

    return feature_importance_df

# Call the function:
feature_participation_hd_df = await compute_feature_importance_hd(X, Y_hd)
print(feature_participation_hd_df)

              Feature  Importance
17                Age    0.140224
12            GenHlth    0.124162
3                 BMI    0.113729
14           PhysHlth    0.081048
19             Income    0.070460
0              HighBP    0.060544
13           MentHlth    0.059371
18          Education    0.051050
15           DiffWalk    0.047713
1            HighChol    0.044582
16                Sex    0.039859
5            Diabetes    0.038524
4              Smoker    0.024365
7              Fruits    0.022423
6        PhysActivity    0.021112
8             Veggies    0.021041
11        NoDocbcCost    0.016234
10      AnyHealthcare    0.009622
9   HvyAlcoholConsump    0.008639
2           CholCheck    0.005298


In [1]:
# Exercice 3: Fetch financial data from a stock listed in the S&P500 for a timestamp of 5 years:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date

def fetch_raw_data(ticker="GOOG") -> pd.DataFrame:
    # 3.1) Define the timeframe from where to gather data:
    endDate = date.today() # format: 2025-01-02:
    startDate = (endDate - pd.DateOffset(years=5)).date() # format: 2020-01-02
    try: # 3.2) Attempt to fetch data:
        df = yf.download(tickers=ticker, start=startDate, end=endDate, progress=False)
        
    except Exception as e:
        print(f"Couldn't fetch data for {ticker}: str({e}).")
        return pd.DataFrame

    # 3.3) Assess if the DataFrame has a multi index, if true remove it:
    if isinstance(df.columns, pd.MultiIndex):
        # Only use the first level of the multi index:
        df.columns = df.columns.get_level_values(0)
    # 3.4) Reset the index:
    df.reset_index(inplace=True)
    # 3.5) Return the raw DataFrame:
    return df

raw_data = fetch_raw_data()
print(raw_data)

Price       Date       Close        High         Low        Open    Volume
0     2020-01-07   69.417580   69.898350   69.270107   69.646760  30054000
1     2020-01-08   69.964615   70.326314   69.293024   69.354799  30560000
2     2020-01-09   70.737328   71.110985   70.261035   70.774198  30018000
3     2020-01-10   71.230568   71.489586   70.663605   71.122451  36414000
4     2020-01-13   71.703865   71.768133   71.045730   71.549421  33046000
...          ...         ...         ...         ...         ...       ...
1253  2024-12-30  192.690002  193.779999  190.360001  190.865005  12209500
1254  2024-12-31  190.440002  193.250000  189.580002  192.445007  14355200
1255  2025-01-02  190.630005  193.199997  188.710007  191.485001  17545200
1256  2025-01-03  193.130005  194.500000  191.350006  192.725006  12875000
1257  2025-01-06  197.960007  199.559998  195.059998  195.149994  19473200

[1258 rows x 6 columns]


In [3]:
# Exercice 4: Create a function to process data in order to compute features:
# Ensure that data is cleaned (Nan values are removed):
from ta.momentum import StochasticOscillator
from ta.momentum import ROCIndicator
from ta.trend import MACD
from ta.volume import OnBalanceVolumeIndicator

# Main async function responsible for orchestrating logic computation of the featureset X:
async def compute_financial_features(df: pd.DataFrame) -> pd.DataFrame:
    # Call helper functions to compute financial features:
    df = await compute_RSI(df)
    if df is None:
        print("An error has occured in the computation of the RSI.")
        return None
    df = await compute_StochasticOscillators(df)
    if df is None:
        print("An error has occured in the computation of stochastic oscillators.")
        return None
    df = await compute_proc(df)
    if df is None:
        print("An error has occured in the computation of the PROC.")
        return None
    df = await compute_macd(df)
    if df is None:
        print("An error has occured in the computation of MACD.")
        return None
    df = await compute_obv(df)
    if df is None:
        print("An error has occured in the computation of OBV.")
        return None
    # Remove Nan Values
    df = df.dropna()
    return df
    
# Async helper function to compute RSI:
async def compute_RSI(df: pd.DataFrame) -> pd.DataFrame:
    # Ensure the dataFrame is not empty:
    if df.empty:
        print("The DataFrame passed as input to compute features is empty.")
        return None
    # Compute Price delta ΔP (DeltaP):
    df["DeltaP"] = df["Close"].diff()
    # Set the nbr of days = 14:
    nbr_days = 14
    # Create 2 parallel DataFrames:
    up_df = df["DeltaP"].copy()
    down_df = df["DeltaP"].copy()
    # For up days if ΔP<0 => up_df=0. for Down days, if ΔP>0 => down_df=0:
    up_df[up_df<0] = 0
    down_df[down_df>0] = 0
    # Ensure down_df has no negative values:
    down_df = down_df.abs()
    # compute Exponential Weighted Moving Average (EWMA): Give more weight to more recent prices:
    ewma_up = up_df.transform(lambda x: x.ewm(span=nbr_days).mean())
    ewma_down = down_df.transform(lambda x: x.ewm(span=nbr_days).mean())
    # Compute the Relative Strength:
    relative_strength = ewma_up/ewma_down
    # Compute the Relative Strenght Index (RSI):
    rsi = 100 - (100/1.0+relative_strength)
    # Add the computed rsi into the original dataframe that will contain all financial features:
    df["RSI"] = rsi
    # Clean the original dataframe to remove ΔP (DeltaP):
    df.drop(columns=["DeltaP"], axis=1, inplace=True)
    # Assess if the RSI column is empty or contains Nan values:
    if df["RSI"].isnull().all():
        print("The RSI feature is either empty or contains Nan values.")
        return None
    return df

# Async helper function to compute Stochastic Oscillators = {MACD, MACD_Signal, MACD_Diff}:
async def compute_StochasticOscillators(df: pd.DataFrame, window=14, smooth_window=3) -> pd.DataFrame:
    # 1) Initialize the stochastic Oscillator:
    stoch = StochasticOscillator(high=df["High"],low=df["Low"],close=df["Close"],window=window,smooth_window=smooth_window)
    # 2) Compute the K%:
    df["K%"] = stoch.stoch()
    # 3) Compute the D% (Moving avr of K%):
    df["D%"] = stoch.stoch_signal()
    # 4) Compute the R%:
    df["R%"] = -100*(df["High"].rolling(window=window).max()-df["Close"]) / (df["High"].rolling(window=window).max()-df["Low"].rolling(window=window).min())
    # 5) Assert whether the features K%, D% & R% are empty:
    if df[["K%", "D%", "R%"]].isnull().all().any():
        print("One of more stochastic oscillators features are empty.")
        return None
    return df

# Async helper function to compute the PROC:
async def compute_proc(df: pd.DataFrame, window=14) -> pd.DataFrame:
    # 1) Initialize the PROC Indicator:
    roc = ROCIndicator(close=df["Close"],window=window)
    # 2) Compute PROC:
    df["PROC"] = roc.roc()
    # 3) Assess whether the PROC column isn't empty:
    if df["PROC"].isnull().all():
        print("The PROC column is either empty or only contains Nan values.")
        return None
    return df

# Async helper function to compute the MACD:
async def compute_macd(df: pd.DataFrame, window_slow=26, window_fast=12, window_sign=9) -> pd.DataFrame:
    # 1) Initialize the MACT indicator:
    macd = MACD(close=df["Close"], window_slow=window_slow, window_fast=window_fast, window_sign=window_sign)
    # 2) Compute the MACD:
    df["MACD"] = macd.macd()
    # 3) Compute MACD Signal:
    df["MACD_Signal"] = macd.macd_signal()
    # 4) Compute MACD Diff:
    df["MACD_Diff"] = macd.macd_diff()
    # 5) Assess if one of all columns are empty:
    if df[["MACD", "MACD_Signal", "MACD_Diff"]].isnull().all().any():
        print("One or more of the MACD indocators is either empty or only contains Nan values.")
        return None
    return df

# Async helpter function to compute the OBV (On Balance Value):
async def compute_obv(df: pd.DataFrame) -> pd.DataFrame:
    # 1) Initialize the OBV indicator:
    obv = OnBalanceVolumeIndicator(close=df["Close"], volume=df["Volume"])
    # 2) Compute OBV:
    df["OBV"] = obv.on_balance_volume()
    # 3) Assess whether the OBV column is empty or contains NanValues:
    if df["OBV"].isnull().all():
        print("The OBV feature is either empty or contains Nan values")
        return None
    return df

df_features = await compute_financial_features(raw_data)
print(df_features)
    
    

Price       Date       Close        High         Low        Open    Volume  \
33    2020-02-25   69.173950   71.649552   68.872535   71.393476  49566000   
34    2020-02-26   69.409607   70.531583   68.703146   69.557080  44048000   
35    2020-02-27   65.668549   68.339646   65.622708   67.859171  59566000   
36    2020-02-28   66.726746   66.816921   63.322477   63.646314  75782000   
37    2020-03-02   69.206833   69.294523   66.103238   67.338546  48630000   
...          ...         ...         ...         ...         ...       ...   
1253  2024-12-30  192.690002  193.779999  190.360001  190.865005  12209500   
1254  2024-12-31  190.440002  193.250000  189.580002  192.445007  14355200   
1255  2025-01-02  190.630005  193.199997  188.710007  191.485001  17545200   
1256  2025-01-03  193.130005  194.500000  191.350006  192.725006  12875000   
1257  2025-01-06  197.960007  199.559998  195.059998  195.149994  19473200   

Price       RSI         K%         D%         R%       PROC    

In [4]:
# Exercice 5: Generate the prediction column:
async def generate_prediction_column(df: pd.DataFrame) -> pd.DataFrame:
    # 1) Set Lebel Trend: if close_f < close_i => Y=0(BULLISH) ELSE Y=1(BULLISH):
    closed_groups = df["Close"].transform(lambda x:x.shift(1)<x)
    # 2) Convert Boolean values to binary: {1/0}:
    closed_groups = closed_groups * 1
    # 3) create a New column called prediction Y:
    df["Prediction"] = closed_groups
    # 4) Clean data and remove unwanted features:
    df = df.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
    # 5) Assess whether the prediction column only contains either 0 or 1 values:
    if not df["Prediction"].isin([0,1]).all():
        print("Error the prediction column doesn't contain binary values.")
        return None
    return df

prediction_df = await generate_prediction_column(df_features)
print(prediction_df)
    
    

Price       Date       RSI         K%         D%         R%       PROC  \
33    2020-02-25 -0.246131   4.041224  22.691159 -95.958776  -4.050948   
34    2020-02-26 -0.287759   9.261552   7.250754 -90.738448  -3.801191   
35    2020-02-27 -0.163266   0.428083   4.576953 -99.571917 -10.712424   
36    2020-02-28 -0.304466  26.169445  11.953027 -73.830555  -9.457622   
37    2020-03-02 -0.686310  45.234483  23.944004 -54.765517  -7.925477   
...          ...       ...        ...        ...        ...        ...   
1253  2024-12-30 -1.145247  49.579421  65.945007 -50.420579   8.802934   
1254  2024-12-31 -0.896458  25.150453  47.520332 -74.849547   2.096180   
1255  2025-01-02 -0.917625  25.802588  33.510821 -74.197412  -3.090845   
1256  2025-01-03 -1.238980  40.944917  30.632653 -59.055083  -0.258224   
1257  2025-01-06 -1.955354  70.199908  45.649137 -29.800092   3.438187   

Price      MACD  MACD_Signal  MACD_Diff         OBV  Prediction  
33     0.194169     0.848165  -0.653996   229

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Prediction"] = closed_groups


In [8]:
# Exercice 6: Time serie cross validation & GRID SEARCH => Predict next closing price:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error, confusion_matrix

async def predict_next_closing_price_trend(df: pd.DataFrame) -> int:
    # 1) Divide the Dataset D into features=X and labels=Y:
    X = df[["RSI", "K%", "D%", "R%", "PROC", "MACD", "MACD_Signal", "MACD_Diff", "OBV"]]
    y = df["Prediction"]
    # 2) Perform Time Serie Cross Validation to evaluate the initial model:
    # Purpose: Evaluate how the model performs on unseen data:
    tscv = TimeSeriesSplit(n_splits=10) # set to 10 splits:
    accuracy_scores = [] # Store accuracy of each fold:
    # 2.1) For each fold split into training X_train, y_train, Y_test and y_test:
    # tscv.split(X) generates the training and testing indices for each fold:
    # enumerate(...): Adds an index (fold_idx) to track which fold is being processed:
    for fold_idx, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # 2.2) Train a Random Forest model:
        forest = RandomForestClassifier(
            n_estimators=100, # Nbr of decision trees in the forest:
            max_depth=10, # Sets the max depth of each trees = max level of splits:
            max_features='sqrt', # How many features are considered for splitting at each node:
            random_state=42, # Ensure reproductibility:
            criterion='gini', # measures how often a randomly chosen element from the dataset would be incorrectly classified based on the split:
            oob_score=True # Each tree is tested on data points left in the bag:
        )

        # 2.3) Train the model by injecting the training feature and label:
        forest.fit(X_train, y_train)
        # 2.4) Evaluate the model on test set:
        y_pred = forest.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)
        print(f"Fold {fold_idx + 1} - Accuracy: {accuracy:.4f}")

    # 3) Compute the average accuracy across folds:
    avg_accuracy = sum(accuracy_scores)/len(accuracy_scores)
    print(f"Average Accuracy across folds: {avg_accuracy:.4f}")

    # 4) Initialize the Grid Search:
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "max_features": ["sqrt", "log2"],
        "criterion": ["gini", "entropy"]
    }
    
    forest = RandomForestClassifier(random_state=42, oob_score=True)
    # 5) Instantiate grid search:
    grid_search = GridSearchCV(
        estimator=forest, # model:
        param_grid=param_grid, # grid search parameters to test:
        cv=TimeSeriesSplit(n_splits=5),
        scoring="accuracy",
        n_jobs=-1
    )

    # 6) Perform grid search on the entire dataset:
    grid_search.fit(X,y)
    # 7) Get the fine tunned model:
    best_forest = grid_search.best_estimator_
    print(f'Best Parameters: {grid_search.best_params_}')
    # 8) Train the final model on the entire dataset with best parameters
    best_forest.fit(X, y)
    # If OOB score is available, print it
    if hasattr(best_forest, 'oob_score_'):
        print(f'Final OOB Score: {best_forest.oob_score_:.4f}')
    # 9) Predict tomorrow's trend using the final model
    latest_data = X.iloc[-1:]  # Latest available data (last row of X)
    next_day_prediction = best_forest.predict(latest_data)

    print(f"Tomorrow's closing price trend prediction: {'Higher' if int(next_day_prediction[0]) == 1 else 'Lower'}")
    return int(next_day_prediction[0])
    
    
prediction = await predict_next_closing_price_trend(prediction_df)
print(prediction)
    
    

Fold 1 - Accuracy: 0.7027
Fold 2 - Accuracy: 0.7297
Fold 3 - Accuracy: 0.8198
Fold 4 - Accuracy: 0.8288
Fold 5 - Accuracy: 0.8559
Fold 6 - Accuracy: 0.8018
Fold 7 - Accuracy: 0.8288
Fold 8 - Accuracy: 0.9009
Fold 9 - Accuracy: 0.8378
Fold 10 - Accuracy: 0.8378
Average Accuracy across folds: 0.8144
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100}
Final OOB Score: 0.8090
Tomorrow's closing price trend prediction: Higher
1
