In [1]:
### Exercise 1: Load the dataset from its CSV file into a pd.DataFrame:
import pandas as pd

def convert_csv_to_df(
    file_path: str = "/Users/romainkuhne/Documents/pandas_interview_training/myenv/Pandas_interview_prep/heart_disease_health_indicators_BRFSS2015.csv",
) -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file to read. The default is the
            original machine-specific path, kept so that existing
            no-argument calls behave as before; pass an explicit path to
            run this on another machine or against another file.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.

    Raises:
        FileNotFoundError: Propagated from pandas when ``file_path`` does
            not point to an existing file.
    """
    return pd.read_csv(file_path)

# Load the dataset from the function's default path and preview its first rows.
# `df` is read by the later feature-importance cell, so this cell must run first.
df = convert_csv_to_df()
print(df.head())


   HeartDiseaseorAttack  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0                   0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1                   0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2                   0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3                   0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4                   0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   Diabetes  PhysActivity  Fruits  ...  AnyHealthcare  NoDocbcCost  GenHlth  \
0       0.0           0.0     0.0  ...            1.0          0.0      5.0   
1       0.0           1.0     0.0  ...            0.0          1.0      3.0   
2       0.0           0.0     1.0  ...            1.0          1.0      5.0   
3       0.0           1.0     1.0  ...            1.0          0.0      2.0   
4       0.0           1.0     1.0  ...            1.0          0.0      2.0   

   MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  I

In [9]:
# Exercise 2: Use a Random Forest to determine which features are the most important for the occurrence of heart disease:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score

# 2.1) Define the feature set (X) and the label set (Y_hd):
X = df.drop(["HeartDiseaseorAttack", "Stroke"], axis=1) # Features: every column except the two outcome columns {HeartDiseaseorAttack, Stroke}
Y_hd = df["HeartDiseaseorAttack"] # Label: occurrence of heart disease {0: no occurrence, 1: occurrence}


# 2.2) Create a helper function that performs Grid Search to fine tune hyper parameters:
async def perform_grid_search(featureSet: pd.DataFrame, labelSet: pd.Series):
    """Pick the best random forest via an exhaustive, stratified grid search.

    Every combination in the hyperparameter grid is scored with the 'f1'
    metric under 10-fold stratified cross-validation, and the estimator
    belonging to the best-scoring combination is returned.

    Note: the function is declared ``async`` (callers ``await`` it) but it
    contains no ``await`` itself, so the whole search runs synchronously.

    Args:
        featureSet: Feature columns used to train the candidates.
        labelSet: Target labels aligned with ``featureSet``.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV`` object.
    """
    # Candidate hyperparameters: 3 x 3 x 2 = 18 combinations.
    search_space = {
        'n_estimators': [50,100,200],
        'max_depth': [5, 10, 15],
        'criterion': ['gini', 'entropy']
    }

    # Shuffled, stratified folds with a fixed seed so the splits are repeatable.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # The base forest is seeded as well, keeping the search deterministic.
    searcher = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=folds,
        scoring='f1',
    )

    # For each combination: train on the training part of every fold and
    # score F1 on the held-out part.
    searcher.fit(featureSet, labelSet)

    return searcher.best_estimator_



# 2.3) Create a function that will use random forest to determine the participation of each features:
# Feature importance in Random Forest represents how much each feature contributes to making accurate predictions:
# Computed by how much a feature improves the model's split criteria (e.g., Gini impurity or entropy) across all trees in the forest:
# The contributions of each feature are summed up for all trees.
# These sums are normalized to compute the relative importance of each feature.
async def compute_feature_importance_hd(featureSet: pd.DataFrame, labelSet: pd.Series) -> pd.DataFrame:
    """Rank features by their importance in the tuned random forest.

    Args:
        featureSet: Feature columns; their names populate the 'Feature' column.
        labelSet: Target labels aligned with ``featureSet``.

    Returns:
        A DataFrame with columns 'Feature' and 'Importance', sorted by
        'Importance' in descending order. The index keeps each feature's
        original column position in ``featureSet``.
    """
    # 2.3.1) Tune the hyperparameters and get the best estimator back.
    best_model = await perform_grid_search(featureSet, labelSet)
    # 2.3.2) No extra `fit` call here: GridSearchCV (default refit=True) has
    # already retrained the best estimator on the full data set, so fitting it
    # again on the same data with the same seed would only repeat that work.
    # 2.3.3) `feature_importances_` holds the normalized impurity-based
    # importance of each feature, in the same order as the input columns.
    feature_importance_df = pd.DataFrame({
        'Feature': featureSet.columns,
        'Importance': best_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    return feature_importance_df

# Run the feature-importance pipeline on the heart-disease labels.
# The top-level `await` works because the notebook kernel already runs an event
# loop; it is not valid in a plain Python script.
feature_participation_hd_df = await compute_feature_importance_hd(X, Y_hd)
print(feature_participation_hd_df)

              Feature  Importance
17                Age    0.140224
12            GenHlth    0.124162
3                 BMI    0.113729
14           PhysHlth    0.081048
19             Income    0.070460
0              HighBP    0.060544
13           MentHlth    0.059371
18          Education    0.051050
15           DiffWalk    0.047713
1            HighChol    0.044582
16                Sex    0.039859
5            Diabetes    0.038524
4              Smoker    0.024365
7              Fruits    0.022423
6        PhysActivity    0.021112
8             Veggies    0.021041
11        NoDocbcCost    0.016234
10      AnyHealthcare    0.009622
9   HvyAlcoholConsump    0.008639
2           CholCheck    0.005298


In [2]:
# Exercise 3: Fetch 5 years of price data for a stock listed in the S&P 500:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date

def fetch_raw_data(ticker="GOOG", years=5) -> pd.DataFrame:
    """Download daily price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol to download. Defaults to "GOOG".
        years: Length of the look-back window, counted back from today.
            Defaults to 5, the window this function originally hard-coded.

    Returns:
        A DataFrame whose former index has been moved into a regular column
        by ``reset_index`` and whose columns are flattened to a single level.
        If the download raises, an empty DataFrame is returned instead.
    """
    # 3.1) Define the timeframe from which to gather data:
    endDate = date.today()  # format: 2025-01-02
    startDate = (endDate - pd.DateOffset(years=years)).date()  # format: 2020-01-02

    try:  # 3.2) Attempt to fetch data:
        df = yf.download(tickers=ticker, start=startDate, end=endDate, progress=False)
    except Exception as e:
        # Best-effort: report the failure and hand back an empty frame.
        # This must be an *instance* (`pd.DataFrame()`), not the class itself,
        # so that callers can safely use `.empty`, `.columns`, etc.
        print(f"Couldn't fetch data for {ticker}: {e}.")
        return pd.DataFrame()

    # 3.3) If the columns are a MultiIndex, keep only its first level:
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    # 3.4) Move the index into a column and return the new frame
    # (avoids `inplace=True`).
    return df.reset_index()

# Fetch price history with the function's defaults and print the resulting frame.
raw_data = fetch_raw_data()
print(raw_data)
    
    
    
    
    

Price       Date       Close        High         Low        Open    Volume
0     2020-01-02   68.123718   68.162078   66.837340   66.837340  28132000
1     2020-01-03   67.789421   68.379304   67.036329   67.151713  23728000
2     2020-01-06   69.460922   69.575007   67.258334   67.258334  34646000
3     2020-01-07   69.417580   69.898350   69.270107   69.646760  30054000
4     2020-01-08   69.964615   70.326314   69.293024   69.354799  30560000
...          ...         ...         ...         ...         ...       ...
1253  2024-12-24  197.570007  197.669998  195.197998  196.169998   6809800
1254  2024-12-26  197.100006  198.160004  195.869995  196.740005   7907900
1255  2024-12-27  194.039993  196.800003  191.972000  196.470001  14693000
1256  2024-12-30  192.690002  193.779999  190.360001  190.865005  12209500
1257  2024-12-31  190.440002  193.250000  189.580002  192.445007  14355200

[1258 rows x 6 columns]
