In [1]:
# connexion a wrds 
import wrds
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
pgpass file created at C:\Users\Gabri\AppData\Roaming\postgresql\pgpass.conf
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [21]:
# IMPORTS
import wrds
import pandas as pd
import numpy as np  # pour np.random.choice

np.random.seed(42)  # pour reproductibilité

# --- 1️⃣ Connexion WRDS ---
def connect_wrds():
    print(">>> Connexion WRDS...")
    db = wrds.Connection()
    print(">>> Connexion WRDS OK.\n")
    return db

# --- 2️⃣ Échantillon aléatoire de 50 entreprises S&P500 ---
def get_sp500_sample(df, db):
    """
    Retourne un échantillon de 50 tickers du S&P500 avec EPS disponibles.
    df : dataframe initial CRSP (permno + ticker)
    db : connexion WRDS
    """
    import numpy as np
    np.random.seed(42)  # reproductibilité

    # Tickres uniques
    unique_tickers = df["ticker"].dropna().unique()

    # Vérifier quels tickers ont des données EPS
    valid_tickers = []
    for ticker in unique_tickers:
        query = f"""
            SELECT 1
            FROM comp.fundq
            WHERE tic = '{ticker}'
            LIMIT 1
        """
        result = db.raw_sql(query)
        if not result.empty:
            valid_tickers.append(ticker)

    print(f">>> {len(valid_tickers)} tickers ont au moins un EPS disponible.")

    # Échantillon aléatoire de 50 tickers
    if len(valid_tickers) < 50:
        print("Attention : moins de 50 tickers avec EPS, on prend tout ce qui est disponible.")
        sample_tickers = valid_tickers
    else:
        sample_tickers = np.random.choice(valid_tickers, size=50, replace=False)

    # Construire le dataframe final (ticker + permno)
    sample = df[df["ticker"].isin(sample_tickers)][["permno", "ticker"]].drop_duplicates()

    print(f">>> Échantillon final : {len(sample['ticker'].unique())} tickers sélectionnés.")
    return sample


# --- 3️⃣ Extraction des prix CRSP ---
def get_prices(db, sample):
    permno_list = "', '".join(sample['permno'].astype(str).unique())
    query = f"""
        SELECT date, prc, permno
        FROM crsp.dsf
        WHERE permno IN ('{permno_list}')
        ORDER BY permno, date
    """
    prices = db.raw_sql(query)
    print(">>> Prix téléchargés.\n")
    return prices

# --- 4️⃣ Extraction des EPS Compustat ---
def get_eps(db, sample):
    tic_list = "', '".join(sample['ticker'].unique())
    query = f"""
        SELECT gvkey, tic AS ticker, datadate, epspxq
        FROM comp.fundq
        WHERE tic IN ('{tic_list}')
        ORDER BY gvkey, datadate
    """
    eps = db.raw_sql(query)
    print(">>> EPS téléchargés.\n")
    return eps

# --- 5️⃣ Jointure CRSP ↔ EPS via ticker ---
def merge_prices_eps(prices, eps, sample):
    merged = prices.merge(sample, on='permno', how='left')
    merged = merged.merge(eps, on='ticker', how='left')
    
    # Filtrer EPS publié avant la date du prix
    merged = merged[merged['datadate'] <= merged['date']]
    print(">>> Jointure CRSP + EPS effectuée.\n")
    return merged

# --- 6️⃣ Calcul Trailing P/E ---
def compute_trailing_PE(merged):
    merged = merged.sort_values(['permno', 'date'])
    merged['eps_ttm'] = merged.groupby('permno')['epspxq'].rolling(4).sum().reset_index(level=0, drop=True)
    merged['eps_ttm'] = merged.groupby('permno')['eps_ttm'].ffill()
    merged['trailing_pe'] = merged['prc'] / merged['eps_ttm']
    print(">>> Trailing EPS & P/E calculés.\n")
    return merged

# --- 7️⃣ Pipeline complet ---
def run_pipeline():
    db = connect_wrds()

    # Charger la liste S&P500 depuis CRSP
    df = db.raw_sql("""
        SELECT permno, ticker
        FROM crsp.msenames
        WHERE shrcd IN (10,11)
        AND exchcd IN (1,2,3)
    """)

    sample = get_sp500_sample(df, db)
    prices = get_prices(db, sample)
    eps = get_eps(db, sample)

    merged = merge_prices_eps(prices, eps, sample)
    final = compute_trailing_PE(merged)
    
    # Filtrer pour les dates après 2003
    final['date'] = pd.to_datetime(final['date'])
    final = final[final['date'] > "2003-01-01"]

    print(final.head())
    return final

# --- 8️⃣ Exécution ---
if __name__ == "__main__":
    final_df = run_pipeline()
    final_df.to_csv("SP500_trailing_PE.csv", index=False)
    print(">>> Pipeline terminé, CSV sauvegardé.\n")


>>> Connexion WRDS...
WRDS recommends setting up a .pgpass file.
pgpass file created at C:\Users\Gabri\AppData\Roaming\postgresql\pgpass.conf
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done
>>> Connexion WRDS OK.

>>> 13488 tickers ont au moins un EPS disponible.
>>> Échantillon final : 50 tickers sélectionnés.
>>> Prix téléchargés.

>>> EPS téléchargés.

>>> Jointure CRSP + EPS effectuée.

>>> Trailing EPS & P/E calculés.

              date    prc  permno ticker   gvkey    datadate  epspxq  eps_ttm  \
2298858 2021-06-30  26.06   11628   CPBI  043152  2021-06-30    <NA>      NaN   
2298875 2021-07-01  26.17   11628   CPBI  043152  2021-06-30    <NA>      NaN   
2298892 2021-07-02  25.81   11628   CPBI  043152  2021-06-30    <NA>      NaN   
2298909 2021-07-06  25.29   11628   CPBI  043152  2021-06-30    <NA>      NaN   
2298926 2021-07-07  24.85   11628   CPBI  043152  2021-06-30    <

In [43]:
# nettoyage du df 
def nettoyage_df(df):
    # Renommer les colonnes proprement
    df = df.rename(columns={
        "prc": "prix",
        "epspxq": "EPS",
        "eps_ttm": "EPS_MOBILE",
        "trailing_pe": "Prix_bene"
    })
    
    # Trier les données par entreprise et par date
    df = df.sort_values(['permno', 'date'])

    # Supprimer une ou plusieurs colonnes
    df = df.drop(columns=['EPS_MOBILE', 'Prix_bene'])
    
    # Remplir tous les NaN dans EPS avec ffill + bfill
    df['EPS'] = df.groupby('permno')['EPS'].transform(lambda x: x.ffill().bfill())
    
    # calcule de ESP mobile sans NaN 
    df['EPS_MOBILE'] = df.groupby('permno')['EPS'].rolling(4).sum().reset_index(level=0, drop=True)

    # Remplir tous les NaN dans EPS_mobile avec ffill + bfill
    df['EPS_MOBILE'] = df.groupby('permno')['EPS_MOBILE'].transform(lambda x: x.ffill().bfill())

    # calcule du ratio p/b sans NaN 
    df['Prix_bene'] = df['prix'] / df['EPS_MOBILE']

    # Supprimer toutes les lignes où EPS_MOBILE ou Prix_bene est NaN
    df = df.dropna(subset=['EPS_MOBILE', 'Prix_bene'])

    # Calculer le nombre de NaN par colonne (inspection)
    nan_summary = df.groupby("ticker").agg({
        "EPS_MOBILE": lambda x: x.isna().sum(),
        "EPS": lambda x: x.isna().sum(),
        "Prix_bene": lambda x: x.isna().sum(),
        "prix": lambda x: x.isna().sum()
    })
    print("Nombre de NaN par ticker :\n", nan_summary)
    
    # Calcul du pourcentage de NaN pour EPS
    nan_eps_pct = df.groupby('ticker')['EPS'].apply(lambda x: x.isna().mean())
    
    # Garder uniquement les tickers avec moins de 30 % de NaN sur EPS
    tickers_valides = nan_eps_pct[nan_eps_pct <= 0.30].index
    df = df[df['ticker'].isin(tickers_valides)]
    
    # Filtrage des dates (> 2003)
    df['date'] = pd.to_datetime(df['date'])
    df = df[df['date'] > "2003-01-01"]
    
    # Dates min/max pour vérification
    print("Date minimale :", df['date'].min())
    print("Date maximale :", df['date'].max())

    # Convertir en float
    float_cols = ['prix', 'EPS', 'EPS_MOBILE', 'Prix_bene']
    for col in float_cols:
        df[col] = pd.to_numeric(df[col])
    
    # Convertir la date
    df['date'] = pd.to_datetime(df['date'])
    
    # Vérifier le résultat
    print("type de données")
    print(df.dtypes)

    # trie par date et ticker 
    df = df.sort_values(['ticker', 'date'])
    
    return df

# résultat 
df = nettoyage_df(final_df)
df.head()

Nombre de NaN par ticker :
         EPS_MOBILE  EPS  Prix_bene  prix
ticker                                  
ASHW             0  0.0        0.0   0.0
BMTC             0  0.0        0.0   0.0
COSN             0  0.0        0.0   0.0
CPBI             0  0.0        0.0   0.0
CRRC             0  0.0        0.0   0.0
CRZY             0  0.0        0.0   0.0
FACE             0  0.0        0.0   0.0
FAT              0  0.0        0.0   0.0
HOOK             0  0.0        0.0   0.0
HOTJ             0  0.0        0.0   0.0
IRM              0  0.0        0.0   0.0
MEJ              0  0.0        0.0   0.0
MMCE             0  0.0        0.0   0.0
MNTS             0  0.0        0.0   0.0
PPBI             0  0.0        0.0   0.0
QEPC             0  0.0        0.0   0.0
SF               0  0.0        0.0   0.0
SIBN             0  0.0        0.0   0.0
STC              0  0.0        0.0   0.0
VHS              0  0.0        0.0   0.0
WD               0  0.0        0.0   0.0
XPO              0  0.0      

Unnamed: 0,date,prix,permno,ticker,gvkey,datadate,EPS,EPS_MOBILE,Prix_bene
12806727,2003-01-02,6.9,76322,ASHW,21519,1988-01-31,-0.01,-0.11,-62.727273
12806728,2003-01-02,6.9,76322,ASHW,21519,1988-04-30,-0.07,-0.11,-62.727273
12806729,2003-01-02,6.9,76322,ASHW,21519,1988-07-31,-0.02,-0.11,-62.727273
12806730,2003-01-02,6.9,76322,ASHW,21519,1988-10-31,-0.01,-0.11,-62.727273
12806731,2003-01-02,6.9,76322,ASHW,21519,1989-01-31,-0.02,-0.12,-57.5
