Project Big Data Framework by Augustin SAMIER and Benjamin AUER, GR03

In [29]:
import pandas as pd
import yfinance as yf
import yahoo_fin.stock_info as si
from yahoo_fin.stock_info import get_data
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from ipywidgets import interact, widgets,Output,VBox
from datetime import timedelta,datetime
from IPython.display import display, clear_output

1. Exploration

We can take APPLE as a first example:

In [30]:
# Daily AAPL quotes from 11/30/2019 to 11/30/2024, with the date kept as a regular column.
# Other intervals are available as well (from 1m up to 3 months).
nas_aapl=get_data(
    "aapl",
    start_date="11/30/2019",
    end_date="11/30/2024",
    index_as_date=False,
    interval="1d",
)
nas_aapl

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2019-12-02,66.817497,67.062500,65.862503,66.040001,64.024628,94487200,AAPL
1,2019-12-03,64.577499,64.882500,64.072502,64.862503,62.883053,114430400,AAPL
2,2019-12-04,65.267502,65.827499,65.169998,65.434998,63.438084,67181600,AAPL
3,2019-12-05,65.947502,66.472504,65.682503,66.394997,64.368782,74424400,AAPL
4,2019-12-06,66.870003,67.750000,66.824997,67.677498,65.612167,106075600,AAPL
...,...,...,...,...,...,...,...,...
1253,2024-11-22,228.059998,230.720001,228.059998,229.869995,229.869995,38168300,AAPL
1254,2024-11-25,231.460007,233.250000,229.740005,232.869995,232.869995,90152800,AAPL
1255,2024-11-26,233.330002,235.570007,233.330002,235.059998,235.059998,45986200,AAPL
1256,2024-11-27,234.470001,235.690002,233.809998,234.929993,234.929993,33498400,AAPL


In [31]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
nas_aapl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      1258 non-null   datetime64[ns]
 1   open      1258 non-null   float64       
 2   high      1258 non-null   float64       
 3   low       1258 non-null   float64       
 4   close     1258 non-null   float64       
 5   adjclose  1258 non-null   float64       
 6   volume    1258 non-null   int64         
 7   ticker    1258 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 78.8+ KB
None


In [32]:
# Summary statistics of the AAPL closing price over the downloaded window.
close_summary=nas_aapl["close"].describe()
print(close_summary)

count    1258.000000
mean      151.100835
std        41.033508
min        56.092499
25%       127.107498
50%       151.019997
75%       177.244999
max       237.330002
Name: close, dtype: float64


2. Pre-processing

First, we check how many different tickers are listed on the NASDAQ stock market, since this is the market we want to work on.

In [33]:
# Fetch every NASDAQ symbol, then keep the first 30 as the working sample.
nas_list=si.tickers_nasdaq()
nasdaq_list=nas_list[:30]

print("Tickers in Nasdaq:",len(nas_list))
print(nasdaq_list)

Tickers in Nasdaq: 4790
['AACG', 'AADI', 'AADR', 'AAL', 'AAME', 'AAOI', 'AAON', 'AAPB', 'AAPD', 'AAPL', 'AAPU', 'AAXJ', 'ABAT', 'ABCL', 'ABCS', 'ABEO', 'ABL', 'ABLLL', 'ABLLW', 'ABLV', 'ABLVW', 'ABNB', 'ABOS', 'ABP', 'ABPWW', 'ABSI', 'ABTS', 'ABUS', 'ABVC', 'ABVE']


We download the data of the tickers we want into DataFrames, so that each company can be accessed by its ticker name:

In [34]:
dateToday=datetime.today().strftime("%Y-%m-%d")
# 1-minute bars are only requested for the last 7 days.
date7days=(datetime.today()-timedelta(days=7)).strftime("%Y-%m-%d")

# A ticker is kept only if BOTH its minute and daily frames hold more than
# this many non-null closes (removes tickers with a small amount of data).
MIN_ROWS=100

# Frames are collected in lists and concatenated once after the loop:
# calling pd.concat inside the loop re-copies the accumulated data every time.
minute_frames=[]
daily_frames=[]
valid_nasdaq_list=[]

for ticker in nasdaq_list:
    try:
        data_tickers_min=get_data(ticker,start_date=date7days,index_as_date=True,interval="1m")
        data_tickers_d=get_data(ticker,start_date="11/30/2014",index_as_date=True,interval="1d")
        enough_minutes=data_tickers_min["close"].count()>MIN_ROWS
        enough_days=data_tickers_d["close"].count()>MIN_ROWS
        if enough_minutes and enough_days:
            minute_frames.append(data_tickers_min)
            daily_frames.append(data_tickers_d)
            valid_nasdaq_list.append(ticker)
        else:
            print(f"{ticker} removed")
    except Exception:
        # Best effort: a ticker that cannot be downloaded is skipped.
        # `Exception` (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit through, so the cell can still be interrupted.
        print(f"{ticker} not avalaible now")

# pd.concat raises on an empty list, so fall back to an empty frame.
dfmin=pd.concat(minute_frames) if minute_frames else pd.DataFrame()
dfday=pd.concat(daily_frames) if daily_frames else pd.DataFrame()

def dataEng(data):
    """Return a cleaned copy of a price frame whose index holds the dates.

    The index is moved into a regular ``date`` column (converted to datetime),
    ``ticker`` is cast to the pandas ``string`` dtype and every row containing
    a missing value is dropped.

    The input frame is left untouched, so the function can be called again on
    the same frame (e.g. when the cell is re-run) and give the same result.
    Previously the index reset was done in place, which altered the caller's
    frame and produced a duplicate ``date`` column on a second call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an unnamed date index and a ``ticker`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with a ``date`` column and no missing values.
    """
    # reset_index() without inplace returns a new frame; an unnamed index
    # becomes a column called "index", which is renamed to "date".
    df=data.reset_index().rename(columns={"index":"date"})
    df["date"]=pd.to_datetime(df["date"])
    df["ticker"]=df["ticker"].astype("string")
    return df.dropna()

# Clean the daily frame first, then the minute frame, with the same routine.
df_day,df_min=map(dataEng,(dfday,dfmin))

def calculSMA(df,time):
    """Simple moving average of ``close`` computed separately for each ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ticker`` and ``close`` columns.
    time : int
        Rolling window size, in rows.

    Returns
    -------
    pandas.Series
        Aligned with ``df``; the first ``time - 1`` rows of each ticker are NaN.
    """
    def _rolling_mean(closes):
        return closes.rolling(window=time).mean()

    close_by_ticker=df.groupby("ticker")["close"]
    return close_by_ticker.transform(_rolling_mean)

# Row-to-row percentage change of the close, computed within each ticker.
close_by_ticker=df_day.groupby("ticker")["close"]
df_day["return"]=close_by_ticker.pct_change()

# SMA (Simple Moving Average) over 50 and 200 rows (trading days).
for window in (50,200):
    df_day[f"SMA{window}"]=calculSMA(df_day,window)


AACG removed
AADR removed
AAME removed
ABCS removed
ABLLW removed
ABLV removed
ABLVW not avalaible now
ABPWW removed
ABTS removed


In [35]:
# Preview the first rows of the cleaned daily frame, including the derived return/SMA columns.
df_day.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,return,SMA50,SMA200
0,2018-02-16,69.75,72.75,64.199997,67.5,67.5,1527,AADI,,,
1,2018-02-20,65.699997,66.0,63.75,66.0,66.0,353,AADI,-0.022222,,
2,2018-02-21,60.0,74.849998,60.0,71.25,71.25,733,AADI,0.079545,,
3,2018-02-22,67.650002,71.25,66.75,68.745003,68.745003,213,AADI,-0.035158,,
4,2018-02-23,65.25,67.5,65.25,67.5,67.5,360,AADI,-0.01811,,


In [36]:
# Preview the first rows of the cleaned 1-minute frame.
df_min.head()

Unnamed: 0,date,open,high,low,close,volume,ticker
0,2024-12-24 14:30:00,2.8,2.8,2.7519,2.7519,22865.0,AADI
3,2024-12-24 14:33:00,2.77,2.77,2.77,2.77,523.0,AADI
4,2024-12-24 14:34:00,2.76,2.7677,2.76,2.764,3733.0,AADI
5,2024-12-24 14:35:00,2.77,2.77,2.77,2.77,394.0,AADI
6,2024-12-24 14:36:00,2.77,2.77,2.77,2.77,0.0,AADI


In [37]:
# Number of missing values left in each column of the minute frame.
missing_per_column=df_min.isna().sum()
print(missing_per_column)

date      0
open      0
high      0
low       0
close     0
volume    0
ticker    0
dtype: int64


In [38]:
# How many 1-minute rows were kept for each valid ticker.
for ticker in valid_nasdaq_list:
    ticker_rows=df_min.loc[df_min["ticker"]==ticker,"ticker"]
    counter=ticker_rows.count()
    print(f"{ticker} : {counter}")

AADI : 600
AAL : 1378
AAOI : 1319
AAON : 709
AAPB : 228
AAPD : 724
AAPL : 1379
AAPU : 770
AAXJ : 465
ABAT : 1378
ABCL : 1274
ABEO : 395
ABL : 533
ABLLL : 201
ABNB : 1365
ABOS : 875
ABP : 1041
ABSI : 1289
ABUS : 927
ABVC : 223
ABVE : 102


We calculate the Sharpe ratio (the return in excess of the risk-free rate, divided by the volatility of the returns) and attach a label explaining its meaning.

In [39]:
risk_free=0.02/252 #2% per year, converted to a daily rate (252 trading days per year)

# NOTE(review): mean and std are taken on DAILY returns, so "sharpReturn" is a
# daily, non-annualised ratio - confirm that the thresholds used downstream are
# meant for this scale.

# One record per ticker, turned into a DataFrame once at the end. This replaces
# ten masked .loc assignments per ticker, which re-scanned the table every time.
summary_rows=[]
for ticker in valid_nasdaq_list:
    try:
        info=yf.Ticker(ticker).info or {}
    except Exception:
        # Fundamentals are optional: a failed lookup must not abort the whole
        # table, the ticker simply gets missing values for these fields.
        info={}

    dfreturn=df_day[df_day["ticker"]==ticker]
    returnR=dfreturn["return"].mean()
    vola=dfreturn["return"].std()

    summary_rows.append({
        "ticker":ticker,
        "latestClose":dfreturn["close"].iloc[-1],
        "SMA50":dfreturn["SMA50"].iloc[-1] if not dfreturn["SMA50"].isna().all() else None,
        "SMA200":dfreturn["SMA200"].iloc[-1] if not dfreturn["SMA200"].isna().all() else None,
        "sharpReturn":(returnR-risk_free)/vola,
        "peRatio":info.get("trailingPE"),
        "betaRatio":info.get("beta"),
        "vola":vola,
        "revenueGrowth":info.get("revenueGrowth"),
        "dailyVolume":info.get("volume"),
        "averageVolume":info.get("averageVolume"),
    })

# Same column order as before.
metric_columns=[
    "latestClose","SMA50","SMA200","sharpReturn","peRatio",
    "betaRatio","vola","revenueGrowth","dailyVolume","averageVolume",
]
sharpReturnDf=pd.DataFrame(summary_rows,columns=["ticker"]+metric_columns)
# Force numeric dtypes so that missing fundamentals are stored as NaN, even
# when a field is missing for every ticker.
sharpReturnDf[metric_columns]=sharpReturnDf[metric_columns].apply(pd.to_numeric,errors="coerce")

In [40]:
def sharpRatioLabel(ratio):
    """Translate a Sharpe ratio into a human-readable label.

    ======================  ============
    ratio                   label
    ======================  ============
    ratio < 0               "Bad"
    0 <= ratio < 1          "Not so bad"
    1 <= ratio < 2          "Good"
    ratio >= 2              "Amazing"
    ======================  ============

    A ratio of exactly 0 is now labelled "Not so bad"; it previously matched
    no branch and returned None. Missing values (None/NaN) return None.
    """
    if pd.isna(ratio):
        return None
    if ratio<0:
        return "Bad"
    if ratio<1:
        return "Not so bad"
    if ratio<2:
        return "Good"
    return "Amazing"

def longTermScore(line):
    """Score (0 to 10) how attractive a company looks for a long-term investment.

    Parameters
    ----------
    line : mapping or pandas.Series
        One row of the summary table, read through the keys ``peRatio``,
        ``revenueGrowth``, ``betaRatio``, ``averageVolume``, ``latestClose``,
        ``SMA50`` and ``SMA200``.

    Returns
    -------
    int
        Sum of the points of every satisfied criterion. A criterion whose
        inputs are missing (None, NaN or pd.NA) scores nothing.
    """
    # pandas stores missing numbers as NaN (or pd.NA), which a `!= None` test
    # does not detect; pd.notna covers None, NaN and pd.NA alike.
    present=pd.notna

    score=0
    # PE ratio: how much an investor pays for one dollar of earnings. Below 20
    # the company is considered under-valued; weighted 1.5x the growth/beta criteria.
    if present(line["peRatio"]) and line["peRatio"]<20:
        score+=3
    # Revenue growth above 10% (0.1 = 10%).
    if present(line["revenueGrowth"]) and line["revenueGrowth"]>0.1:
        score+=2
    # Beta: volatility compared with the global market; below 1 means less volatile.
    if present(line["betaRatio"]) and line["betaRatio"]<1:
        score+=2
    # A high average transaction volume means the stock is actively traded.
    if present(line["averageVolume"]) and line["averageVolume"]>1000000:
        score+=1
    # Price above its 200-day average (currently going up) AND 50-day average
    # above the 200-day average (upward trend).
    if (present(line["latestClose"]) and present(line["SMA50"]) and present(line["SMA200"])
            and line["latestClose"]>line["SMA200"] and line["SMA50"]>line["SMA200"]):
        score+=2
    return score

def shortTermScore(line):
    """Score (0 to 10) how attractive a company looks for a short-term investment.

    Parameters
    ----------
    line : mapping or pandas.Series
        One row of the summary table, read through the keys ``sharpReturn``,
        ``betaRatio``, ``vola``, ``dailyVolume``, ``averageVolume``,
        ``latestClose`` and ``SMA50``.

    Returns
    -------
    int
        Sum of the points of every satisfied criterion. A criterion whose
        inputs are missing (None, NaN or pd.NA) scores nothing.
    """
    # pd.notna covers None, NaN and pd.NA; a `!= None` test only covers None.
    present=pd.notna

    score=0
    # Risk-adjusted return: above 1 the return is considered worth the risk.
    if present(line["sharpReturn"]) and line["sharpReturn"]>1:
        score+=3
    # Beta above 1: more volatile than the global market.
    if present(line["betaRatio"]) and line["betaRatio"]>1:
        score+=2
    # High volatility (standard deviation of the returns) is more likely to
    # suit a short-term investment.
    if present(line["vola"]) and line["vola"]>0.02:
        score+=2
    # Unusual recent activity: today's volume above the average volume.
    # Both values are checked: the average used to be compared unchecked,
    # which raised a TypeError when it was None.
    if (present(line["dailyVolume"]) and present(line["averageVolume"])
            and line["dailyVolume"]>line["averageVolume"]):
        score+=2
    # Latest close above the 50-day moving average: recent price increase.
    if present(line["latestClose"]) and present(line["SMA50"]) and line["latestClose"]>line["SMA50"]:
        score+=1
    return score

# Label the Sharpe ratio, then score every company row by row.
ratio_labels=sharpReturnDf["sharpReturn"].apply(sharpRatioLabel)
sharpReturnDf["sharpRatioMeaning"]=ratio_labels

for score_column,score_function in (("longTermScore",longTermScore),("shortTermScore",shortTermScore)):
    sharpReturnDf[score_column]=sharpReturnDf.apply(score_function,axis=1)

# Best long-term candidates first; ties are broken by the higher Sharpe ratio.
sharpReturnDf=sharpReturnDf.sort_values(
    by=["longTermScore","sharpReturn"],
    ascending=[False,False],
)
sharpReturnDf.head()

Unnamed: 0,ticker,latestClose,SMA50,SMA200,sharpReturn,peRatio,betaRatio,vola,revenueGrowth,dailyVolume,averageVolume,sharpRatioMeaning,longTermScore,shortTermScore
3,AAON,119.650002,126.1324,97.8427,0.046438,52.24891,0.791,0.021741,0.168,359378.0,416425.0,Not so bad,6,2
0,AADI,2.96,2.3107,1.920125,-0.002669,,0.369,0.055998,0.21,333411.0,300940.0,Bad,6,5
19,ABVC,0.591,0.55646,0.787455,0.01993,,0.816,82.720282,18.175,232463.0,263367.0,Not so bad,4,3
12,ABL,7.75,8.2848,9.98873,0.001942,,0.148,0.02626,0.333,136281.0,188753.0,Not so bad,4,2
6,AAPL,252.199997,237.294599,213.01955,0.052333,41.480263,1.24,0.017914,0.061,33997956.0,43821504.0,Not so bad,3,3


3. Analysis and visualizations

Interface to help you choose a company according to the desired investment horizon (long term or short term):

In [41]:
def recommandations(termTime):
    """Print the companies ranked for the requested investment horizon.

    ``"Long Term"`` ranks by ``longTermScore``; any other value ranks by
    ``shortTermScore``. Ties are broken by the higher ``sharpReturn``.
    Reads the module-level ``sharpReturnDf`` and prints a title followed by
    the ticker / score table.
    """
    if termTime=="Long Term":
        score_column="longTermScore"
        title="Best companies to invest in for long time term investment: "
    else:
        score_column="shortTermScore"
        title="Best companies to invest in for short time term investment: "

    sortDF=sharpReturnDf.sort_values(by=[score_column,"sharpReturn"],ascending=[False,False])
    print(f"{title}\n")
    print(sortDF[["ticker",score_column]])

termTime=widgets.Dropdown(
    options=["Long Term","Short Term"],
    value="Long Term",
    description="Term Time : "
)
button=widgets.Button(description="Display")

# Text printed from a widget callback is not reliably attached to the cell
# output; capturing it in an Output widget keeps the ranking under the button.
recommandationOutput=Output()

def click(button):
    """Button callback: show the ranking for the horizon selected in the dropdown."""
    with recommandationOutput:
        # Replace the previous ranking instead of stacking a new one on each click.
        clear_output(wait=True)
        recommandations(termTime.value)

button.on_click(click)
display(termTime,button,recommandationOutput)

Dropdown(description='Term Time : ', options=('Long Term', 'Short Term'), value='Long Term')

Button(description='Display', style=ButtonStyle())

Best companies to invest in for long time term investment: 

   ticker  longTermScore
3    AAON              6
0    AADI              6
19   ABVC              4
12    ABL              4
6    AAPL              3
2    AAOI              3
8    AAXJ              3
1     AAL              3
17   ABSI              3
10   ABCL              3
13  ABLLL              2
7    AAPU              2
4    AAPB              2
11   ABEO              2
15   ABOS              2
20   ABVE              2
9    ABAT              1
14   ABNB              1
18   ABUS              0
16    ABP              0
5    AAPD              0


Interface to show the variation in stock value of a company:

In [42]:
def filter_data_by_period(ticker,periode):
    """Return the rows of one ticker that fall inside the requested period.

    "1 Day" and "1 Week" read the 1-minute frame (``df_min``); every longer
    period reads the daily frame (``df_day``). Both are module-level frames.

    Parameters
    ----------
    ticker : str
        Ticker symbol to keep.
    periode : str
        One of "1 Day", "1 Week", "1 Month", "6 Months", "1 Year", "5 Years".

    Returns
    -------
    pandas.DataFrame
        Matching rows sorted by ``date`` (possibly empty).

    Raises
    ------
    ValueError
        If ``periode`` is not a supported label. This previously failed with
        an UnboundLocalError on ``start_date``.
    """
    # Look-back windows expressed in weeks (a month is counted as 4 weeks).
    lookbacks={
        "1 Week":timedelta(weeks=1),
        "1 Month":timedelta(weeks=4),
        "6 Months":timedelta(weeks=26),
        "1 Year":timedelta(weeks=52),
        "5 Years":timedelta(weeks=260),
    }
    dateToday=datetime.today()

    if periode=="1 Day":
        # Start at midnight of the previous day.
        yesterday=dateToday-timedelta(days=1)
        start_date=yesterday.replace(hour=0,minute=0,second=0,microsecond=0)
    elif periode in lookbacks:
        start_date=dateToday-lookbacks[periode]
    else:
        supported=["1 Day"]+list(lookbacks)
        raise ValueError(f"Unknown period {periode!r}; expected one of {supported}")

    source=df_min if periode in ("1 Day","1 Week") else df_day
    in_period=(source["date"]>=start_date)&(source["ticker"]==ticker)
    return source[in_period].sort_values(by="date")

def plot_ticker_with_period(ticker,periode):
    """Plot the close value of a ticker over the selected period with plotly.

    The title shows the percentage variation between the first and last close
    of the period, plus the latest SMA50 / SMA200 read from the module-level
    ``sharpReturnDf``.

    Parameters
    ----------
    ticker : str
        Ticker symbol to plot.
    periode : str
        Period label accepted by ``filter_data_by_period``.

    Notes
    -----
    The SMA values are now looked up independently of the price data. They
    used to be assigned only when data existed, so an empty period crashed
    with a NameError while building the title. Missing SMA values are shown
    as "n/a".
    """
    sub=filter_data_by_period(ticker,periode)  # already sorted by date
    intraday=periode in ("1 Day","1 Week")

    summary=sharpReturnDf[sharpReturnDf["ticker"]==ticker]

    def sma_text(column):
        # Format one SMA value for the title; tolerate a missing row or value.
        if summary.empty or pd.isna(summary[column].iloc[0]):
            return "n/a"
        return f"{summary[column].iloc[0]:.2f}"

    # Variation between the first and the last close of the period, in percent.
    if not sub.empty:
        firstClose=sub["close"].iloc[0]
        lastClose=sub["close"].iloc[-1]
        var=((lastClose-firstClose)/firstClose)*100
    else:
        var=0
    varClose=f"+{var:.2f}%" if var>0 else f"{var:.2f}%"

    if intraday:
        # Work on a copy so the shared minute frame is never modified.
        sub=sub.copy()
        # Blank the first point after a gap longer than 12 hours so that, with
        # connectgaps=False, the line is broken between two trading sessions.
        sub.loc[sub["date"].diff()>timedelta(hours=12),"close"]=None
        # "day hour:minute" labels, drawn on a categorical axis.
        x_label=sub["date"].dt.strftime("%d %H:%M")
    else:
        x_label=sub["date"]

    fig=go.Figure()
    fig.add_trace(go.Scatter(
        x=x_label,
        y=sub["close"],
        mode="lines",
        name=f"Close value ({ticker})",
        line=dict(color="blue",width=2),
        connectgaps=False
    ))

    if periode=="1 Day":
        xaxiss=dict(title="Hour",type="category",nticks=24,showgrid=True)
    elif periode=="1 Week":
        xaxiss=dict(title="Date",type="category",nticks=7,showgrid=True)
    else:
        xaxiss=dict(title="Date",showgrid=True)

    fig.update_layout(
        title=f"Close values for {ticker} ({periode}) , {varClose}, SMA50 : {sma_text('SMA50')}, SMA200: {sma_text('SMA200')}",
        xaxis=xaxiss,
        yaxis_title="Close value (in $)",
        template="plotly_white"
    )

    fig.show()

# Choices offered by the two dropdowns driving the interactive plot.
tickers=valid_nasdaq_list
periode=["1 Day","1 Week","1 Month","6 Months","1 Year","5 Years"]

ticker_dropdown=widgets.Dropdown(options=tickers,description="Select Ticker: ")
period_dropdown=widgets.Dropdown(options=periode,description="Select Period: ")

# Redraw the chart every time one of the dropdowns changes.
interact(
    plot_ticker_with_period,
    ticker=ticker_dropdown,
    periode=period_dropdown,
)

interactive(children=(Dropdown(description='Select Ticker: ', options=('AADI', 'AAL', 'AAOI', 'AAON', 'AAPB', …

<function __main__.plot_ticker_with_period(ticker, periode)>