In [1]:
import pandas as pd
import yfinance as yf
import yahoo_fin.stock_info as si
from yahoo_fin.stock_info import get_data
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from ipywidgets import interact, widgets,Output,VBox
from datetime import timedelta,datetime
from IPython.display import display
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import col,to_date,avg,stddev,mean,lit,count,when,corr,lag,last
from pyspark.sql.window import Window
from pyspark.sql.types import StructType,StructField,StringType,DoubleType,TimestampType

spark = SparkSession.builder\
    .appName("BigDataFramework")\
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.python.worker.reuse", "true") \
    .getOrCreate()

             requires requests_html, which is not installed.
             
             Install using: 
             pip install requests_html
             
             After installation, you may have to restart your Python session.


1. Exploration

We can take APPLE as a first example:

In [2]:
nas_aapl= get_data("aapl",start_date="11/30/2019",end_date="11/30/2024",index_as_date =False,interval="1d") #différents interval (1m to 3months)
nas_aapl

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2019-12-02,66.817497,67.062500,65.862503,66.040001,64.024628,94487200,AAPL
1,2019-12-03,64.577499,64.882500,64.072502,64.862503,62.883064,114430400,AAPL
2,2019-12-04,65.267502,65.827499,65.169998,65.434998,63.438091,67181600,AAPL
3,2019-12-05,65.947502,66.472504,65.682503,66.394997,64.368782,74424400,AAPL
4,2019-12-06,66.870003,67.750000,66.824997,67.677498,65.612137,106075600,AAPL
...,...,...,...,...,...,...,...,...
1253,2024-11-22,228.059998,230.720001,228.059998,229.869995,229.869995,38168300,AAPL
1254,2024-11-25,231.460007,233.250000,229.740005,232.869995,232.869995,90152800,AAPL
1255,2024-11-26,233.330002,235.570007,233.330002,235.059998,235.059998,45986200,AAPL
1256,2024-11-27,234.470001,235.690002,233.809998,234.929993,234.929993,33498400,AAPL


In [3]:
print(nas_aapl.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      1258 non-null   datetime64[ns]
 1   open      1258 non-null   float64       
 2   high      1258 non-null   float64       
 3   low       1258 non-null   float64       
 4   close     1258 non-null   float64       
 5   adjclose  1258 non-null   float64       
 6   volume    1258 non-null   int64         
 7   ticker    1258 non-null   object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 78.8+ KB
None


2. Pre-processing

First we can check how much different tickers there are in the NASDAQ stock market as we want to work on this specific market

In [4]:
nas_list=si.tickers_nasdaq()
print("Tickers in Nasdaq:",len(nas_list))
print(nas_list[0:30])
nasdaq_list=nas_list[0:30]

Tickers in Nasdaq: 4791
['AACG', 'AADI', 'AADR', 'AAL', 'AAME', 'AAOI', 'AAON', 'AAPB', 'AAPD', 'AAPL', 'AAPU', 'AAXJ', 'ABAT', 'ABCL', 'ABCS', 'ABEO', 'ABL', 'ABLLL', 'ABLLW', 'ABLV', 'ABLVW', 'ABNB', 'ABOS', 'ABP', 'ABPWW', 'ABSI', 'ABTS', 'ABUS', 'ABVC', 'ABVE']


We put the tickers we want in a dataframe so we can access it by the name of the ticker:

In [7]:
structDay=StructType([
    StructField("date",TimestampType(),True),
    StructField("open",DoubleType(),True),
    StructField("high",DoubleType(),True),
    StructField("low",DoubleType(),True),
    StructField("close",DoubleType(),True),
    StructField("adjclose",DoubleType(),True),
    StructField("volume",DoubleType(),True),
    StructField("ticker",StringType(),True)
])

structMin=StructType([
    StructField("date",TimestampType(),True),
    StructField("open",DoubleType(),True),
    StructField("high",DoubleType(),True),
    StructField("low",DoubleType(),True),
    StructField("close",DoubleType(),True),
    StructField("volume",DoubleType(),True),
    StructField("ticker",StringType(),True)
])

dfday=spark.createDataFrame([],structDay)
dfmin=spark.createDataFrame([],structMin)
dateToday=datetime.today().strftime("%Y-%m-%d")
date7days=(datetime.today()-timedelta(days=7)).strftime("%Y-%m-%d")
valid_nasdaq_list=[]

for ticker in nasdaq_list:
    try:
        data_tickers_min=get_data(ticker,start_date=date7days,index_as_date=True,interval="1m")
        data_tickers_d= get_data(ticker,start_date="11/30/2014",index_as_date=True,interval="1d")
        data_tickers_min["ticker"] = data_tickers_min["ticker"].astype("string")
        data_tickers_d["ticker"] = data_tickers_d["ticker"].astype("string")
        if((len(data_tickers_d))and(len(data_tickers_min))): #we put this treshold to remove tickers with small amount of data
            data_tickers_min=spark.createDataFrame(data_tickers_min.reset_index())
            data_tickers_d=spark.createDataFrame(data_tickers_d.reset_index())
            dfmin=dfmin.union(data_tickers_min)
            dfday=dfday.union(data_tickers_d)
            valid_nasdaq_list.append(ticker)
        else:
            print(f"{ticker} removed")
    except Exception as e:
        print(f"{ticker} is not available now : {e}")
def dataEng(data):
    df=data
    df=df.withColumn("date", to_date(col("date"))) #To put the right date type
    df=df.withColumn("variation",col("high")-col("low")) #Variation between the highest value of the day and the lowest
    df=df.na.drop()
    return df

df_day=dataEng(dfday)
df_min=dataEng(dfmin)

windowReturn=Window.partitionBy("ticker").orderBy("date")
window50=Window.partitionBy("ticker").orderBy("date").rowsBetween(-49,0)
window200=Window.partitionBy("ticker").orderBy("date").rowsBetween(-199,0)
df_day=df_day.withColumn("return",(col("close")-lag("close",1).over(windowReturn))/lag("close",1).over(windowReturn))
df_day=df_day.withColumn("SMA50",avg(col("close")).over(window50)) #SMA (Simple Moving Average) for 50 days
df_day=df_day.withColumn("SMA200",avg(col("close")).over(window200)) #for 200 days

ABLVW is not available now : 'timestamp'


In [11]:
df_day.show(5)

Py4JJavaError: An error occurred while calling o4345.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 9) (Augustin executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:108)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 35 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:67)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:108)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(Unknown Source)
	at java.net.AbstractPlainSocketImpl.accept(Unknown Source)
	at java.net.PlainSocketImpl.accept(Unknown Source)
	at java.net.ServerSocket.implAccept(Unknown Source)
	at java.net.ServerSocket.accept(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 35 more


We calculate the sharp return ratio and explain the meaning of it

In [None]:
sharpReturnDf=spark.createDataFrame(valid_nasdaq_list,"string").toDF("ticker")
risk_free=0.02/252 #2%/per year cause there are 252 days of open stock market per year

for ticker in valid_nasdaq_list:
    tick=yf.Ticker(ticker)
    info=tick.info
    
    peRatio=info.get("trailingPE")
    betaRatio=info.get("beta")
    revenueGrowth=info.get("revenueGrowth")
    dailyVolume=info.get("volume")
    averageVolume=info.get("averageVolume")
    
    dfreturn=df_day.filter(col("ticker")==ticker)
    returnR=dfreturn.agg(avg("return")).collect()[0][0] #average of the return
    vola=dfreturn.agg(stddev("return")).collect()[0][0] #standard deviation = "écart type"

    latestClose=dfreturn.select(last("close").alias("latestClose")).collect()[0]["latestClose"]
    sma50=dfreturn.select(last("SMA50").alias("latestSMA50")).collect()[0]["latestSMA50"] if dfreturn.filter(col("SMA50").isNotNull()).limit(1).count()==1 else None
    #We check if there is at least a line with a non-NULL value and if so it takes the last value of it in the column
    sma200=dfreturn.select(last("SMA200").alias("latestSMA200")).collect()[0]["latestSMA200"] if dfreturn.filter(col("SMA200").isNotNull()).limit(1).count()==1 else None

    sharpReturnDf=sharpReturnDf.withColumn("close",when(col("ticker")==lit(ticker),lit(latestClose)))
    sharpReturnDf=sharpReturnDf.withColumn("SMA50",when(col("ticker")==lit(ticker),lit(sma50)))
    sharpReturnDf=sharpReturnDf.withColumn("SMA200",when(col("ticker")==lit(ticker),lit(sma200)))
    sharpReturnDf=sharpReturnDf.withColumn("sharpReturn",when(col("ticker")==lit(ticker),lit((returnR-risk_free)/vola)))
    sharpReturnDf=sharpReturnDf.withColumn("peRatio",when(col("ticker")==lit(ticker),lit(peRatio)))
    sharpReturnDf=sharpReturnDf.withColumn("betaRatio",when(col("ticker")==lit(ticker),lit(betaRatio)))
    sharpReturnDf=sharpReturnDf.withColumn("vola",when(col("ticker")==lit(ticker),lit(vola)))
    sharpReturnDf=sharpReturnDf.withColumn("revenueGrowth",when(col("ticker")==lit(ticker),lit(revenueGrowth)))
    sharpReturnDf=sharpReturnDf.withColumn("dailyVolume",when(col("ticker")==lit(ticker),lit(dailyVolume)))
    sharpReturnDf=sharpReturnDf.withColumn("averageVolume",when(col("ticker")==lit(ticker),lit(averageVolume)))

In [14]:
def sharpRatioLabel(ratio):
    if ratio<0:
        return "Bad"
    if ((ratio>0) & (ratio<1)):
        return "Not so bad"
    if ((ratio>=1) & (ratio<2)):
        return "Good"
    if (ratio >=2):
        return "Amazing"

def longTermScore(line):
    score=0
    if ((line["peRatio"]!=None) and (line["peRatio"]<20)): #PE ratio is how much investor pays to get a $ of benefice
        score+=3 #PE ratio is 1.5 more important than the revenue growth and the beta ratio -> PE ratio <20 -> company under-evaluated
    if ((line["revenueGrowth"]!=None)and(line["revenueGrowth"]>0.1)):
        score+=2 #ratio of revenue growth is how much % the revenues of the company grew -> 0.1=10% 
    if ((line["betaRatio"]!=None)and(line["betaRatio"]<1)):
        score+=2 #betaratio is the volability of comparated to the global market -> if < 1 then it's less volatible than the global market
    if ((line["averageVolume"]!=None)and(line["averageVolume"]>1000000)):
        score += 1 #We count the average volume of transaction as a criteria for long term investments -> meaning it's pretty active
    if ((line["close"]!=None)and(line["SMA50"]!=None)and(line["SMA200"]!=None)and(line["close"]>line["SMA200"])and(line["SMA50"]>line["SMA200"])):
        #checking if the actual price is higher than the moving average on 200 days, meaning it's actually going up, and checking if the
        #moving average on 50 days is higher than the moving average on 200 days, meaning it tends to price up
        score+=2
    return score

def shortTermScore(line):
    score=0
    if ((line["sharpReturn"]!=None)and(line["sharpReturn"]>1)):
        score+=3 #return adjusted to the risk -> we use it to see if the return is worth the risk ->> if it's >1 then the return is worth the risk
    if ((line["betaRatio"]!=None)and(line["betaRatio"]>1)):
        score+=2 #betaratio >1 so more volatible than the global market
    if ((line["vola"]!=None)and(line["vola"]>0.02)):
        score+=2 #high volability -> more likely to be good a short term investment -> volability is the "écart type" of the return (indicates if it's stable)
    if ((line["dailyVolume"]!=None)and(line["dailyVolume"]>line["averageVolume"])):
        score+=2 #if there is an un-normal recent activity then it's more likely to be a good short term investment
    if ((line["close"]!=None)and(line["SMA50"]!=None)and(line["close"]>line["SMA50"])): #latest close value > MA 50 days -> recent price up and activity
        score+=1
    return score

sharpReturnDf["sharpRatioMeaning"]=sharpReturnDf["sharpReturn"].apply(sharpRatioLabel)
sharpReturnDf["longTermScore"]=sharpReturnDf.apply(longTermScore,axis=1)
sharpReturnDf["shortTermScore"]=sharpReturnDf.apply(shortTermScore,axis=1)

sharpReturnDf=sharpReturnDf.sort_values(by=["longTermScore","sharpReturn"],ascending=[False,False])
sharpReturnDf.head()

Unnamed: 0,ticker,close,SMA50,SMA200,sharpReturn,peRatio,betaRatio,vola,revenueGrowth,dailyVolume,averageVolume,sharpRatioMeaning,longTermScore,shortTermScore
3,AAON,121.419998,125.9616,97.65095,0.046718,53.254387,0.791,0.021743,0.168,236399.0,421632.0,Not so bad,6,2
0,AADI,3.21,2.2945,1.915575,-0.001865,,0.369,0.055983,0.21,363395.0,298100.0,Bad,6,5
8,AAXJ,72.989998,75.2586,72.65755,0.005234,14.801209,,0.012677,,224829.0,549109.0,Not so bad,5,0
19,ABVC,0.581,0.55558,0.7905,0.019933,,0.816,82.736597,18.175,341327.0,258488.0,Not so bad,4,5
12,ABL,7.75,8.325,10.01108,0.001945,,0.148,0.026272,0.333,299167.0,184232.0,Not so bad,4,4


In [15]:
for ticker in valid_nasdaq_list:
    counter=df_min[df_min["ticker"]==ticker]["ticker"].count()
    print(f"{ticker} : {counter}")

AADI : 652
AAL : 1377
AAOI : 1322
AAON : 705
AAPB : 246
AAPD : 659
AAPL : 1378
AAPU : 788
AAXJ : 502
ABAT : 1378
ABCL : 1256
ABEO : 393
ABL : 582
ABLLL : 173
ABNB : 1369
ABOS : 1083
ABP : 996
ABSI : 1284
ABUS : 872
ABVC : 223


In [17]:
print(df_min.isna().sum())

date         0
open         0
high         0
low          0
close        0
volume       0
ticker       0
variation    0
dtype: int64


3. Analysis and visualizations

Interface to help you chose a company in fonction of the desired term time:

In [18]:
def recommandations(termTime):
    
    if(termTime=="Long Term"):
        sortDF=sharpReturnDf.sort_values(by=["longTermScore","sharpReturn"],ascending=[False,False])
        title="Best companies to invest in for long time term investment: "
        print(f"{title}\n")
        print(sortDF[["ticker","longTermScore"]])
    else:
        sortDF=sharpReturnDf.sort_values(by=["shortTermScore","sharpReturn"],ascending=[False,False])
        title="Best companies to invest in for short time term investment: "
        print(f"{title}\n")
        print(sortDF[["ticker","shortTermScore"]])

termTime=widgets.Dropdown(
    options=["Long Term","Short Term"],
    value="Long Term",
    description="Term Time : "
)

def click(button):
    recommandations(termTime.value)

button=widgets.Button(description="Display")
button.on_click(click)
display(termTime,button)

Dropdown(description='Term Time : ', options=('Long Term', 'Short Term'), value='Long Term')

Button(description='Display', style=ButtonStyle())

Interface to show the variation in stock value of a company:

In [19]:
def filter_data_by_period(ticker,periode):
    dateToday=datetime.today()

    if periode=="1 Day":
        yesterday=dateToday-timedelta(days=1)
        start_date=yesterday.replace(hour=0,minute=0,second=0,microsecond=0)
    elif periode=="1 Week":
        start_date=dateToday-timedelta(weeks=1)
    elif periode=="1 Month":
        start_date=dateToday-timedelta(weeks=4)
    elif periode=="6 Months":
        start_date=dateToday-timedelta(weeks=26)
    elif periode=="1 Year":
        start_date=dateToday-timedelta(weeks=52)
    elif periode=="5 Years":
        start_date=dateToday-timedelta(weeks=260)

    if (periode=="1 Day") or (periode=="1 Week"):
        filtered=df_min[(df_min["date"]>=start_date)&(df_min["ticker"]==ticker)]
    else:
        filtered=df_day[(df_day["date"]>=start_date)&(df_day["ticker"]==ticker)]
    
    filtered=filtered.sort_values(by="date")
    return filtered

def plot_ticker_with_period(ticker,periode):
    sub=filter_data_by_period(ticker,periode)

    if not sub.empty:
        firstClose=sub["close"].iloc[0]
        lastClose=sub["close"].iloc[-1]
        var=((lastClose-firstClose)/firstClose)*100
    else:
        var=0

    if var>0:
        varClose=f"+{var:.2f}%"
    else:
        varClose=f"{var:.2f}%"

    if(periode=="1 Day"):
        sub.loc[sub["date"].diff()>timedelta(hours=12),"close"]=None
        sub["heure"]=sub["date"].dt.strftime("%d %H:%M")
        sub=sub.sort_values(by="date")
        x_label=sub["heure"]
    elif(periode=="1 Week"):
        sub.loc[sub["date"].diff()>timedelta(hours=12),"close"]=None
        sub=sub.sort_values(by="date")
        sub["day"]=sub["date"].dt.strftime("%d %H:%M")
        x_label=sub["day"]
    else:
        sub=sub.sort_values(by="date")
        x_label=sub["date"]
    
    fig=go.Figure()
    fig.add_trace(go.Scatter(
        x=x_label,
        y=sub["close"],
        mode="lines",
        name=f"Close value ({ticker})",
        line=dict(color="blue",width=2),
        connectgaps=False
    ))
    if(periode=="1 Day"):
        titlex="Hour"
        ntickss=24
    elif((periode=="1 Week")):
        titlex="Date"
        ntickss=7
    else:
        titlex="Date"

    if((periode=="1 Day")or(periode=="1 Week")):
        xaxiss=dict(title=titlex,type="category",nticks=ntickss,showgrid=True)
    else:
        xaxiss=dict(title=titlex,showgrid=True)

    fig.update_layout(
        title=f"Close values for {ticker} ({periode}) , {varClose}",
        xaxis=xaxiss,
        yaxis_title="Close value (in $)",
        template="plotly_white"
    )
    
    fig.show()

tickers=valid_nasdaq_list
periode=["1 Day","1 Week","1 Month","6 Months","1 Year","5 Years"]

interact(
    plot_ticker_with_period,
    ticker=widgets.Dropdown(options=tickers,description="Select Ticker: "),
    periode=widgets.Dropdown(options=periode,description="Select Period: ")
)

interactive(children=(Dropdown(description='Select Ticker: ', options=('AADI', 'AAL', 'AAOI', 'AAON', 'AAPB', …

<function __main__.plot_ticker_with_period(ticker, periode)>