In [5]:
%load_ext lab_black

In [37]:
import pandas as pd
import numpy as np
from nltk import tokenize
from datetime import datetime
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from numba import jit, prange

In [56]:
# Load the video data set from a gzip-compressed CSV; the path is relative to the
# notebook's working directory.
nvidia_df = pd.read_csv("../00_data/Cleaned_Data_10MAR.gz", compression="gzip")

In [158]:
# Number of rows collected per search date.
# The "Date" column is only added by the feature-engineering cell further down, so
# the date is derived from "searchedDate" here; that way this cell also works on a
# fresh kernel (Restart & Run All) instead of raising KeyError.
(
    pd.to_datetime(nvidia_df["searchedDate"], utc=True)
    .dt.date.rename("Date")
    .value_counts()
    .sort_index()
)

2020-10-02    189
2020-10-03     50
2020-10-04     50
2020-10-05    275
2020-10-06    275
             ... 
2021-02-15    275
2021-02-16    275
2021-02-18    175
2021-02-19    300
2021-02-20     50
Name: Date, Length: 92, dtype: int64

In [58]:
nvidia_df.columns

Index(['Unnamed: 0', 'Index', 'videoID', 'datePub', 'searchedDate',
       'VideoTitle', 'channelTitle', 'viewCount', 'likeCount', 'dislikeCount',
       'captionString'],
      dtype='object')

In [59]:
# Coerce every raw column to the dtype the rest of the notebook relies on.
# NOTE(review): astype("str") presumably turns missing captions/titles into the
# literal text "nan" - confirm that is acceptable for the sentiment scoring.
for text_col in ("VideoTitle", "channelTitle", "captionString"):
    nvidia_df[text_col] = nvidia_df[text_col].astype("str")

# Both timestamps are parsed as timezone-aware UTC datetimes.
for stamp_col in ("datePub", "searchedDate"):
    nvidia_df[stamp_col] = pd.to_datetime(
        nvidia_df[stamp_col], format="%Y-%m-%d %H:%M:%S", utc=True
    )

# Engagement counters are whole numbers.
for count_col in ("viewCount", "likeCount", "dislikeCount"):
    nvidia_df[count_col] = nvidia_df[count_col].astype("int")

In [67]:
type(nvidia_df["VideoTitle"][0])

str

In [None]:
# X Variables = Aggregate the following values by search date/time
#     videoTitleSentimentScore_pos = vader pos label for video title
#     captionSentimentScore_pos = vader pos label for caption string
#     XXX - viewcounts_weighted = view count / number of days since published
#     XXX - like_count_weighted = like count / number of days since published
#     XXX - dislike_count_weighted = dislike count / number of days since published
#     XXX - like_dislike_ratio = like / (like + dislike)
#     XXX - weighted_like_dislike_ratio = weighted_like / (weighted_like + weighted_dislike)
#     XXX - like_dislike_ratio_by_viewcounts_weighted = like dislike ratio (video potential) * viewcounts_weighted

In [60]:
# Create the VADER sentiment analyzer used for comparing the uncleaned strings to
# the modified ones. SentimentIntensityAnalyzer is already imported in the imports
# cell at the top of the notebook, so it is not imported a second time here.
sid = SentimentIntensityAnalyzer()

In [61]:
# Spot check: run VADER on one caption (row label 5) and print the full score dict
# (neg / neu / pos / compound).
# print(f'Title to score = {nvidia_df["captionString"][5]}')
sample_caption = str(nvidia_df["captionString"][5])
score = sid.polarity_scores(sample_caption)
print(score)

{'neg': 0.037, 'neu': 0.846, 'pos': 0.117, 'compound': 0.9994}


In [78]:
# Score every title and every caption exactly once and read both the "pos" and
# "neg" fields from the same result. The previous version called polarity_scores
# twice per text, which doubles the cost of scoring the captions.
title_scores = [sid.polarity_scores(text) for text in nvidia_df["VideoTitle"]]
caption_scores = [sid.polarity_scores(text) for text in nvidia_df["captionString"]]

# Columns are added in the same order as before: title pos/neg, then caption pos/neg.
nvidia_df["videoTitleSentimentScore_pos"] = [s["pos"] for s in title_scores]
nvidia_df["videoTitleSentimentScore_neg"] = [s["neg"] for s in title_scores]
nvidia_df["captionSentimentScore_pos"] = [s["pos"] for s in caption_scores]
nvidia_df["captionSentimentScore_neg"] = [s["neg"] for s in caption_scores]

In [79]:
nvidia_df.tail(10)

Unnamed: 0.1,Unnamed: 0,Index,videoID,datePub,searchedDate,VideoTitle,channelTitle,viewCount,likeCount,dislikeCount,captionString,videoTitleSentimentScore_pos,videoTitleSentimentScore_neg,captionSentimentScore_pos,captionSentimentScore_neg
16608,16608,15,eKH6no71-UA,2021-01-20 21:56:49+00:00,2021-02-20 21:30:36.718454+00:00,Nvidia RTX Stock Coming - GPU Shortages Possib...,Graphically Challenged,178378,6173,1188,nvidia has some plans that may finally put an ...,0.0,0.186,0.138,0.034
16609,16609,16,msmmdMrKPuY,2021-02-16 23:15:04+00:00,2021-02-20 21:30:37.316991+00:00,Nvidia & AMD’s Answer to Cryptocurrency Miners,Moore's Law Is Dead,49322,3906,103,to be entirely honest i found the whole conve...,0.0,0.0,0.127,0.055
16610,16610,17,I2icFFqzrIg,2021-02-20 06:00:02+00:00,2021-02-20 21:30:37.957025+00:00,"✅NVIDIA МОЛОДЦЫ, RTX 3060 без майнинга, беда с...",HOMISH,13918,1675,69,,0.0,0.0,0.0,0.0
16611,16611,18,Dw4oet5f0dI,2020-10-05 12:55:38+00:00,2021-02-20 21:30:38.496794+00:00,GPU Technology Conference (GTC) Keynote Oct 20...,NVIDIA,219512,1531,83,I am an explorer. Searching for the origins of...,0.0,0.0,0.167,0.02
16612,16612,19,fVbQeO8iJh4,2021-02-19 14:00:30+00:00,2021-02-20 21:30:39.128417+00:00,Nvidia Finally Does Something About Miners,Hot News,15918,1275,30,nvidia's decided that it's had it with miners ...,0.0,0.0,0.103,0.028
16613,16613,20,cdSAJyIgmdI,2020-09-24 23:11:46+00:00,2021-02-20 21:30:39.712827+00:00,Jim Cramer breaks down Nvidia's acquisition of...,CNBC Television,66815,883,37,what do we do with the stock of nvidia now th...,0.0,0.0,0.139,0.061
16614,16614,21,YjcxrfEVhc8,2020-09-24 13:00:00+00:00,2021-02-20 21:30:40.360748+00:00,Making Nvidia’s CEO mad - RTX 3090 Review,Linus Tech Tips,3193067,142120,3797,- Nvidia's top of the line RTX 3090. (upbeat m...,0.0,0.348,0.126,0.044
16615,16615,22,iXn9O-Rzb_M,2020-12-12 03:10:26+00:00,2021-02-20 21:30:41.003955+00:00,NVIDIA might ACTUALLY be EVIL... - WAN Show De...,Linus Tech Tips,1697803,100343,1802,,0.0,0.0,0.0,0.0
16616,16616,23,HiMy1C4XVrQ,2021-01-19 23:20:39+00:00,2021-02-20 21:30:41.521426+00:00,The GPU NVIDIA didn’t tell anyone about…,Paul's Hardware,102002,7340,126,hello everyone and welcome to paul's hardware...,0.0,0.0,0.122,0.041
16617,16617,24,MyEEMZNZvjA,2021-02-10 14:00:18+00:00,2021-02-20 21:30:42.127197+00:00,Nvidia Gave Up!,Hot News,60907,3427,108,hi friends welcome to your wednesday edition o...,0.0,0.0,0.108,0.084


In [76]:
nvidia_df.iloc[:, -2:]

Unnamed: 0,videoTitleSentimentScore_pos,videoTitleSentimentScore_neg
0,0.214,0.151
1,0.403,0.000
2,0.000,0.000
3,0.000,0.000
4,0.413,0.000
...,...,...
16613,0.000,0.000
16614,0.000,0.348
16615,0.000,0.000
16616,0.000,0.000


In [175]:
# Age of each video at search time, rounded up to whole days.
time_online = nvidia_df["searchedDate"] - nvidia_df["datePub"]
nvidia_df["num_days_since_pub"] = np.ceil(time_online / np.timedelta64(1, "D"))

# Per-day engagement rates: raw counter divided by the age in days.
days_online = nvidia_df["num_days_since_pub"]
for raw_col, per_day_col in (
    ("viewCount", "viewcounts_weighted"),
    ("likeCount", "like_count_weighted"),
    ("dislikeCount", "dislike_count_weighted"),
):
    nvidia_df[per_day_col] = nvidia_df[raw_col] / days_online

# Share of likes among all like/dislike votes.
likes = nvidia_df["likeCount"]
dislikes = nvidia_df["dislikeCount"]
nvidia_df["like_dislike_ratio"] = likes / (likes + dislikes)

# Same ratio built from the per-day rates. Both rates share the same divisor, so
# algebraically this equals like_dislike_ratio (the day count cancels out).
likes_per_day = nvidia_df["like_count_weighted"]
dislikes_per_day = nvidia_df["dislike_count_weighted"]
nvidia_df["weighted_like_dislike_ratio"] = likes_per_day / (
    likes_per_day + dislikes_per_day
)

# Like share scaled by the per-day view rate.
nvidia_df["like_dislike_ratio_by_viewcounts_weighted"] = (
    nvidia_df["like_dislike_ratio"] * nvidia_df["viewcounts_weighted"]
)

# Calendar date of the search; used as the grouping key for the daily aggregation.
search_stamp = pd.to_datetime(nvidia_df["searchedDate"], format="%Y-%m-%d", utc=True)
nvidia_df["Date"] = search_stamp.dt.date

In [176]:
nvidia_df.columns

Index(['Unnamed: 0', 'Index', 'videoID', 'datePub', 'searchedDate',
       'VideoTitle', 'channelTitle', 'viewCount', 'likeCount', 'dislikeCount',
       'captionString', 'videoTitleSentimentScore_pos',
       'videoTitleSentimentScore_neg', 'captionSentimentScore_pos',
       'captionSentimentScore_neg', 'num_days_since_pub',
       'viewcounts_weighted', 'like_count_weighted', 'like_dislike_ratio',
       'weighted_like_dislike_ratio',
       'like_dislike_ratio_by_viewcounts_weighted', 'Date',
       'dislike_count_weighted'],
      dtype='object')

In [177]:
# Columns carried into the model matrix: the four sentiment scores, the engagement
# features, and the "Date" key used for the daily aggregation.
feature_columns = [
    "videoTitleSentimentScore_pos",
    "videoTitleSentimentScore_neg",
    "captionSentimentScore_pos",
    "captionSentimentScore_neg",
    "num_days_since_pub",
    "viewcounts_weighted",
    "like_count_weighted",
    "like_dislike_ratio",
    "weighted_like_dislike_ratio",
    "like_dislike_ratio_by_viewcounts_weighted",
    "Date",
]
X_features = nvidia_df.filter(items=feature_columns, axis=1)

In [178]:
# Collapse to one row per search date by averaging every feature; "Date" stays a
# regular column so it can be used as the merge key later.
X_feautures_pre_merge = X_features.groupby("Date", as_index=False).mean()

In [179]:
X_feautures_pre_merge.tail()

Unnamed: 0,Date,videoTitleSentimentScore_pos,videoTitleSentimentScore_neg,captionSentimentScore_pos,captionSentimentScore_neg,num_days_since_pub,viewcounts_weighted,like_count_weighted,like_dislike_ratio,weighted_like_dislike_ratio,like_dislike_ratio_by_viewcounts_weighted
87,2021-02-15,0.03468,0.077251,0.107473,0.031818,115.607273,13347.7284,693.67259,0.956675,0.956675,12957.795705
88,2021-02-16,0.040633,0.05196,0.102418,0.032844,120.298182,25551.254491,1678.468242,0.960266,0.960266,25040.842574
89,2021-02-18,0.046834,0.058583,0.101794,0.032863,119.245714,16224.086329,939.047614,0.954441,0.954441,15837.504011
90,2021-02-19,0.03389,0.069657,0.097573,0.03132,113.623333,19150.754564,1316.566845,0.958693,0.958693,18586.708998
91,2021-02-20,0.04538,0.07332,0.10312,0.0347,101.6,41163.2218,3604.290577,0.958253,0.958253,40312.584142


In [180]:
# This is a function to generate the response-variable dataframe.
# Decision rule:
# If the change in "High" over 5 calendar days is greater than 3%, we give it the label "buy"
# If the change in "High" over 5 calendar days is smaller than -3%, we give it the label "sell"
# Otherwise, we give it the label "hold"
def create_response(start_date, end_date, stock_name, horizon=5, threshold=0.03):
    """Build the buy/hold/sell response table for one ticker.

    Daily prices are downloaded with yfinance, re-indexed to every calendar day
    (days without a quote carry the previous value forward), and for each day the
    relative change of "High" between that day and ``horizon`` calendar days later
    is computed. The last ``horizon`` days have no future value and are dropped.

    Parameters
    ----------
    start_date, end_date :
        Passed straight to ``yf.download`` as ``start`` / ``end``.
    stock_name :
        Ticker symbol passed to ``yf.download``.
    horizon : int, default 5
        Look-ahead distance in calendar days.
    threshold : float, default 0.03
        Absolute relative change that separates "hold" from "buy" / "sell".

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (the day), ``percent_change`` and ``action``.
        ``action`` is categorical with labels "sell", "hold", "buy". The bins are
        right-closed, so a change of exactly ``-threshold`` is labelled "sell" and
        a change of exactly ``+threshold`` is labelled "hold".

    Raises
    ------
    ValueError
        If ``horizon`` is smaller than 1 or ``threshold`` is not positive.
    """
    # yfinance is not imported at notebook level, so it stays a local import.
    import yfinance as yf

    if horizon < 1:
        raise ValueError("horizon must be at least 1 day")
    if threshold <= 0:
        raise ValueError("threshold must be positive")

    # get stock info; .ffill() replaces the deprecated fillna(method="ffill")
    data = yf.download(stock_name, start=start_date, end=end_date).asfreq("D").ffill()

    high = data["High"]
    if isinstance(high, pd.DataFrame):
        # NOTE(review): some yfinance versions appear to return two-level columns
        # even for a single ticker; in that case take the only "High" column.
        high = high.iloc[:, 0]

    # Vectorised version of "(High[i + horizon] - High[i]) / High[i]"; the trailing
    # rows without a future value are cut off, as the original loop never visited them.
    percent_change = (high.shift(-horizon) - high) / high
    percent_change = percent_change.iloc[: max(len(percent_change) - horizon, 0)]

    # Drop the index name so that reset_index() yields a column called "index",
    # which is the name the merge cell further down expects.
    response_df = percent_change.rename("percent_change").rename_axis(None).to_frame()

    # Decision rule
    bins = [-np.inf, -threshold, threshold, np.inf]
    names = ["sell", "hold", "buy"]
    response_df["action"] = pd.cut(response_df["percent_change"], bins, labels=names)
    return response_df.reset_index()

In [181]:
# Build the NVDA buy/hold/sell labels for 2020-10-02 .. 2021-02-28.
# NOTE: the (misspelled) name `responce_df` is the one the later merge cells use.
responce_df = create_response("2020-10-02", "2021-02-28", "NVDA")

[*********************100%***********************]  1 of 1 completed


In [182]:
responce_df

Unnamed: 0,index,percent_change,action
0,2020-10-02,0.044362,buy
1,2020-10-03,0.041791,buy
2,2020-10-04,0.022541,hold
3,2020-10-05,0.013192,hold
4,2020-10-06,-0.022469,hold
...,...,...,...
138,2021-02-17,-0.005928,hold
139,2021-02-18,-0.042660,sell
140,2021-02-19,-0.044162,sell
141,2021-02-20,-0.052860,sell


In [183]:
# Bring both merge keys to plain datetime.date objects so the join compares like
# with like (features are keyed by "Date", responses by "index").
for frame, key_col in ((X_feautures_pre_merge, "Date"), (responce_df, "index")):
    frame[key_col] = pd.to_datetime(
        frame[key_col], format="%Y-%m-%d", utc=True
    ).dt.date

In [184]:
type(X_feautures_pre_merge["Date"][0])
X_feautures_pre_merge["Date"].tail(50)

42    2020-11-21
43    2020-11-22
44    2020-11-23
45    2020-11-24
46    2020-11-25
47    2020-11-26
48    2020-11-30
49    2020-12-02
50    2020-12-05
51    2020-12-06
52    2020-12-07
53    2020-12-08
54    2020-12-12
55    2020-12-15
56    2020-12-18
57    2020-12-19
58    2020-12-20
59    2020-12-21
60    2020-12-22
61    2020-12-24
62    2020-12-27
63    2021-01-02
64    2021-01-04
65    2021-01-07
66    2021-01-08
67    2021-01-15
68    2021-01-16
69    2021-01-17
70    2021-01-18
71    2021-01-19
72    2021-01-20
73    2021-01-21
74    2021-01-23
75    2021-01-28
76    2021-01-29
77    2021-01-30
78    2021-02-01
79    2021-02-02
80    2021-02-04
81    2021-02-06
82    2021-02-07
83    2021-02-08
84    2021-02-10
85    2021-02-11
86    2021-02-14
87    2021-02-15
88    2021-02-16
89    2021-02-18
90    2021-02-19
91    2021-02-20
Name: Date, dtype: object

In [185]:
type(responce_df["index"][0])
responce_df["index"][0]

datetime.date(2020, 10, 2)

In [186]:
# Preview of the feature/response join: every feature day is kept (left join) and
# the "_merge" indicator column shows whether a response row was found for it.
# The result is only displayed here, not stored.
pd.merge(
    X_feautures_pre_merge,
    responce_df,
    how="left",
    left_on="Date",
    right_on="index",
    indicator=True,
).tail(15)

Unnamed: 0,Date,videoTitleSentimentScore_pos,videoTitleSentimentScore_neg,captionSentimentScore_pos,captionSentimentScore_neg,num_days_since_pub,viewcounts_weighted,like_count_weighted,like_dislike_ratio,weighted_like_dislike_ratio,like_dislike_ratio_by_viewcounts_weighted,index,percent_change,action,_merge
77,2021-01-30,0.0433,0.09506,0.0971,0.02918,106.7,18275.533209,878.110445,0.954994,0.954994,17703.088414,2021-01-30,0.025404,hold,both
78,2021-02-01,0.03264,0.11668,0.10196,0.03156,107.0,18492.362445,893.631102,0.958608,0.958608,17963.831559,2021-02-01,0.035152,buy,both
79,2021-02-02,0.05396,0.09856,0.10176,0.0318,112.44,14544.14136,706.864278,0.954427,0.954427,14045.284284,2021-02-02,0.012176,hold,both
80,2021-02-04,0.04504,0.095436,0.092505,0.028229,119.96,16304.232208,795.245302,0.955465,0.955465,15776.310523,2021-02-04,0.066534,buy,both
81,2021-02-06,0.03686,0.08854,0.09764,0.02798,114.78,17639.624336,876.051837,0.953125,0.953125,17063.649078,2021-02-06,0.11174,buy,both
82,2021-02-07,0.04872,0.07178,0.09878,0.02872,119.12,12666.187063,598.649973,0.950765,0.950765,12222.779078,2021-02-07,0.113123,buy,both
83,2021-02-08,0.04252,0.099213,0.102007,0.03074,117.62,13667.124135,661.589898,0.953168,0.953168,13233.470458,2021-02-08,0.056482,buy,both
84,2021-02-10,0.039073,0.063269,0.099702,0.030247,117.312727,12668.794516,650.513536,0.954724,0.954724,12274.350827,2021-02-10,0.025914,hold,both
85,2021-02-11,0.046727,0.076058,0.102509,0.033382,118.323636,13181.323416,661.530333,0.956586,0.956586,12792.14821,2021-02-11,0.006564,hold,both
86,2021-02-14,0.03264,0.06496,0.10512,0.03024,120.42,12743.419259,620.86322,0.953354,0.953354,12332.074447,2021-02-14,-0.007472,hold,both


### APPENDIX

In [None]:
# Appendix scratch copy of the sentiment-scoring step.
# The bare pd.DataFrame(columns=[...]) expression that used to open this cell was
# never assigned or displayed, so it has been dropped. Each text is now scored once
# and both the "pos" and "neg" fields are read from that single result instead of
# running VADER twice per title and twice per caption.
title_scores = [sid.polarity_scores(text) for text in nvidia_df["VideoTitle"]]
caption_scores = [sid.polarity_scores(text) for text in nvidia_df["captionString"]]

nvidia_df["videoTitleSentimentScore_pos"] = [s["pos"] for s in title_scores]
nvidia_df["videoTitleSentimentScore_neg"] = [s["neg"] for s in title_scores]
nvidia_df["captionSentimentScore_pos"] = [s["pos"] for s in caption_scores]
nvidia_df["captionSentimentScore_neg"] = [s["neg"] for s in caption_scores]

In [6]:
# Empty frame that only declares the planned feature columns (no rows yet).
planned_feature_columns = [
    "videoTitleSentimentScore_pos",
    "captionSentimentScore_pos",
    "viewcounts_weighted",
    "like_count_weighted",
    "dislike_count_weighted",
    "like_dislike_ratio",
    "like_dislike_ratio_by_viewcounts_weighted",
]
X_features = pd.DataFrame(columns=planned_feature_columns)

In [30]:
# Age of each video at search time, rounded up to whole days.
time_online = nvidia_df["searchedDate"] - nvidia_df["datePub"]
nvidia_df["num_days_since_pub"] = np.ceil(time_online / np.timedelta64(1, "D"))

In [31]:
# Views per day since publication.
days_online = nvidia_df["num_days_since_pub"]
nvidia_df["viewcounts_weighted"] = nvidia_df["viewCount"] / days_online

In [32]:
# Likes per day since publication.
days_online = nvidia_df["num_days_since_pub"]
nvidia_df["like_count_weighted"] = nvidia_df["likeCount"] / days_online

In [33]:
# Dislikes per day since publication.
# This cell used to write its result into "like_count_weighted", which overwrote
# the likes-per-day column with dislike data; the dislike rate belongs in
# "dislike_count_weighted", matching the main feature-engineering cell.
nvidia_df["dislike_count_weighted"] = (
    nvidia_df["dislikeCount"] / nvidia_df["num_days_since_pub"]
)