# Can we use Sentiment Analysis to Predict Stock Prices?


In [None]:
# Report the NVIDIA GPU(s), if any, that are visible to this runtime.
!nvidia-smi

In [None]:
!pip install yfinance -q
!pip install tqdm

In [None]:
# As we Scrape all the data with snscrape from twitter I commented this part
!pip install snscrape -q

In [None]:
!pip install transformers -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import plotly.graph_objects as go
from tqdm import tqdm

## Example: Pulling Microsoft Stock Data

In [None]:
# yfinance handle for the Microsoft ticker; data is fetched through it below.
msft = yf.Ticker("MSFT")

In [None]:
# Request the longest price history available for MSFT (period="max").
msft_hist = msft.history(period="max")

In [None]:
# Preview the first rows of the MSFT history.
msft_hist.head()

In [None]:
# Summarise the frame: index range, column dtypes, non-null counts, memory use.
msft_hist.info()

In [None]:
# Line chart of the MSFT opening price over the full history (15x5 inches).
msft_hist["Open"].plot.line(title="MSFT Stock Price", figsize=(15, 5))
plt.show()

In [None]:
# Ticker symbols analysed in this notebook; the order here is the order in
# which their histories are downloaded and plotted.
stocks = [
    "NVDA", "PINS", "TSLA", "SHOP", "O",
    "NKE", "GOOG", "META", "MSFT", "AMZN",
    "ZM", "PFE", "MRNA", "SPOT", "TMUS",
]

In [None]:
# Download three years of price history for every ticker, keyed by symbol.
hists = {symbol: yf.Ticker(symbol).history(period="3y") for symbol in stocks}

In [None]:
# Sanity check: number of tickers for which a history frame was stored.
len(hists)

# Plot the Stock Price

In [None]:
# Draw one interactive candlestick chart per ticker from its OHLC columns.
for ticker in stocks:
    ohlc = hists[ticker]

    candles = go.Candlestick(
        x=ohlc.index,
        open=ohlc["Open"],
        high=ohlc["High"],
        low=ohlc["Low"],
        close=ohlc["Close"],
    )

    fig = go.Figure(data=[candles])
    # Compact layout so the whole list of charts stays easy to scroll through.
    fig.update_layout(
        title=ticker,
        height=300,
        paper_bgcolor="LightSteelBlue",
        margin={"l": 20, "r": 20, "t": 60, "b": 20},
    )

    fig.show()

# Pull Tweets about each Stock

In [None]:
# Earliest timestamp in the downloaded NVDA price history.
hists["NVDA"].index.min()

In [None]:
# Ticker symbol interpolated into the search query of the (commented-out) scraper below.
stock = "NVDA"

In [None]:
# Source code that was used to scrape the tweets. It is kept for reference only
# and left commented out; the notebook reads the saved CSV instead.

# # importing libraries and packages
# import snscrape.modules.twitter as sntwitter
# from tqdm.notebook import tqdm

# # Creating list to append tweet data
# tweets_list = []
# # Using TwitterSearchScraper to scrape data and append tweets to list
# for i, tweet in tqdm(
#     enumerate(
#         sntwitter.TwitterSearchScraper(
#             f"${stock} since:2020-02-04 until:2023-05-02"
#         ).get_items()
#     ),
#     total=100_000,
# ):  # progress bar is sized for 100,000 tweets
#     if i > 100_000:  # cap on the number of tweets to scrape (note: `>` lets 100,001 through)
#         break
#     tweets_list.append(
#         [tweet.date, tweet.id, tweet.content, tweet.user.username]
#     )  # attributes kept for each tweet
# # Creating a dataframe from the tweets list above
# tweet_df = pd.DataFrame(
#     tweets_list, columns=["Datetime", "Tweet Id", "Text", "Username"]
# )

In [None]:
# Load the previously saved NVIDIA tweets from the Kaggle input dataset.
tweet_df=pd.read_csv("/kaggle/input/100k-nvidia-tweets/Nvidia-Tweets.csv")

In [None]:
# Show the loaded tweets (pandas truncates the display of large frames).
tweet_df

In [None]:
# Count missing values per column before deciding how to handle them.
tweet_df.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column.
tweet_df = tweet_df.dropna()

In [None]:
# (rows, columns) of the tweet frame at this point in the notebook.
tweet_df.shape

In [None]:
# Disabled export of the tweet frame to CSV; uncomment to write the file again.
# tweet_df.to_csv('Nvidia-Tweets.csv')

In [None]:
# Disabled export of the tweet frame to Parquet; uncomment to write the file.
# tweet_df.to_parquet("Nvidia_tweets.parquet")

## Sentiment Analysis Prep

In [None]:
from transformers import pipeline

# Hugging Face checkpoint that backs the sentiment pipeline.
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

sentiment_task = pipeline(task="sentiment-analysis", model=model)

# Smoke test: score one sentence to confirm the pipeline loads and responds.
sentiment_task(
    "I love to code in python and it's amazing to use huggingface for sentiment analysis"
)

In [None]:
# Score the tweets one by one and keep the raw pipeline output per tweet id.
#
# Result shape: sent_results[tweet_id] is whatever the pipeline returns for a
# single text (the downstream cell reads element 0 of it as a dict with
# "label" and "score" keys).

# Upper bound on how many tweets are scored; model inference dominates the
# run time of this notebook, so only the first MAX_TWEETS rows are processed.
MAX_TWEETS = 60000

tweets_to_score = tweet_df.head(MAX_TWEETS)

sent_results = {}
for tweet_id, text in tqdm(
    zip(tweets_to_score["Tweet Id"], tweets_to_score["Text"]),
    total=len(tweets_to_score),  # size the bar by what is actually scored
):
    # truncation=True cuts texts that exceed the model's maximum sequence
    # length instead of letting a single over-long tweet abort the whole run.
    sent_results[tweet_id] = sentiment_task(text, truncation=True)

In [None]:
# Peek at a handful of predictions instead of echoing the whole dict, which
# holds one entry per scored tweet and would flood the cell output.
list(sent_results.items())[:5]

In [None]:
# One row per tweet id; after the transpose, column 0 holds the raw
# prediction dict returned by the pipeline.
sent_df = pd.DataFrame(sent_results).T

# Unpack the prediction dict into flat label / score columns.
sent_df["label"] = sent_df[0].map(lambda pred: pred["label"])
sent_df["score"] = sent_df[0].map(lambda pred: pred["score"])

# Attach the original tweet columns by matching on the tweet id index.
sent_df = pd.merge(
    sent_df,
    tweet_df.set_index("Tweet Id"),
    left_index=True,
    right_index=True,
)

In [None]:
# Overlay one 50-bin histogram of the model score per predicted label.
sent_df.groupby("label")["score"].plot.hist(bins=50)
plt.legend()
plt.show()

In [None]:
# Spot-check ten randomly chosen scored tweets (unseeded, so rows vary per run).
sent_df.sample(10)

In [None]:
# Collapse (label, score) into a single signed sentiment value in "score_":
#   negative -> -score, neutral -> 0, positive -> +score.
#
# Every row gets a value here; previously only negative and neutral rows were
# assigned, so positive tweets stayed NaN and were silently left out of every
# later mean. Labels are lower-cased before matching because the label casing
# of the checkpoint is not guaranteed to be title case. A label outside the
# three known ones maps to NaN rather than being guessed at.
label_sign = (
    sent_df["label"]
    .str.lower()
    .map({"negative": -1, "neutral": 0, "positive": 1})
)
sent_df["score_"] = sent_df["score"] * label_sign

In [None]:
# Distribution of the signed sentiment score across the scored tweets.
sent_df["score_"].plot.hist(bins=50)

In [None]:
# Inspect the frame now that the signed score column has been added.
sent_df

In [None]:
# Convert the Datetime column to pandas datetimes so date parts can be extracted.
sent_df["Datetime"] = pd.to_datetime(sent_df["Datetime"])

In [None]:
# Keep only the calendar date of each tweet; this is the key for daily aggregation.
# NOTE(review): the date is taken in whatever timezone the timestamps carry —
# confirm that it matches the timezone of the stock price dates.
sent_df["Date"] = sent_df["Datetime"].dt.date

In [None]:
# Check the derived Date column.
sent_df["Date"]

In [None]:
# Mean signed sentiment per calendar day (mean() skips NaN scores).
sent_daily = sent_df["score_"].groupby(sent_df["Date"]).mean()
sent_daily

In [None]:
# Re-key the NVDA price history by plain calendar date so that it can be
# joined with the daily sentiment series.
# NOTE(review): the date is taken in the index's own timezone — confirm that
# it matches the timezone used for the tweet dates.
Nvidia_df = (
    hists["NVDA"]
    .reset_index()
    .assign(Date=lambda frame: frame["Date"].dt.date)
    .set_index("Date")
)
Nvidia_df

In [None]:
# Join daily sentiment with NVDA prices on the shared Date index; the default
# inner join keeps only the days that appear in both.
sent_and_stock = pd.merge(
    sent_daily.to_frame("sentiment"),
    Nvidia_df,
    left_index=True,
    right_index=True,
)
sent_and_stock

In [None]:
# Daily sentiment (left axis) against the NVDA closing price (right axis).
# `label=` names each line; the `legend=` argument of pandas' plot is only an
# on/off flag, so passing a string there never set the legend text.
ax = sent_and_stock["sentiment"].plot(label="Sentiment")
ax.set_ylabel("Mean daily sentiment")

ax2 = ax.twinx()
sent_and_stock["Close"].plot(ax=ax2, color="orange", label="Closing Price")
ax2.set_ylabel("Closing price")

# Merge the handles of both axes into one legend so the entries do not overlap;
# it is drawn on ax2 because the twin axis is rendered on top.
left_handles, left_labels = ax.get_legend_handles_labels()
right_handles, right_labels = ax2.get_legend_handles_labels()
ax2.legend(left_handles + right_handles, left_labels + right_labels, loc="upper left")

plt.show()

In [None]:
# Raw NVDA history as downloaded, for comparison with the merged frame.
hists["NVDA"]

In [None]:
# Daily mean of the signed sentiment score as a wide (15x5) line chart.
sent_df["score_"].groupby(sent_df["Date"]).mean().plot.line(figsize=(15, 5))
