# Functions

In [None]:
def scrape_data(ticker, scrolls, current_max, save):
  """Scrape messages for one ticker from the StockTwits symbol-stream API.

  Args:
    ticker: Symbol to scrape (e.g. 'NKE', 'NVDA').
    scrolls: Maximum number of pages to request; each request simulates one
      scroll of the web page. 200-400 is recommended to maximize data while
      minimizing possible web errors.
    current_max: Cursor value to start paging from. It can be found in the
      network tab of the StockTwits website: reload the page, open the JSON
      request that appears and read the max value in its "cursor" section.
    save: If true, write the result to 'stockTwits_data_{ticker}.csv' and
      return None. If false, return the DataFrame without saving.

  Returns:
    A DataFrame with the columns Message, Sentiment, Num_Comments, Date and
    Username when save is false, otherwise None. Row 0 is always the
    placeholder "Example Message" row.
  """
  columns = ['Message', 'Sentiment', 'Num_Comments', 'Date', 'Username']
  # Placeholder first row, kept so saved files keep the layout they always had.
  rows = [['Example Message', 'Bullish', '0', '2024-7-15', 'ExampleUsername']]
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'}

  for _ in range(scrolls):
    url = f'https://api.stocktwits.com/api/2/streams/symbol/{ticker}.json?filter=top&limit=22&max={current_max}'
    try:
      r = requests.get(url, headers=headers, timeout=30)
      data = r.json()
    except (requests.exceptions.RequestException, ValueError):
      # Network failure or a non-JSON body: stop and keep what was collected.
      break

    # A JSON body without a 'messages' list cannot be paged any further.
    if not isinstance(data, dict) or 'messages' not in data:
      break

    for post in data['messages']:
      # Sentiment is None when the author did not tag the message.
      sentiment = (post.get('entities') or {}).get('sentiment')
      if sentiment is not None:
        sentiment = sentiment['basic']

      # Keep only the calendar date of the timestamp.
      date = post['created_at'].split('T')[0]

      # Messages without conversation data count as having no comments.
      comments = (post.get('conversation') or {}).get('replies', 0)

      rows.append([post['body'], sentiment, comments, date, post['user']['username']])

    # Continue from the cursor returned by this page; stop when there is none.
    next_max = (data.get('cursor') or {}).get('max')
    if next_max is None:
      break
    current_max = next_max

  # Build the frame once instead of appending row by row.
  df = pd.DataFrame(rows, columns=columns)

  if save:
    df.to_csv(f'stockTwits_data_{ticker}.csv', index=False)
  else:
    return df

In [None]:
def load_data(ticker):
  """Load previously scraped data for a ticker from CSV.

  The file name is assumed to follow the format
  "stockTwits_data_{ticker}.csv", as saved by scrape_data.

  Args:
    ticker: Ticker to load (e.g. 'NKE', 'NVDA').

  Returns:
    The loaded data as a DataFrame.
  """
  csv_name = f'stockTwits_data_{ticker}.csv'
  return pd.read_csv(csv_name)

In [None]:
def clean_text(df):
  """Clean the 'Message' column of stopwords and unnecessary characters.

  Each message is converted to a string, stripped of $tickers, @mentions and
  links, lower-cased, and has punctuation and digits replaced by spaces.
  English stopwords and words of 15 or more characters (usually link
  residue) are then dropped.

  Args:
    df: DataFrame with a 'Message' column.

  Returns:
    A copy of df whose 'Message' column holds the cleaned text; df itself is
    not modified.
  """
  # Build the lookup tables once instead of once per word / per message.
  stop_words = set(stopwords.words('english'))
  punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
  digits_to_space = str.maketrans('0123456789', ' ' * 10)

  cleaned_text = []

  for tweet in df['Message']:
    # Controls for possible floats or ints in text data
    tweet = str(tweet)

    # Remove $tickers, then @mentions. Substituting by pattern (rather than
    # replacing each found string) keeps '$A' from eating the start of '$AB'.
    tweet = re.sub(r'[$]\w+', '', tweet)
    tweet = re.sub(r'[@]\w+', '', tweet)

    tweet = tweet.lower()

    # Remove links
    tweet = re.sub(r'http\S+', '', tweet)

    # Replace punctuation and digits with spaces
    tweet = tweet.translate(punct_to_space)
    tweet = tweet.translate(digits_to_space)

    # Splitting on word characters also discards any extra whitespace.
    words = re.findall(r'\w+', tweet)
    kept = [word for word in words if word not in stop_words and len(word) < 15]
    cleaned_text.append(' '.join(kept))

  clean = df.copy()
  clean['Message'] = cleaned_text

  return clean

In [None]:
def filter_df(data, date=None, word=None, comments=False):
  """Filter a DataFrame by date, word/phrase and/or presence of comments.

  Args:
    data: The DataFrame to filter. It is never modified.
    date (optional): Keep only rows whose 'Date' equals this value.
    word (optional): Keep only rows whose 'Message' contains this word or
      phrase, ignoring case. It is passed to Series.str.contains, so it is
      interpreted as a regular expression.
    comments (optional): If true, keep only rows whose Num_Comments is not 0.

  Returns:
    The filtered rows as a new DataFrame with a fresh 0..n-1 index.
  """
  if date is not None:
    data = data[data['Date'] == date]

  if word is not None:
    data = data[data['Message'].str.contains(word, case=False, na=False)]

  if comments:
    data = data[data.Num_Comments != 0]

  # Return a re-indexed copy: resetting in place would renumber the caller's
  # own DataFrame whenever no filter was applied.
  return data.reset_index(drop=True)

In [None]:
def get_tagged(df):
  """Return only the rows that carry a sentiment tag.

  Rows whose 'Sentiment' is missing are dropped and the remaining rows are
  renumbered from 0. The input DataFrame is left untouched.
  """
  return df.dropna(subset=["Sentiment"]).reset_index(drop=True)

In [None]:
def get_stock_prices(ticker):
  """Download the daily closing prices of a stock from Alpha Vantage.

  The API key is read from the Colab secret named 'alpha_key'.

  Args:
    ticker: Stock to get prices for.

  Returns:
    A DataFrame with the columns 'Date' and 'Price', one row per entry of the
    response's daily time series. Prices are kept exactly as delivered in the
    '4. close' field (callers convert them to float where needed).

  Raises:
    requests.exceptions.HTTPError: If the server answers with an error status.
    KeyError: If the response contains no 'Time Series (Daily)' entry.
  """
  alpha_key = userdata.get('alpha_key')

  url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={ticker}&outputsize=full&apikey={alpha_key}'
  r = requests.get(url, timeout=30)
  r.raise_for_status()
  stock_data = r.json()

  # Fail with the payload in the message instead of a bare KeyError, so the
  # reason given by the API is visible to the caller.
  if 'Time Series (Daily)' not in stock_data:
    raise KeyError(f"Alpha Vantage response for {ticker} has no 'Time Series (Daily)' entry: {stock_data}")

  dailies = stock_data['Time Series (Daily)']
  price = [[day, values['4. close']] for day, values in dailies.items()]

  return pd.DataFrame(price, columns=['Date', 'Price'])

In [None]:
def get_bull_prcts(df):
  """Return the bullish and bearish share of tagged messages for each date.

  Args:
    df: DataFrame with 'Date' and 'Sentiment' columns.

  Returns:
    A DataFrame with the columns 'Bullish %', 'Bearish %' and 'Date', one row
    per distinct date, sorted by date. The shares are fractions between 0 and
    1 of the messages on that date that have a sentiment; a label that does
    not occur on a date gets 0.
  """
  rows = []

  # One pass per distinct date, in order of first appearance.
  for date in df['Date'].drop_duplicates():
    day_sentiment = df.loc[df['Date'] == date, 'Sentiment']
    # normalize=True turns the counts into shares of the day's tagged messages.
    shares = day_sentiment.value_counts(normalize=True)
    rows.append((shares.get('Bullish', 0), shares.get('Bearish', 0), date))

  prcts = pd.DataFrame(rows, columns=['Bullish %', 'Bearish %', 'Date'])
  return prcts.sort_values(by=['Date'])

In [None]:
def sentiment_time_graph(prcts, price, date_range, column):
  """Show daily sentiment and the stock price over time in one figure.

  Args:
    prcts: DataFrame with a 'Date' column and the sentiment column to plot.
    price: DataFrame with 'Date' and 'Price' columns.
    date_range: The range of dates shown on the x axis.
    column: Name of the column of prcts to plot (bearishness or bullishness).
  """
  # The price axis runs from 0 to the highest price in the data.
  price_ceiling = max(price['Price'].astype(float))

  sentiment_trace = go.Scatter(x=prcts['Date'], y=prcts[column], fill='tozeroy', mode="lines+markers")
  price_trace = go.Scatter(x=price['Date'], y=price['Price'], mode="lines")

  # Sentiment goes on the primary y axis, price on the secondary one.
  fig = make_subplots(specs=[[{"secondary_y": True}]])
  fig.add_trace(sentiment_trace, secondary_y=False)
  fig.add_trace(price_trace, secondary_y=True)

  fig.update_xaxes(range=date_range)
  fig.update_yaxes(range=[0, price_ceiling], secondary_y=True)
  fig.update_layout(title={'text': 'Stock Twits Sentiment Over Time, by Day'}, autotypenumbers='convert types')

  fig.show()

In [None]:
def sentiment_time_graph_week(prcts, price, date_range, column):
  """Show weekly sentiment and the weekly stock price in one figure.

  Args:
    prcts: Sequence of weekly sentiment values to plot.
    price: Series of weekly prices whose index holds the week dates.
    date_range: [start, end] dates as 'YYYY-MM-DD' strings. Both must occur
      in the index of price, otherwise a ValueError is raised.
    column: Not used by this function.
  """
  # The price axis runs from 0 to the highest price in the data.
  price_ceiling = max(price.values.astype(float))

  fig = make_subplots(specs=[[{"secondary_y": True}]])

  # One x position per week inside the requested range.
  weeks = pd.date_range(start=date_range[0], end=date_range[1], freq='W')

  # NOTE(review): prcts is plotted in full against `weeks`; this presumably
  # only lines up if prcts starts at date_range[0] -- confirm with the caller.
  fig.add_trace(
      go.Scatter(x=weeks, y=prcts, fill='tozeroy', mode="lines+markers"),
      secondary_y=False,
  )

  # Positions of the range boundaries within the weekly price index.
  week_labels = list(price.keys().strftime("%Y-%m-%d"))
  first = week_labels.index(date_range[0])
  last = week_labels.index(date_range[1])

  fig.add_trace(
      go.Scatter(x=weeks, y=price[first:last], mode="lines"),
      secondary_y=True,
  )

  fig.update_xaxes(range=date_range)
  fig.update_yaxes(range=[0, price_ceiling], secondary_y=True)
  fig.update_layout(title={'text': 'Stock Twits Sentiment Over Time, by Week'}, autotypenumbers='convert types')

  fig.show()

In [None]:
def untokenize(ngram):
    """Join tokenized words back into readable text.

    Tokens that start with an apostrophe, that are found in
    string.punctuation, or that equal "n't" are attached directly to the
    preceding token; every other token is preceded by a space.
    """
    pieces = []
    for token in ngram:
        attaches = token.startswith("'") or token in string.punctuation or token == "n't"
        pieces.append(token if attaches else " " + token)
    return "".join(pieces).strip()

In [None]:
def counts(x, data, n):
  """Return the n most common words or phrases in the provided data.

  Args:
    x: The number of words in each phrase
      (e.g. 1 -> 'apple', 'banana'; 2 -> 'apple juice', 'banana pudding').
    data: The series (DataFrame column) of messages to take the phrases from.
    n: The number of phrases to return.

  Returns:
    A DataFrame with the columns 'Phrase' and 'Count'.
  """
  # Phrases are built per sentence, so they never span a sentence boundary.
  tally = Counter(
      untokenize(gram)
      for message in data
      for sentence in nltk.sent_tokenize(message)
      for gram in ngrams(nltk.word_tokenize(sentence), x)
  )

  return pd.DataFrame(tally.most_common(n), columns=['Phrase', 'Count'])

In [None]:
def display_word_counts(n, data, x, title):
  """Show the most common phrases as bar graphs, one graph per phrase length.

  Same as counts, but displayed instead of returned.

  Args:
    n: List of phrase lengths (words per phrase); one graph is created for
      each entry.
    data: The series (DataFrame column) of messages to take the phrases from.
    x: The number of phrases shown in each graph.
    title: Prefix of each graph title.
  """
  for phrase_len in n:
    top_phrases = counts(phrase_len, data, x)
    fig = px.bar(top_phrases, x='Phrase', y='Count')
    fig.update_layout(title={'text': f'{title}, {phrase_len} Word Phrases'})
    fig.show()

In [None]:
def word_count_plot(data, daterange, price):
  """Graph the daily mentions of a word together with the stock price.

  Args:
    data: The DataFrame, already filtered by a single word, to graph. It must
      have a 'Date' column; each row counts as one mention.
    daterange: The range of dates shown on the x axis.
    price: DataFrame with 'Date' and 'Price' columns.
  """
  p_max = max(price['Price'].astype(float))

  per_day = data.value_counts(subset='Date')
  mention_dates = pd.to_datetime(per_day.index.tolist())
  w_cnts = pd.DataFrame({'Word Mentions': per_day.tolist(), 'Dates': mention_dates})

  # value_counts orders by frequency, not by date, so the calendar has to span
  # the earliest to the latest date rather than the first to the last entry.
  if w_cnts.empty:
    calendar = pd.DatetimeIndex([])
  else:
    calendar = pd.date_range(start=mention_dates.min(), end=mention_dates.max())
  d = pd.DataFrame({'Dates': calendar})

  # Left merge keeps every calendar day; days without mentions get NaN.
  new = pd.merge(d, w_cnts, on='Dates', how='left')

  fig = make_subplots(specs=[[{"secondary_y": True}]])

  fig.add_trace(
      go.Bar(x=new['Dates'], y=new['Word Mentions']),
      secondary_y=False,
  )

  fig.add_trace(
      go.Scatter(x=price['Date'], y=price['Price']),
      secondary_y=True,
  )

  fig.update_xaxes(range=daterange)
  fig.update_yaxes(range=[0, p_max], secondary_y=True)
  fig.update_layout(autotypenumbers='convert types')

  fig.update_layout(title={'text': "Word Counts Over Time"})

  fig.show()

In [None]:
def tokenize_text(train, test):
  """Replace the messages of the train and test DataFrames by token counts.

  Both frames are stacked and vectorized together with one CountVectorizer,
  so they share a single vocabulary. Each 'Message' becomes the list of counts
  for that row.

  Args:
    train: DataFrame with a 'Message' column.
    test: DataFrame with a 'Message' column.

  Returns:
    A (train, test) tuple of tokenized DataFrames, split at the original
    length of train. The rows are renumbered across the stacked frame.
  """
  split_at = len(train)
  combined = pd.concat([train, test]).reset_index(drop=True)

  vectorizer = CountVectorizer()
  count_matrix = vectorizer.fit_transform(combined['Message'].values)

  # Dense list-of-lists so that every row holds its own count vector.
  combined['Message'] = count_matrix.toarray().tolist()

  return combined.iloc[:split_at], combined.iloc[split_at:]

In [None]:
def get_x_y(train_data):
  """Split training data into features (x) and labels (y).

  Args:
    train_data: DataFrame with a 'Message' column holding one token-count
      list per row and a 'Sentiment' column.

  Returns:
    A tuple (X_data, Y_data): the messages stacked into a NumPy array and the
    'Sentiment' column as a Series.
  """
  features = np.array(train_data['Message'].tolist())
  labels = train_data['Sentiment']
  return features, labels

In [None]:
def fit_model(X_data, Y_data):
  """Fit a logistic regression sentiment model and print its test metrics.

  The data is split 80/20 into train and test sets, the features are
  standardized, and precision and recall (for the 'Bearish' label) plus
  accuracy on the test set are printed.

  Args:
    X_data: Feature array, one row per message.
    Y_data: Sentiment labels for the rows of X_data.

  Returns:
    The fitted LogisticRegression model.
  """
  X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, train_size=0.8, random_state=31)

  scaler = preprocessing.StandardScaler()
  X_train = scaler.fit_transform(X_train)
  # Apply the statistics learned from the training split. Refitting on the
  # test split would scale the two sets differently and distort the metrics.
  X_test_scaled = scaler.transform(X_test)

  sent_model = LogisticRegression(C=0.01, penalty='l2', solver='lbfgs', class_weight=None)
  sent_model.fit(X_train, Y_train)

  y_pred = sent_model.predict(X_test_scaled)

  prec = precision_score(y_true=Y_test, y_pred=y_pred, pos_label='Bearish')
  rec = recall_score(y_true=Y_test, y_pred=y_pred, pos_label="Bearish")
  acc = accuracy_score(y_true=Y_test, y_pred=y_pred)
  print(f"precision = {prec}")
  print(f"recall = {rec}")
  print(f"accuracy = {acc}")

  # NOTE(review): the model is trained on standardized features but the
  # scaler is not returned, so callers that predict on raw token counts feed
  # it differently scaled input -- confirm whether that is intended.
  return sent_model

In [None]:
def tag_tweets(ticker, sent_model, token_df, text_df, save):
  """Tag untagged tweets with a sentiment predicted by the provided model.

  Args:
    ticker: Ticker used in the output file name.
    sent_model: Trained sentiment model; its predict method is called with
      the stacked token vectors.
    token_df: DataFrame whose 'Message' column holds token vectors and whose
      'Sentiment' column is missing for the rows to be tagged.
    text_df: The same DataFrame as token_df but with the tweet text in
      'Message' instead of tokens.
    save: Whether to save the tagged data to 'sent_data_{ticker}.csv' and
      download it.

  Returns:
    A DataFrame in the row order of token_df, with the missing sentiments
    filled in and 'Message' replaced by the text from text_df.
  """
  # Split on the Sentiment column only. Selecting rows with *any* missing
  # value would also pick up tagged rows that lack e.g. a username, which
  # would then be duplicated and have their user-supplied tag overwritten.
  needs_tag = token_df['Sentiment'].isnull()
  tagged_data = token_df[~needs_tag]
  # Explicit copy so the assignment below does not write into a slice.
  null_data = token_df[needs_tag].copy()

  # Nothing to predict when every row is already tagged.
  if not null_data.empty:
    null_data_tokens = np.array(null_data['Message'].tolist())
    null_data['Sentiment'] = sent_model.predict(null_data_tokens)

  final = pd.concat([tagged_data, null_data]).sort_index()
  # Swap the token vectors for the readable text (aligned on the index).
  final['Message'] = text_df['Message']

  if save:
    # Save overall df with tagged tweets, date, and time
    final.to_csv(f'sent_data_{ticker}.csv')
    files.download(f'sent_data_{ticker}.csv')

  return final

In [None]:
def train_sgd_model(save, tickers=None):
  """Completely retrain the SGD sentiment model, and save it if wanted.

  The saved CSVs of all training tickers are loaded, cleaned and reduced to
  the user-tagged messages. A CountVectorizer -> TfidfTransformer ->
  SGDClassifier pipeline is fitted on 80% of them, and accuracy plus a
  classification report for the remaining 20% are printed. Add additional
  tickers to improve the metrics.

  Args:
    save: If true, save the fitted pipeline to Google Drive.
    tickers (optional): Tickers whose CSVs are used as training data, in
      concatenation order. Defaults to the built-in training list.

  Returns:
    The fitted pipeline.
  """
  if tickers is None:
    tickers = ['NVDA', 'SCHW', 'NKE', 'AMD', 'CERE', 'LLY', 'ABBV', 'IMGN', 'CBAY', 'JNJ', 'PFE', 'AZN',
               'ROKU', 'MRK', 'NVS', 'LABP', 'HARP', 'ALPN', 'ALIM', 'DCPH', 'CALT', 'SLS', 'TERN', 'ALT',
               'CYTK', 'KRYS', 'SWTX', 'BCRX', 'ADMA', 'ACET', 'BMEA', 'BBIO', 'SAVA', 'CELU']

  # Creating training data dataframe
  training_data = pd.concat([load_data(ticker) for ticker in tickers])

  # Every scraped file carries a placeholder row tagged 'Bullish'. Remove all
  # of them by content; dropping index 0 after tagging only removed the
  # placeholder of the first file.
  is_placeholder = (training_data['Message'] == 'Example Message') & (training_data['Username'] == 'ExampleUsername')
  training_data = training_data[~is_placeholder]

  cleaned_train = clean_text(training_data)
  tagged_train = get_tagged(cleaned_train)

  # Train/test split
  X_train, X_test, y_train, y_test = train_test_split(tagged_train['Message'], tagged_train['Sentiment'], test_size=0.2, random_state=31)

  # Training model
  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='modified_huber', penalty='l2', alpha=1e-4, random_state=42, max_iter=1000, tol=None)),
                 ])

  sgd.fit(X_train, y_train)

  y_pred = sgd.predict(X_test)

  # Metrics
  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred, target_names=['Bearish', 'Bullish']))

  if save:
    # Save model in drive to prevent retraining if not needed
    # NOTE(review): tag_tweets_sgd loads the model from the
    # "My Drive/Stock Twits Project/" folder, not from "My Drive/" -- confirm
    # which location is intended.
    model_save_name = 'st_sentiment_sgd.pt'
    path = f"/content/gdrive/My Drive/{model_save_name}"
    torch.save(sgd, path)

  return sgd


In [None]:
def tag_tweets_sgd(data):
  """Tag untagged tweets using the presaved SGD model.

  Args:
    data: DataFrame with a 'Message' column of text and a 'Sentiment' column
      that is missing for the rows to be tagged.

  Returns:
    A DataFrame in the row order of data in which every missing sentiment is
    replaced by the model's prediction; existing tags are kept.
  """
  # Load saved model
  # NOTE(security): torch.load unpickles the file, so only load model files
  # you created yourself.
  model_save_name = 'st_sentiment_sgd.pt'
  path = f"/content/gdrive/My Drive/Stock Twits Project/{model_save_name}"
  sent_model = torch.load(path)

  # Separate null and tagged data on the Sentiment column only. Selecting
  # rows with *any* missing value would also pick up tagged rows that lack
  # another field, duplicating them and overwriting their tag.
  needs_tag = data['Sentiment'].isnull()
  tagged_data = data[~needs_tag]
  # Explicit copy so the assignment below does not write into a slice.
  null_data = data[needs_tag].copy()

  # Nothing to predict when every row is already tagged.
  if not null_data.empty:
    null_data_messages = np.array(null_data['Message'].tolist())
    null_data['Sentiment'] = sent_model.predict(null_data_messages)

  return pd.concat([tagged_data, null_data]).sort_index()

In [None]:
def separate_by_ticker(data, ticker):
  """Return the rows of the Discord data that mention the given ticker.

  Args:
    data: DataFrame with a 'Ticker' column.
    ticker: Ticker to select.

  Returns:
    The rows whose 'Ticker' equals ticker, with their original index.
  """
  mentions_ticker = data['Ticker'] == ticker
  return data[mentions_ticker]

In [None]:
def get_word_cnts_date(word, ticker):
  """Get the number of messages mentioning a word or phrase for each day.

  Args:
    word: Word or phrase to count (matched as in filter_df).
    ticker: Ticker whose saved data is searched.

  Returns:
    A DataFrame with the columns 'Date' and 'Word Mentions' that has one row
    for every calendar day between the earliest and the latest mention. Days
    without a mention hold NaN. The frame is empty when the word never occurs.
  """
  data = filter_df(load_data(ticker), word=word)

  per_day = data.value_counts(subset='Date')
  mention_dates = pd.to_datetime(per_day.index.tolist())
  w_cnts = pd.DataFrame({'Word Mentions': per_day.tolist(), 'Date': mention_dates})

  # value_counts orders by frequency, not by date, so the calendar has to span
  # the earliest to the latest date rather than the first to the last entry.
  if w_cnts.empty:
    calendar = pd.DatetimeIndex([])
  else:
    calendar = pd.date_range(start=mention_dates.min(), end=mention_dates.max())
  d = pd.DataFrame({'Date': calendar})

  # Left merge keeps every calendar day.
  return pd.merge(d, w_cnts, on='Date', how='left')

In [None]:
def setup_vars(ticker):
  """Set up all variables needed for the graphs of one ticker.

  Args:
    ticker: Ticker to prepare.

  Returns:
    A tuple (tagged, tagged_discord, price, bull_prcts, bull_prcts_discord):
    the cleaned and sentiment-tagged Stock Twits data, the same for the
    Discord messages of the ticker, the stock prices, and the per-day
    sentiment shares of both sources. All 'Date' columns are converted to
    datetime. The two Discord entries are None when the Discord data could
    not be prepared.
  """
  cleaned = clean_text(load_data(ticker))
  tagged = tag_tweets_sgd(cleaned)
  tagged['Date'] = pd.to_datetime(tagged['Date'])

  # Discord data is optional: any failure while preparing it yields None.
  # `except Exception` (not a bare except) keeps a manual interrupt working.
  try:
    discord_data = pd.read_csv('discord_data_tickers.csv')
    discord_data.drop(index=0, inplace=True)
    discord = separate_by_ticker(discord_data, ticker)
    discord_clean = clean_text(discord)
    tagged_discord = tag_tweets_sgd(discord_clean)
    tagged_discord.reset_index(drop=True, inplace=True)
    tagged_discord['Date'] = pd.to_datetime(tagged_discord['Date'])
  except Exception:
    tagged_discord = None

  price = get_stock_prices(ticker)
  price['Date'] = pd.to_datetime(price['Date'])
  bull_prcts = get_bull_prcts(tagged)
  bull_prcts['Date'] = pd.to_datetime(bull_prcts['Date'])

  bull_prcts_discord = None
  if tagged_discord is not None:
    try:
      bull_prcts_discord = get_bull_prcts(tagged_discord)
      bull_prcts_discord['Date'] = pd.to_datetime(bull_prcts_discord['Date'])
    except Exception:
      bull_prcts_discord = None

  return tagged, tagged_discord, price, bull_prcts, bull_prcts_discord

In [None]:
def make_df(ticker, d1, d2, bull_prcts, price):
  """Make the final per-day DataFrame used for the graphs and the model.

  Sentiment shares, prices and the daily mention counts of 'buy', 'fda' and
  'approval' are merged on 'Date'.

  Args:
    ticker: Ticker whose saved data supplies the word counts.
    d1: First date to keep (inclusive).
    d2: End date (exclusive).
    bull_prcts: Per-day sentiment shares with a 'Date' column.
    price: Prices with 'Date' and 'Price' columns.

  Returns:
    The merged rows with d1 <= Date < d2 that have a price, without the
    'Bearish %' column and renumbered from 0.
  """
  buy_counts = get_word_cnts_date('buy', ticker)
  fda_counts = get_word_cnts_date('fda', ticker)
  approval_counts = get_word_cnts_date('approval', ticker)

  # Right merges keep every price date as the backbone of the result.
  sentiment_price = pd.merge(bull_prcts, price, on='Date', how='right')
  fda_approval = pd.merge(fda_counts, approval_counts, on='Date', how='right')
  with_buy = pd.merge(sentiment_price, buy_counts, on='Date', how='left')
  merged = pd.merge(fda_approval, with_buy, on='Date', how='right')

  in_window = (merged['Date'] >= d1) & (merged['Date'] < d2)
  result = merged.loc[in_window].drop(columns=['Bearish %']).dropna(subset=['Price'])
  return result.reset_index(drop=True)

In [None]:
def make_array(ticker, d1, d2, scale, max_rows=45):
  """Make the input array for the model.

  Args:
    ticker: Ticker to prepare.
    d1: First date to include (inclusive).
    d2: End date (exclusive).
    scale: If true, standardize the 'Price' column before the data is merged.
    max_rows (optional): Maximum number of rows to return; None keeps all
      rows. Defaults to 45.

  Returns:
    A NumPy array built from make_df with missing values replaced by 0 and
    the 'Date' column removed, cut to at most max_rows rows.
  """
  _, _, price, bull_prcts, _ = setup_vars(ticker)

  if scale:
    scaler = preprocessing.StandardScaler()
    price['Price'] = scaler.fit_transform(price.loc[:, ['Price']])

  df = make_df(ticker, d1, d2, bull_prcts, price)
  df.fillna(0, inplace=True)
  df.drop(columns=['Date'], inplace=True)

  # Slicing with None keeps every row.
  return df.to_numpy()[:max_rows]

In [None]:
def generate_all(ticker, date_range):
  """Generate all graphs for a ticker.

  Shows the daily and weekly sentiment/price graphs and the most common
  phrases of the Stock Twits messages. The Discord phrase graphs are shown
  only when Discord data is available.

  Args:
    ticker: Ticker to graph.
    date_range: [start, end] dates as 'YYYY-MM-DD' strings. Both must be
      present in the weekly price index, so use Sundays.
  """
  # Setup all variables needed
  tagged, tagged_discord, price, bull_prcts, bull_prcts_discord = setup_vars(ticker)

  # Daily
  sentiment_time_graph(bull_prcts, price, date_range, 'Bullish %')

  # Make sure the date column is datetime so it can be grouped by week
  bull_prcts['Date'] = pd.to_datetime(bull_prcts['Date'])

  # Mean bullish share per week
  bull_prcts_week = bull_prcts.groupby([pd.Grouper(key='Date', freq='W')])['Bullish %'].mean()

  # Make sure the date column is datetime so it can be grouped by week
  price['Date'] = pd.to_datetime(price['Date'])

  # Highest price per week
  price_week = price.groupby([pd.Grouper(key='Date', freq='W')])['Price'].max()

  # Graphs sentiment over time, aggregated by week
  sentiment_time_graph_week(bull_prcts_week.values, price_week, date_range, 'Bullish %')

  # Discord popular words (setup_vars returns None when there is no Discord data)
  if tagged_discord is not None:
    display_word_counts([1, 2], tagged_discord['Message'], 10, "Discord Word Counts")

  # ST popular words
  display_word_counts([1, 2, 3, 4], tagged['Message'], 10, "Stock Twits Word Counts")

In [None]:
def get_news_sentiment(ticker, date_from):
  """Print the sentiment of news articles mentioning a ticker.

  The Alpha Vantage NEWS_SENTIMENT feed is queried and the sentiment scores
  recorded for the ticker are printed, followed by their average, maximum,
  minimum and count. The API key is read from the Colab secret named
  'alpha_key'.

  Args:
    ticker: Ticker to look up.
    date_from: First date to include, formatted YYYYMMDD.
  """
  # Read the key from the Colab secrets instead of embedding it in the code.
  alpha_key = userdata.get('alpha_key')

  url = f'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers={ticker}&time_from={date_from}T0130&limit=500&apikey={alpha_key}'
  response = requests.get(url, timeout=30)
  n_sent = response.json()

  # Scores of this ticker only; an article can mention several tickers.
  # A response without a 'feed' entry is treated as containing no articles.
  sents = [
      mention['ticker_sentiment_score']
      for article in n_sent.get('feed', [])
      for mention in article['ticker_sentiment']
      if mention['ticker'] == ticker
  ]

  print(sents)

  if not sents:
    # Avoid dividing by zero when nothing was found.
    print(f'no news sentiment found for {ticker}')
    return

  sents = list(map(float, sents))
  sent = sum(sents) / len(sents)

  max_sent = max(sents)
  min_sent = min(sents)
  count = len(sents)

  print(f'average sentiment: {sent}, max sentiment: {max_sent}, min sentiment: {min_sent}, number of articles: {count}')

In [None]:
def update_data(old, new, filename):
  """Update old data with new data (Discord or Stock Twits) and save it.

  The new rows are placed above the old ones and rows that occur in both are
  kept only once, preferring the copy from new.

  Args:
    old: Previously saved data.
    new: Freshly scraped data with the same columns.
    filename: CSV file the combined data is written to.
  """
  combined = pd.concat([new, old], axis=0, ignore_index=True)

  # Identify a message by its text, date and author. Whole-row comparison is
  # unreliable: missing sentiments are None in fresh data but NaN after a CSV
  # round trip, and comment counts can change between scrapes.
  key_cols = [col for col in ('Message', 'Date', 'Username') if col in combined.columns]
  if not key_cols:
    key_cols = list(combined.columns)
  keys = combined[key_cols].fillna('').astype(str)

  # keep='first' retains the copy from `new`, which sits on top.
  updated = combined[~keys.duplicated(keep='first')].reset_index(drop=True)

  updated.to_csv(f'{filename}', index=False)

In [None]:
def scrape_discord(scrolls, mess_id, save, token=None, channel_id='968325862166528011'):
  """Scrape messages from a Discord channel, paging backwards in time.

  Args:
    scrolls: Maximum number of requests; each request asks for up to 100
      messages.
    mess_id: Id of the message to start before.
    save: If true, write the result to 'discord_data_all.csv' and return
      None. If false, return the DataFrame without saving.
    token (optional): Value for the Authorization header. Defaults to the
      Colab secret named 'discord_token'.
    channel_id (optional): Id of the channel to read.

  Returns:
    A DataFrame with the columns Message, Date and Username when save is
    false, otherwise None.
  """
  # SECURITY: the authorization token must not be stored in the notebook.
  # A token that was ever committed should be treated as leaked and revoked.
  if token is None:
    token = userdata.get('discord_token')

  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', 'Authorization': token}

  rows = []

  for _ in range(scrolls):
    url = f'https://discord.com/api/v9/channels/{channel_id}/messages?before={mess_id}&limit=100'
    r = requests.get(url, headers=headers, timeout=30)

    try:
      data = r.json()
    except ValueError:
      print(f'error: {r}')
      break

    # A page of messages is a list; anything else cannot be read as messages.
    if not isinstance(data, list):
      print(f'error: {r}')
      break

    # An empty page means there is nothing older left to fetch.
    if not data:
      break

    for post in data:
      # Keep only the calendar date of the timestamp.
      date = post['timestamp'].split('T')[0]
      rows.append([post['content'], date, post['author']['global_name']])

    # Continue before the last message of this page.
    mess_id = data[-1]['id']

  df = pd.DataFrame(rows, columns=['Message', 'Date', 'Username'])

  if save:
    df.to_csv(f'discord_data_all.csv', index=False)
  else:
    return df

In [None]:
def get_discord_tickers(disc, save):
  """Extract the Discord messages that mention a ticker-like symbol.

  The first run of 3-4 consecutive capital letters in a message is taken as
  its ticker. Messages without such a run, or that are not text, are skipped.

  Args:
    disc: DataFrame with 'Message', 'Username' and 'Date' columns.
    save: If true, write the result to 'discord_data_tickers.csv' and return
      None. If false, return the DataFrame without saving.

  Returns:
    A DataFrame with the columns Message, Ticker, Username, Sentiment (always
    None) and Date when save is false, otherwise None.
  """
  rows = []

  # Walk the rows together so that username and date always come from the
  # same row as the message, even when the same text was posted more than once.
  for mess, username, date in zip(disc['Message'], disc['Username'], disc['Date']):
    if not isinstance(mess, str):
      continue

    ticks = re.search(r'[A-Z]{3,4}', mess)
    if ticks is not None:
      rows.append([mess, ticks[0], username, None, date])

  ticker_df = pd.DataFrame(rows, columns=['Message', 'Ticker', 'Username', 'Sentiment', 'Date'])

  if save:
    ticker_df.to_csv(f'discord_data_tickers.csv', index=False)
  else:
    return ticker_df

# Demo

### Imports

In [None]:
!pip install nltk

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn>=1.4.2 (from scikeras)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, scikeras
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikeras-0.13.0 scikit-learn-1.5.1


In [None]:
import requests

import numpy as np
import pandas as pd

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight

from google.colab import files, userdata

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

from collections import Counter
import functools

import torch

import scikeras

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

from datetime import timedelta

from scikeras.wrappers import KerasClassifier

In [None]:
# Download the NLTK resources used by the helper functions: the stopword
# list read by clean_text and the 'punkt' data (presumably what the
# sent_tokenize / word_tokenize calls in counts rely on).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at /content/gdrive; the saved sentiment model is read
# from (and written to) paths below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Now run all cells in the Functions section so that the helper functions are defined for later use.

### Data Prep

Either scrape new data or import old CSVs

In [None]:
# Example scraping: request 100 pages of ALT messages starting at cursor
# '582950374'. The final False means the DataFrame is returned instead of
# being saved to CSV. For new data, change the ticker and the cursor value.
new_alt = scrape_data('ALT', 100, '582950374', False)

In [None]:
# Example updating old data: combine the fresh scrape with the rows loaded
# from the existing CSV and write the result back to that file.
# Recommend about 10 scrolls/week since the last scrape, depending on message volume.
update_data(load_data('ALT'), new_alt, 'stockTwits_data_ALT.csv')

In [None]:
# Scrape Discord messages (100 requests, paging backwards from the given
# message id), then extract the messages that mention a ticker.
# Both calls pass save=False, so the DataFrames are returned rather than saved.
disc = scrape_discord(100, 1272955187287294018, False)
disc_ticks = get_discord_tickers(disc, False)

In [None]:
# Updating ticker data: combine the saved ticker-tagged Discord messages with
# the newly scraped ones and write the result back to the same CSV.
t = pd.read_csv('discord_data_tickers.csv')
update_data(t, disc_ticks, 'discord_data_tickers.csv')

Make sure to download new data after updating

In [None]:
# Creating correct data format to put in model: rows for ALT from 2024-05-08
# (inclusive) up to 2024-08-13 (exclusive); True standardizes the price column.
alt = make_array('ALT', '2024-05-08', '2024-08-13', True)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data['Sentiment'] = y_pred
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#secu

In [None]:
# Setup all variables needed for the graphs below: tagged Stock Twits data,
# tagged Discord data, prices, and the per-day sentiment shares of both sources.
tagged, tagged_discord, price, bull_prcts, bull_prcts_discord = setup_vars('ALT')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data['Sentiment'] = y_pred
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#secu

The cells below manually repeat what setup_vars does; they are not necessary unless you specifically need them.

In [None]:
# Get Discord message data: load the ticker-tagged CSV, drop the row with
# index label 0 and keep only the rows for ALT.
discord_data = pd.read_csv('discord_data_tickers.csv')
discord_data.drop(index=0, inplace=True)
d_data = separate_by_ticker(discord_data, 'ALT')
# Bare expression so the notebook displays the result.
d_data

Unnamed: 0,Message,Ticker,Username,Sentiment,Date
602,<@806186316743573524> \nALT\nspinning top cand...,ALT,RonIsWrong,,2024-07-12
610,I'm really pleased with this little ALT swing....,ALT,RonIsWrong,,2024-07-11
654,swung ALT from yesterday for a 6.5% gain. just...,ALT,RonIsWrong,,2024-07-11
715,watch ALT here!,ALT,RonIsWrong,,2024-07-10
718,weird volume on ALT here,ALT,RonIsWrong,,2024-07-10
...,...,...,...,...,...
20057,ALT up nicely\nI picked up a few more today at...,ALT,RonIsWrong,,2022-06-27
20078,ALT closed at 10.60 - I bought a small handful...,ALT,RonIsWrong,,2022-06-23
20089,ALT continuing today,ALT,RonIsWrong,,2022-06-23
20091,"This ALT chart is making me feel antsy, like i...",ALT,RonIsWrong,,2022-06-23


In [None]:
# Manually loading and cleaning data, then filling in missing sentiments with
# the saved SGD model -- not needed for generate_all
cleaned_alt = clean_text(load_data('ALT'))
tagged_alt = tag_tweets_sgd(cleaned_alt)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data['Sentiment'] = y_pred


In [None]:
# OPTIONAL: Can filter any dataframe by comments if wanted for graphing.
# comments=True keeps only the messages whose Num_Comments is not 0.
comments_alt = filter_df(tagged, comments=True)

In [None]:
# Setting up variables for manual graphing.
# Note that this overwrites the bull_prcts and price returned by setup_vars.

# Gets the bullish/bearish share per day (here only of messages with comments)
bull_prcts = get_bull_prcts(comments_alt)

# Get closing price per day
price = get_stock_prices('ALT')

### Graphs

In [None]:
# Display the saved ALT data as loaded from CSV.
load_data('ALT')

Unnamed: 0,Message,Sentiment,Num_Comments,Date,Username
0,Example Message,Bullish,0,2024-7-15,ExampleUsername
1,$ALT short-side circle jerk continues…I took m...,Bullish,1,2024-08-13,Str18BALTStr
2,"$ALT Keep it up shorts, this only makes the ex...",Bullish,0,2024-08-13,RoidRagingBull
3,$ALT hi friends. $5billon please 🙏 let’s go ga...,Bullish,0,2024-08-13,Tradelyfe
4,$ALT 3 Promising Biotech Stocks for the Long-T...,,0,2024-08-13,Article_AI
...,...,...,...,...,...
10994,$ALT basically trending all weekend!! LGF!!,Bullish,1,2024-06-24,Diamondhands2_10Billy
10995,$ALT Interactive Brokers. 1 minute ago.,,0,2024-06-24,starbreaker
10996,$ALT GARG said Pemvi available in sometime in ...,,1,2024-06-24,nycmax
10997,$ALT I believe the benefits of the molecule ov...,Bullish,0,2024-06-24,drnopain


In [None]:
# Getting correct dates for graph generation, only need to change last_sunday value. Can also change weeks value
last_sunday = '8/11/2024'

# The window ends on last_sunday and starts 8 weeks earlier.
now = pd.to_datetime(last_sunday)
past = now - timedelta(weeks=8)

# The graph functions expect the dates as 'YYYY-MM-DD' strings.
now = now.strftime('%Y-%m-%d')
past = past.strftime('%Y-%m-%d')

In [None]:
# Generate all graphs (use Sundays for date range: the weekly graph looks up
# both dates in the weekly price index)
generate_all('ALT', [past, now])

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data['Sentiment'] = y_pred
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#secu

In [None]:
# Show a specific word count over time

# Filters the cleaned ALT messages by the specified word/phrase (case-insensitive)
filt = filter_df(clean_text(load_data('ALT')), word='short')

# Graphs the daily mention counts together with the stock price
word_count_plot(filt, [past, now], price)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [None]:
# Gets df filtered on the date and/or word; here the raw (uncleaned) ALT
# messages that contain 'multinational'
#filter_df(load_data('ALT'), date='2024-06-24', word='shorts')
filter_df(load_data('ALT'), word='multinational')

Unnamed: 0,Message,Sentiment,Num_Comments,Date,Username
0,$ALT Once partnership with a multinational pha...,Bullish,0,2024-06-27,GreenEnergy2022
1,"$ALT Thats nothing new. Phs3 trial is big, eve...",Bullish,4,2024-06-27,GreenEnergy2022
2,$ALT Where does it says the conference is canc...,Bullish,1,2024-06-26,GreenEnergy2022
3,$ALT Participating in &quot;The Piper Sandler ...,Bullish,0,2024-06-25,Bull65
4,$ALT 👈🏼said they’re looking for a multinationa...,,2,2024-06-24,R428
5,$ALT \n\nhttps://finance.yahoo.com/video/altim...,,0,2024-06-24,CallThemOut
6,"$ALT With this kind of data, best in class, th...",Bullish,0,2024-06-24,GreenEnergy2022


In [None]:
# Check the news sentiment for ALT (date is YYYYMMDD)
# How to read a sentiment score x:
#   x <= -0.35           Bearish
#   -0.35 < x <= -0.15   Somewhat-Bearish
#   -0.15 < x < 0.15     Neutral
#   0.15 <= x < 0.35     Somewhat_Bullish
#   x >= 0.35            Bullish
get_news_sentiment('ALT', '20240508')

['0.186736', '0.010785', '0.270855', '0.248264', '0.021307', '0.146997', '0.121016', '0.020054', '0.04146', '0.108381', '0.181477', '0.046311', '0.190028', '0.089617', '0.012812', '0.020211', '0.00869', '0.033861', '0.128335', '0.373313', '0.344408', '-0.092329', '-0.012964', '-0.087696', '0.167729', '0.014116', '0.065763', '0.021405', '0.226599', '-0.007976', '0.127813', '-0.144708', '0.022447', '0.020211', '0.077358', '0.220919', '0.077944', '-0.103369', '0.493297', '0.012132', '-0.096653', '-0.087696', '-0.092329', '-0.012964', '0.023221', '0.423225', '0.224234', '0.0', '0.0', '0.1648']
average sentiment: 0.08498893999999999, max sentiment: 0.493297, min sentiment: -0.144708, number of articles: 50


In [None]:
# Manually creating sentiment by day and week graphs
# (uses bull_prcts, price, past and now, which are not defined in this cell)

# Daily
sentiment_time_graph(bull_prcts, price, [past, now], 'Bullish %')

# Convert the Date column to datetime so it can serve as the pd.Grouper key
# (this overwrites bull_prcts['Date'] in place)
bull_prcts['Date'] = pd.to_datetime(bull_prcts['Date'])

# Mean of 'Bullish %' within each weekly (freq='W') bin
bull_prcts_week = bull_prcts.groupby([pd.Grouper(key='Date', freq='W')])['Bullish %'].mean()

# Same datetime conversion for the price data (also in place)
price['Date'] = pd.to_datetime(price['Date'])

# Maximum 'Price' within each weekly (freq='W') bin
price_week = price.groupby([pd.Grouper(key='Date', freq='W')])['Price'].max()

# Graphs sentiment over time, aggregated by week
# (the sentiment is passed as a bare array of values, the price as a Series)
sentiment_time_graph_week(bull_prcts_week.values, price_week, [past, now], 'Bullish %')


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [None]:
# Manually creating top word/phrase bar chart from the 'Message' column of comments_alt
# NOTE(review): [1, 2, 3, 4] and 10 are presumably the phrase lengths and the number of
# top entries to show -- confirm against display_word_counts
display_word_counts([1, 2, 3, 4], comments_alt['Message'], 10, title='Word Counts')

### Model

In [None]:
# Load the model from your drive -- make sure you import the shared folder first
# NOTE(review): torch.load unpickles the file, which can run arbitrary code;
# only load model files that come from a trusted source

model_save_name = 'lstm_buyout_scaled.pt'
path = f"/content/gdrive/My Drive/Stock Twits Project/{model_save_name}"
model_scale = torch.load(path)

In [None]:
# Score the ALT sample with the loaded model and turn the score into a 0/1 prediction

# alt[None] adds a leading axis to alt before it is handed to predict
y_probs = model_scale.predict(alt[None])

# A higher threshold increases precision (sort of). 0.50=best recall, 0.55=best precision (?)
threshold = 0.55
y_pred = (y_probs >= threshold).astype(int)

probability, prediction = y_probs[0][0], y_pred[0][0]
print(f'probability: {probability}, prediction: {prediction}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387ms/step
probability: 0.3533514142036438, prediction: 0


# Discord Scraping

In [None]:
# Seed DataFrame for the Discord scrape: a single placeholder row that fixes the
# column order (Message, Date, Username) in which scraped rows are appended.
# The data is named seed_row rather than `dict` so the builtin dict() stays
# usable in every later cell.
seed_row = {
    'Message': ['Example Message'],
    'Date': ['2024-7-15'],
    'Username': ['ExampleUsername'],
}

df = pd.DataFrame(seed_row)
df

Unnamed: 0,Message,Date,Username
0,Example Message,2024-7-15,ExampleUsername


In [None]:
# Page backwards through a Discord channel's history, 100 messages per request,
# appending every message to df as [Message, Date, Username].
#
# Before running:
#   * CHANNEL_NUM must hold the id of the channel to scrape.
#   * mess_id is the id of the message to start paging back from.
#   * The Discord authorization value is read from the DISCORD_AUTH_TOKEN
#     environment variable, so no credential is stored in the notebook. If requests
#     are rejected, try a different account's token.
# url =   f'https://discord.com/api/v9/channels/CHANNEL_NUM/messages?before={mess_id}&limit=100'
import os

auth_token = os.environ.get('DISCORD_AUTH_TOKEN')
if not auth_token:
  raise RuntimeError('Set the DISCORD_AUTH_TOKEN environment variable before scraping Discord.')

# The headers never change between requests, so build them once
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
  'Authorization': auth_token,
}

mess_id = 1265759209321005247

for x in range(1000):
  url = f'https://discord.com/api/v9/channels/{CHANNEL_NUM}/messages?before={mess_id}&limit=100'

  r = requests.get(url, headers=headers)

  try:
    data = r.json()
  except ValueError:
    # Body was not JSON: stop instead of re-processing the previous page
    print(f'error: {r}')
    break

  # A successful page is a list of messages; anything else (e.g. an error object
  # for a bad token or a rate limit) cannot be paged through
  if not isinstance(data, list):
    print(f'error: {r}')
    break

  # An empty page means the start of the channel history was reached
  if not data:
    break

  for item in data:
    message = item['content']

    # Keep only the date part of the ISO timestamp
    date = item['timestamp'].split('T')[0]

    username = item['author']['global_name']

    df.loc[len(df.index)] = [message, date, username]

  # The last message of the page is the cursor for the next request
  mess_id = data[-1]['id']

IndexError: list index out of range

In [None]:
df

Unnamed: 0,Message,Date,Username
0,Example Message,2024-7-15,ExampleUsername
1,Didn’t realize TARS had 22% short interest,2024-07-24,WS_Axelrod
2,GTHX made quite the comeback. wow.,2024-07-24,RonIsWrong
3,,2024-07-24,RonIsWrong
4,ACAD volume spikes,2024-07-24,RonIsWrong
...,...,...,...
57221,$XBI $IBB - Biopharma Appears Generally Defens...,2022-04-26,HoRiZon
57222,,2022-04-26,HoRiZon
57223,🔥🔥🔥,2022-04-26,teXan
57224,Wow. Nice!,2022-04-26,RonIsWrong


In [None]:
# Persist every scraped Discord message; the DataFrame index is not written
df.to_csv(path_or_buf='discord_data_all.csv', index=False)

In [None]:
# Reload the Discord messages previously saved to discord_data_all.csv
df = pd.read_csv('discord_data_all.csv')

In [None]:
# Seed DataFrame for ticker-tagged Discord messages: a single placeholder row that
# fixes the column order (Message, Ticker, Username, Sentiment, Date).
# The data is named seed_row rather than `dict` so the builtin dict() stays
# usable in every later cell.
seed_row = {
    'Message': ['Example Message'],
    'Ticker': ['PRVB'],
    'Username': ['ExampleUsername'],
    'Sentiment': ['Bullish'],
    'Date': ['2024-7-24'],
}

ticker_df = pd.DataFrame(seed_row)
ticker_df

Unnamed: 0,Message,Ticker,Username,Sentiment,Date
0,Example Message,PRVB,ExampleUsername,Bullish,2024-7-24


In [None]:
# Copy every Discord message that mentions something shaped like a ticker
# (the first run of 3-4 capital letters) into ticker_df, leaving Sentiment empty.
ticker_pattern = re.compile(r'[A-Z]{3,4}')

# Walk the three columns in lockstep so each message keeps its own username and
# date. Looking them up again by message text is quadratic and gives repeated
# messages (e.g. "Wow. Nice!") the first match's username and date.
for mess, username, date in zip(df['Message'], df['Username'], df['Date']):
  # Empty message bodies come back from the CSV as NaN, not as strings
  if not isinstance(mess, str):
    continue

  ticks = ticker_pattern.search(mess)
  if ticks is not None:
    ticker_df.loc[len(ticker_df.index)] = [mess, ticks[0], username, None, date]
ticker_df

Unnamed: 0,Message,Ticker,Username,Sentiment,Date
0,Example Message,PRVB,ExampleUsername,Bullish,2024-7-24
1,Didn’t realize TARS had 22% short interest,TARS,WS_Axelrod,,2024-07-24
2,GTHX made quite the comeback. wow.,GTHX,RonIsWrong,,2024-07-24
3,ACAD volume spikes,ACAD,RonIsWrong,,2024-07-24
4,NVCR is holding up pretty well today with earn...,NVCR,DonCorleone77,,2024-07-24
...,...,...,...,...,...
20186,SURF on a move today...they present this weeke...,SURF,MkRizzle,,2022-05-02
20187,IDYA Orphan Drug Designation http://news...,IDYA,MkRizzle,,2022-05-02
20188,ATNF news of Friday http://newsfile.refiniti...,ATNF,MkRizzle,,2022-05-02
20189,I feel like I should say F* biotech about ever...,EAR,RonIsWrong,,2022-04-29


In [None]:
# Persist the ticker-tagged messages; the DataFrame index is not written
ticker_df.to_csv(path_or_buf='discord_data_tickers.csv', index=False)

In [None]:
# Spot check: pull out only the rows whose extracted ticker is PRVB
is_prvb = ticker_df['Ticker'] == 'PRVB'
test = ticker_df[is_prvb]
test

Unnamed: 0,Message,Ticker,Sentiment,Date
0,Example Message,PRVB,Bullish,2024-7-24
574,remember sanofi buying PRVB obviously,PRVB,,2024-07-12
2745,Only ones I can think of that did this are PRV...,PRVB,,2024-05-30
3788,sanofi deal is good - they take ex-US.\nI had ...,PRVB,,2024-05-14
6391,they bought PRVB,PRVB,,2024-04-10
6393,mgnx provided the product for PRVB that got bo...,PRVB,,2024-04-10
9634,This reminds me of PRVB right before it got BO,PRVB,,2024-02-29
11825,PRVB was sold - awesome buyout,PRVB,,2024-02-03
12902,I sold PRVB like the Friday before! BO on a Mo...,PRVB,,2024-01-09
16783,Also think it’s interesting Sanofi owns 7%…the...,PRVB,,2023-06-14
