<a href="https://colab.research.google.com/github/Ctm31/Stock-Twits-Sentiment-Analysis/blob/main/stock_twits_sent_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepping Big Datatable From CSVs

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Fetch the NLTK stop-word corpus that the text-cleaning cells read via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Getting text from scraper CSV
# (the Sentiment column is empty for tweets that carry no label, as the output shows)
prvb_text = pd.read_csv('/content/stockTwits_text_PRVB.csv')
prvb_text

Unnamed: 0,Text,Sentiment
0,$PDSB @Jwa68 take a look here now!! Might be $...,
1,"$ARDX $PRVB Cass I love your DD confidence, p...",
2,"$ARDX Calling the smarter, more experienced lo...",
3,$IBRX this is playing like $PRVB to the “T” lo...,Bullish
4,$SGMT Thierry Chauche made $10M on $PRVB opti...,Bullish
...,...,...
2564,$PRVB keeping my expectations low but will be ...,
2565,$PRVB - Waiting for Sanofi to up its game. We...,Bullish
2566,$PRVB No justice - no peace! Lets protest in f...,
2567,$PRVB PRVB is a perfect example of how one ent...,


In [None]:
# Getting dates from scraper CSV
prvb_dates = pd.read_csv('/content/stockTwits_dates_PRVB.csv')

# Discard rows whose Date field is the literal 'D', then renumber the rest from 0
is_stray = prvb_dates['Date'] == 'D'
prvb_dates = prvb_dates[~is_stray].reset_index(drop=True)
prvb_dates

Unnamed: 0,Date,Time
0,2024-06-25,18:12:19Z
1,2024-05-23,13:39:45Z
2,2024-05-22,15:54:10Z
3,2024-05-20,14:20:18Z
4,2024-05-12,13:39:43Z
...,...,...
2586,2023-02-13,20:04:32Z
2587,2023-02-13,19:47:40Z
2588,2023-02-13,19:34:02Z
2589,2023-02-13,19:27:38Z


In [None]:
# Put the text/sentiment columns next to the date/time columns. With axis=1 the
# frames are joined on their row labels, so labels found in only one frame get
# NaN in the other frame's columns (the empty Text cells at the tail of the output).
# NOTE(review): assumes the i-th date row belongs to the i-th tweet — confirm against the scraper.
prvb_data = pd.concat([prvb_text, prvb_dates], axis=1)
prvb_data

Unnamed: 0,Text,Sentiment,Date,Time
0,$PDSB @Jwa68 take a look here now!! Might be $...,,2024-06-25,18:12:19Z
1,"$ARDX $PRVB Cass I love your DD confidence, p...",,2024-05-23,13:39:45Z
2,"$ARDX Calling the smarter, more experienced lo...",,2024-05-22,15:54:10Z
3,$IBRX this is playing like $PRVB to the “T” lo...,Bullish,2024-05-20,14:20:18Z
4,$SGMT Thierry Chauche made $10M on $PRVB opti...,Bullish,2024-05-12,13:39:43Z
...,...,...,...,...
2586,,,2023-02-13,20:04:32Z
2587,,,2023-02-13,19:47:40Z
2588,,,2023-02-13,19:34:02Z
2589,,,2023-02-13,19:27:38Z


In [None]:
# Scraper collects extra dates, shortening to match text length:
# rows with no Text are the surplus date-only rows, so they are dropped.
prvb_data = prvb_data.dropna(subset=['Text'])
prvb_data

Unnamed: 0,Text,Sentiment,Date,Time
0,$PDSB @Jwa68 take a look here now!! Might be $...,,2024-06-25,18:12:19Z
1,"$ARDX $PRVB Cass I love your DD confidence, p...",,2024-05-23,13:39:45Z
2,"$ARDX Calling the smarter, more experienced lo...",,2024-05-22,15:54:10Z
3,$IBRX this is playing like $PRVB to the “T” lo...,Bullish,2024-05-20,14:20:18Z
4,$SGMT Thierry Chauche made $10M on $PRVB opti...,Bullish,2024-05-12,13:39:43Z
...,...,...,...,...
2564,$PRVB keeping my expectations low but will be ...,,2023-02-14,15:08:23Z
2565,$PRVB - Waiting for Sanofi to up its game. We...,Bullish,2023-02-14,14:56:12Z
2566,$PRVB No justice - no peace! Lets protest in f...,,2023-02-14,14:39:40Z
2567,$PRVB PRVB is a perfect example of how one ent...,,2023-02-14,14:20:54Z


In [None]:
# Text from other ticker purely for training ML model
# (same Text / Sentiment layout as the PRVB text CSV)
train_text = pd.read_csv('/content/stockTwits_text.csv')
train_text

Unnamed: 0,Text,Sentiment
0,"Hello beautiful people, we’re doing an AMA! ...",
1,$NVDA $130 becoming a new support and consol...,Bullish
2,$NVDA $130 will become a new support.,Bullish
3,$NVDA buy,Bullish
4,"$MSFT Fresh lows, what a move down today - Mag...",
...,...,...
4390,$NVDA Bubble gonna burst or will it keep going...,
4391,$NVDA Very BULLISH,Bullish
4392,$NVDA NVIDIA CORP : Buy rating from UBSJuly 08...,Bullish
4393,$NVDA Lets F go and her an 10$ day Green Day T...,


# Cleaning + Prepping Text for ML

In [None]:
# Cleaning all training text: each tweet becomes a space-separated string of
# lowercase content words.

# Loop-invariant helpers, built once rather than once per tweet (or per word).
# A set gives O(1) stop-word lookups; the original re-read the NLTK list for
# every single word.
stop_words = set(stopwords.words('english'))
# "$" followed by word characters: cashtags such as $NVDA (also dollar amounts like $10M)
ticker_pattern = re.compile(r'[$]\w+')
# "@" followed by word characters: user mentions
mention_pattern = re.compile(r'[@]\w+')
# Maps every ASCII punctuation mark and digit to a space
strip_table = str.maketrans(dict.fromkeys(string.punctuation + string.digits, ' '))

cleaned_text_train = []

for tweet in train_text['Text']:
  # Remove whole ticker / mention tokens. Regex substitution cannot leave a
  # fragment behind the way sequential str.replace could when one ticker is a
  # prefix of another (e.g. "$NV" turning "$NVDA" into "DA").
  tweet = ticker_pattern.sub('', tweet)
  tweet = mention_pattern.sub('', tweet)

  # Lowercase, then blank out punctuation and numbers
  tweet = tweet.lower().translate(strip_table)

  # Split into words (this also absorbs any run of whitespace), then drop
  # stopwords and long words (links)
  words = re.findall(r'\w+', tweet)
  filtered_words = [word for word in words if word not in stop_words and len(word) < 15]
  cleaned_text_train.append(' '.join(filtered_words))

In [None]:
# Cleaning all PRVB text: each tweet becomes a space-separated string of
# lowercase content words (same steps as the training-text cleaning).

# Loop-invariant helpers, built once rather than once per tweet (or per word).
# A set gives O(1) stop-word lookups; the original re-read the NLTK list for
# every single word.
stop_words = set(stopwords.words('english'))
# "$" followed by word characters: cashtags such as $PRVB (also dollar amounts like $10M)
ticker_pattern = re.compile(r'[$]\w+')
# "@" followed by word characters: user mentions
mention_pattern = re.compile(r'[@]\w+')
# Maps every ASCII punctuation mark and digit to a space
strip_table = str.maketrans(dict.fromkeys(string.punctuation + string.digits, ' '))

cleaned_text_prvb = []

for tweet in prvb_data['Text']:
  # Remove whole ticker / mention tokens. Regex substitution cannot leave a
  # fragment behind the way sequential str.replace could when one ticker is a
  # prefix of another (e.g. "$PR" turning "$PRVB" into "VB").
  tweet = ticker_pattern.sub('', tweet)
  tweet = mention_pattern.sub('', tweet)

  # Lowercase, then blank out punctuation and numbers
  tweet = tweet.lower().translate(strip_table)

  # Split into words (this also absorbs any run of whitespace), then drop
  # stopwords and long words (links)
  words = re.findall(r'\w+', tweet)
  filtered_words = [word for word in words if word not in stop_words and len(word) < 15]
  cleaned_text_prvb.append(' '.join(filtered_words))

In [None]:
# Pair each ticker's cleaned text with its sentiment labels. Each frame is built
# from a two-element list, so it is "wide": one row of text and one row of
# labels, with tweets running along the columns. cleaned_df2 stays in this wide
# form until a later cell transposes it.
cleaned_df = pd.DataFrame([cleaned_text_train, train_text['Sentiment']])
cleaned_df2 = pd.DataFrame([cleaned_text_prvb, prvb_data['Sentiment']])

# Creating df of all text data from both tickers: join the wide frames
# column-wise, then transpose so that each tweet becomes one row.
# NOTE: row labels restart at 0 for the PRVB tweets (see the tail of the
# output), so labels are not unique in this frame.
df = pd.concat([cleaned_df, cleaned_df2], axis=1)
df = df.transpose()
df.columns = ['Text', 'Sentiment']
df

Unnamed: 0,Text,Sentiment
0,hello beautiful people ama started peloswing f...,
1,becoming new support consolidating well suppor...,Bullish
2,become new support,Bullish
3,buy,Bullish
4,fresh lows move today mag losing steam msft le...,
...,...,...
2564,keeping expectations low interesting see happe...,
2565,waiting sanofi game official discount,Bullish
2566,justice peace lets protest front sec hq,
2567,prvb perfect example one entity corner market ...,


In [None]:
# Tokenize all text: bag-of-words counts over one vocabulary shared by the
# training and PRVB tweets
countvectorizer = CountVectorizer()
corpus = df['Text'].values
X = countvectorizer.fit_transform(corpus)

# Densify the count matrix and store each tweet's vector as a plain Python list
tokens = X.toarray().tolist()
df['Text'] = tokens

In [None]:
# Inspect the frame now that Text holds per-tweet count vectors
df

Unnamed: 0,Text,Sentiment
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
...,...,...
2564,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
2565,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
2566,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
2567,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [None]:
# Give every row a unique 0..N-1 label; the previous (repeating) labels are
# kept in a new "index" column.
# NOTE: not idempotent — re-running this cell changes df again.
df = df.reset_index()
df

Unnamed: 0,index,Text,Sentiment
0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
2,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
...,...,...,...
6959,2564,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6960,2565,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6961,2566,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6962,2567,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [None]:
# Spot-check rows at positions 6950-6963 of the combined frame
df[6950:6964]

Unnamed: 0,index,Text,Sentiment
6950,2555,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6951,2556,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6952,2557,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6953,2558,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6954,2559,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6955,2560,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6956,2561,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6957,2562,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6958,2563,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6959,2564,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [None]:
# Getting only the data from the to-be-tagged, non-training stock.
# The PRVB tweets were appended after the training tweets, so they are the
# trailing len(cleaned_text_prvb) rows of df. Deriving that count replaces the
# hard-coded row total, which would silently go stale if the CSVs change.
n_prvb = len(cleaned_text_prvb)
new = df.iloc[len(df) - n_prvb:].drop(columns="index")
new

Unnamed: 0,Text,Sentiment
4395,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
4396,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
4397,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
4398,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
4399,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
...,...,...
6959,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6960,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
6961,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
6962,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [None]:
# Get null, to be tagged data: PRVB rows that have a missing value (the
# unlabelled tweets), renumbered from 0.
# reset_index() without inplace returns a fresh frame instead of mutating the
# filtered slice of `new`, so assigning a column to null_data later does not
# raise a SettingWithCopyWarning.
null_data = new[new.isnull().any(axis=1)].reset_index(drop=True)
null_data

Unnamed: 0,Text,Sentiment
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
...,...,...
1654,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1655,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1656,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1657,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [None]:
# Getting all training data: keep only tweets that carry a sentiment label,
# renumbered from 0
df = df.dropna(subset=["Sentiment"]).reset_index(drop=True)
df

Unnamed: 0,index,Text,Sentiment
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
2,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3,5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
4,6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bearish
...,...,...,...
3397,2557,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3398,2559,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3399,2561,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3400,2563,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish


In [None]:
# Stack the per-tweet count vectors into one 2-D feature array
x = df['Text']
X_data = np.asarray(list(x))

In [None]:
# Inspect the feature array (one row per labelled tweet)
X_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Target labels, taken from the same rows of df as X_data
Y_data = df['Sentiment']

# Training the Sentiment Model

In [None]:
# Check the data skew: number of labelled tweets in each sentiment class
label_counts = df['Sentiment'].value_counts()
bearcount = label_counts['Bearish']
bullcount = label_counts['Bullish']
print(bearcount, bullcount, sep='\n')

685
2717


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, recall_score


import warnings

In [None]:
# Setting up logistic regression with scaling and hyperparameter testing
def logi_reg_CV(X, Y, max_iter=100):
  """Grid-search logistic-regression hyperparameters on standardized features.

  Splits the data 80/20 with a fixed seed, standardizes it using statistics
  from the training split only, and runs a 10-fold cross-validated grid search
  (scored by accuracy) over unpenalized, L2 and L1 (liblinear) models.

  Args:
    X: 2-D feature matrix, one row per sample.
    Y: labels aligned with the rows of X.
    max_iter: solver iteration cap passed to LogisticRegression. The default
      equals scikit-learn's own; raise it if ConvergenceWarnings are emitted.

  Returns:
    The fitted GridSearchCV object. The best parameters, the best mean
    cross-validated accuracy and the held-out accuracy are also printed.
  """

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, train_size=0.8, random_state=31)

  scaler = preprocessing.StandardScaler()
  X_train = scaler.fit_transform(X_train)
  # Reuse the training-split statistics. Calling fit_transform here would scale
  # the held-out rows by their own mean/variance, i.e. on a different scale
  # from the one the model was trained on.
  X_test = scaler.transform(X_test)

  parameters = [
    {
      # None (not the string 'none') is how current scikit-learn spells
      # "no regularization"; the string form was deprecated and then removed.
      'penalty' : [None],
    },
    {
      'penalty' : ['l2'],
      'C'       : (0.001, 0.01, 1, 10, 100, 1000),
    },
    {
      # The default solver does not support L1, hence liblinear for this grid
      'penalty' : ['l1'],
      'C'       : (0.001, 0.01, 1, 10, 100, 1000),
      'solver'  : ['liblinear']
    }
  ]

  logreg = LogisticRegression(max_iter=max_iter)
  clf = GridSearchCV(logreg,
                   param_grid = parameters,
                   scoring='accuracy',
                   cv=10)

  clf.fit(X_train, Y_train)

  print("Tuned Hyperparameters :", clf.best_params_)
  print("Accuracy :",clf.best_score_)
  # The held-out split was previously created but never evaluated
  print("Held-out accuracy :", clf.score(X_test, Y_test))

  return clf

In [None]:
# Run the grid search on all labelled tweets; it prints the best parameters and their cross-validated accuracy
logi_reg_CV(X_data, Y_data)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Tuned Hyperparameters : {'C': 0.01, 'penalty': 'l2'}
Accuracy : 0.8607183257918554


In [None]:
# Input best parameters to run true logreg.
# The grid search chose C on standardized features, so the final model must be
# trained on standardized features as well; otherwise the regularization
# strength was tuned for a different feature scale. Bundling the scaler into a
# pipeline applies the same scaling at both fit and predict time.
from sklearn.pipeline import make_pipeline

sent_model = make_pipeline(
    preprocessing.StandardScaler(),
    # Higher iteration cap: the search emitted convergence warnings at the default
    LogisticRegression(C=0.01, penalty='l2', max_iter=1000),
)
sent_model.fit(X_data, Y_data)

In [None]:
# Mount Google Drive at /content/gdrive so the trained model can be stored there (Colab only)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import torch

In [None]:
# Save model in drive to prevent retraining if not needed
# (torch.save is used here purely as a serializer for the fitted estimator object)
model_save_name = 'st_sentiment.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
torch.save(sent_model, path)

# Use saved model for sentiment analysis and actual data work

In [None]:
# Load saved model
model_save_name = 'st_sentiment.pt'
path = f"/content/gdrive/My Drive/{model_save_name}"
# The file holds a pickled scikit-learn estimator rather than tensors, so the
# weights-only loader (the default in recent PyTorch releases) would refuse it.
# SECURITY: full unpickling can execute arbitrary code — only load files that
# this notebook wrote itself.
sent_model = torch.load(path, weights_only=False)

In [None]:
# Unlabelled PRVB tweets that the model will tag
null_data

Unnamed: 0,Text,Sentiment
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
...,...,...
1654,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1655,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1656,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1657,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [None]:
# Format to-be-tagged data: stack the count vectors into a 2-D array, then predict a label for each row
null_data_tokens = np.asarray(list(null_data['Text']))
y_pred = sent_model.predict(null_data_tokens)

In [None]:
# Number of predictions (one per row of null_data_tokens)
len(y_pred)

1659

In [None]:
# Attach the predicted labels. assign() builds a new frame rather than writing
# into one that was derived from a slice, so pandas raises no
# SettingWithCopyWarning.
null_data = null_data.assign(Sentiment=y_pred)
null_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data['Sentiment'] = y_pred


Unnamed: 0,Text,Sentiment
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
...,...,...
1654,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
1655,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
1656,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish
1657,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Bullish


In [None]:
# Check number of bearish tags to make sure skew isn't too strong.
# Report the count directly instead of printing one "True" line per bearish row.
ex = null_data['Sentiment'].values == 'Bearish'
print("Bearish predictions:", int(ex.sum()), "of", len(ex))

True


In [None]:
# cleaned_df2 was built wide (one row of text, one row of labels); flip it so
# that each PRVB tweet is a row. The guard makes the cell safe to re-run:
# without it a second run would transpose the frame back and then fail when
# renaming the columns.
if list(cleaned_df2.columns) != ['Text', 'Sentiment']:
  cleaned_df2 = cleaned_df2.transpose()
  cleaned_df2.columns = ['Text', 'Sentiment']
cleaned_df2

Unnamed: 0,Text,Sentiment
0,take look might type situation,
1,cass love dd confidence particularly capital l...,
2,calling smarter experienced longs argument mad...,
3,playing like love hodl brothers,Bullish
4,thierry chauche made options landing deal year...,Bullish
...,...,...
2564,keeping expectations low interesting see happe...,
2565,waiting sanofi game official discount,Bullish
2566,justice peace lets protest front sec hq,
2567,prvb perfect example one entity corner market ...,


In [None]:
# Unlabelled PRVB tweets (readable text form) with the model's predictions.
# y_pred is in the same row order because it was computed from the same
# unlabelled rows. assign() returns a new frame, so nothing is written into a
# slice of cleaned_df2 and no SettingWithCopyWarning is raised.
nulls = cleaned_df2[cleaned_df2.isnull().any(axis=1)].assign(Sentiment=y_pred)
nulls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nulls['Sentiment'] = y_pred


Unnamed: 0,Text,Sentiment
0,take look might type situation,Bullish
1,cass love dd confidence particularly capital l...,Bullish
2,calling smarter experienced longs argument mad...,Bullish
7,trying understand dennis hom left last july ri...,Bullish
9,enterprise value ev total year analyst consens...,Bullish
...,...,...
2562,good question paying trading room free trading...,Bullish
2564,keeping expectations low interesting see happe...,Bullish
2566,justice peace lets protest front sec hq,Bullish
2567,prvb perfect example one entity corner market ...,Bullish


In [None]:
# PRVB tweets that already carried a sentiment label
sents = cleaned_df2.dropna(subset=["Sentiment"])
sents

Unnamed: 0,Text,Sentiment
3,playing like love hodl brothers,Bullish
4,thierry chauche made options landing deal year...,Bullish
5,doesnt mess around look shocked sanofi wiped n...,Bullish
6,finally permanent cfo proper track record conn...,Bullish
8,doubt price actions sucks reminds holding six ...,Bullish
...,...,...
2557,good see least find sellers fake prices much l...,Bullish
2559,get big breakout volume,Bullish
2561,com episode,Bullish
2563,let go get want big tremendous gains little pe...,Bullish


In [None]:
# Re-insert the newly tagged tweets into the df of originally tagged tweets;
# sorting by row label restores the original tweet order.
result = pd.concat([sents, nulls]).sort_index()
result

Unnamed: 0,Text,Sentiment
0,take look might type situation,Bullish
1,cass love dd confidence particularly capital l...,Bullish
2,calling smarter experienced longs argument mad...,Bullish
3,playing like love hodl brothers,Bullish
4,thierry chauche made options landing deal year...,Bullish
...,...,...
2564,keeping expectations low interesting see happe...,Bullish
2565,waiting sanofi game official discount,Bullish
2566,justice peace lets protest front sec hq,Bullish
2567,prvb perfect example one entity corner market ...,Bullish


# Saving final df

In [None]:
# Fully labelled PRVB tweets
result

Unnamed: 0,Text,Sentiment
0,take look might type situation,Bullish
1,cass love dd confidence particularly capital l...,Bullish
2,calling smarter experienced longs argument mad...,Bullish
3,playing like love hodl brothers,Bullish
4,thierry chauche made options landing deal year...,Bullish
...,...,...
2564,keeping expectations low interesting see happe...,Bullish
2565,waiting sanofi game official discount,Bullish
2566,justice peace lets protest front sec hq,Bullish
2567,prvb perfect example one entity corner market ...,Bullish


In [None]:
# Date/time rows that will be joined onto the labelled tweets
prvb_dates

Unnamed: 0,Date,Time
0,2024-06-25,18:12:19Z
1,2024-05-23,13:39:45Z
2,2024-05-22,15:54:10Z
3,2024-05-20,14:20:18Z
4,2024-05-12,13:39:43Z
...,...,...
2586,2023-02-13,20:04:32Z
2587,2023-02-13,19:47:40Z
2588,2023-02-13,19:34:02Z
2589,2023-02-13,19:27:38Z


In [None]:
# Join the labelled tweets with their date/time columns by row label, then
# discard the surplus date-only rows (those left without a Sentiment)
new = (
    pd.concat([result, prvb_dates], axis=1)
    .dropna(subset=['Sentiment'])
)
new

Unnamed: 0,Text,Sentiment,Date,Time
0,take look might type situation,Bullish,2024-06-25,18:12:19Z
1,cass love dd confidence particularly capital l...,Bullish,2024-05-23,13:39:45Z
2,calling smarter experienced longs argument mad...,Bullish,2024-05-22,15:54:10Z
3,playing like love hodl brothers,Bullish,2024-05-20,14:20:18Z
4,thierry chauche made options landing deal year...,Bullish,2024-05-12,13:39:43Z
...,...,...,...,...
2564,keeping expectations low interesting see happe...,Bullish,2023-02-14,15:08:23Z
2565,waiting sanofi game official discount,Bullish,2023-02-14,14:56:12Z
2566,justice peace lets protest front sec hq,Bullish,2023-02-14,14:39:40Z
2567,prvb perfect example one entity corner market ...,Bullish,2023-02-14,14:20:54Z


In [None]:
# Save overall df with tagged tweets, date, and time
from google.colab import files

# Write the CSV to the runtime's working directory, then hand it to the browser as a download
new.to_csv('sent_data_PRVB.csv')
files.download('sent_data_PRVB.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>