In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _, files in os.walk('/kaggle/input'):
    # Build each file's full path under the directory being visited and print it.
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stock-tweets-for-sentiment-analysis-and-prediction/stock_yfinance_data.csv
/kaggle/input/stock-tweets-for-sentiment-analysis-and-prediction/stock_tweets.csv


In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the stock tweets dataset from the Kaggle input directory into `df`
df = pd.read_csv('/kaggle/input/stock-tweets-for-sentiment-analysis-and-prediction/stock_tweets.csv')
# Preview the first rows to see which columns were read
df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [4]:
# Text preprocessing
# English stop words from NLTK, stored as a set; preprocess_text tests each token against it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance, reused by preprocess_text for every token.
stemmer = PorterStemmer()

In [5]:
def preprocess_text(text):
    """Normalize a piece of text for TF-IDF indexing.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    found in the module-level ``stop_words`` set are dropped and the remaining
    ones are reduced to their stems with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to normalize (``.lower()`` is called on it).

    Returns
    -------
    str
        The stemmed tokens, in their original order, joined by single spaces.
    """
    stemmed_words = []
    for word in nltk.word_tokenize(text.lower()):
        # Stop words are skipped before stemming, never after.
        if word in stop_words:
            continue
        stemmed_words.append(stemmer.stem(word))
    return ' '.join(stemmed_words)

In [6]:
# Run every tweet through preprocess_text and store the result in a new
# 'preprocessed_text' column; the original 'Tweet' column is left as it is.
df['preprocessed_text'] = df['Tweet'].apply(preprocess_text)

In [7]:
# Create TF-IDF vectorizer (default settings)
vectorizer = TfidfVectorizer()
# Learn the vocabulary from the preprocessed tweets and vectorize them in one step.
# search_tweets reuses both objects: `vectorizer` to encode the query and
# `tfidf_matrix` as the collection the query is compared against.
tfidf_matrix = vectorizer.fit_transform(df['preprocessed_text'])

In [8]:
# Search function
def search_tweets(query, top_n=None):
    """Rank the tweets in ``df`` by TF-IDF cosine similarity to ``query``.

    The query is normalized with ``preprocess_text`` (the same pipeline applied
    to the tweets) and encoded with the already-fitted ``vectorizer``, then
    compared against every row of ``tfidf_matrix``.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_n : int or None, optional
        Maximum number of rows to return. The default, ``None``, returns every
        row of ``df`` (the original behaviour), including rows whose similarity
        to the query is zero.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from the highest to the lowest similarity score.
        The relative order of rows with equal scores is whatever the reversed
        ``argsort`` produces.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    # A negative slice bound would silently drop rows from the end instead of
    # limiting the result, so reject it explicitly.
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be a non-negative integer or None")

    preprocessed_query = preprocess_text(query)
    query_vector = vectorizer.transform([preprocessed_query])

    # Calculate cosine similarity between the query and every tweet;
    # flatten() turns the (1, n_tweets) result into a 1-D score array.
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it to put the best matches first.
    sorted_indices = similarities.argsort()[::-1]
    if top_n is not None:
        sorted_indices = sorted_indices[:top_n]

    # Positional lookup: the indices refer to row positions in tfidf_matrix / df.
    return df.iloc[sorted_indices]

In [9]:
# Example usage
# search_tweets returns every row of df ordered by similarity to the query, so
# the printed frame below is truncated by pandas to its first and last rows.
search_query = "Tesla company "
relevant_tweets = search_tweets(search_query)
# Show only the tweet text of the ranked result.
print(relevant_tweets[['Tweet']])

                                                   Tweet
9383              There is only one Tesla company\n$TSLA
26104    $TSLA isn’t a car company. It’s a tech company.
1266   $TSLA is the only growth company and the only ...
20941  Not a lot of public company CEOs own 21% of th...
12247  "@tesla is as much a software company as it is...
...                                                  ...
53476  Me go on rust try make goog video off stream w...
53477                                  Buying more $GOOG
53478  $GOOG is trading at &lt;22x LTM, likely &lt;18...
53479  I cut $GOOG at 2920 after the maps were all ov...
40396  AMZN Last 1 min! (1) it was a beauty to observ...

[80793 rows x 1 columns]
