In [60]:
import pandas as pd
import re

In [78]:
# Read the tweets that an earlier step saved to CSV, then preview the first rows.
# NOTE(review): the file is decoded as ISO-8859-1. If the CSV was actually written
# as UTF-8, non-ASCII characters (e.g. emoji) will come out garbled -- confirm the
# encoding the export step used.
df = pd.read_csv(
    "ai_tweets.csv",
    encoding="ISO-8859-1",
)
df.head()

Unnamed: 0,Tweet,created_at
0,@cryptogems555 Invest in a currency that is de...,3/14/2025
1,While having all the hope around chatGPT lets ...,3/14/2025
2,@IGDEFI1 Join the movement towards a more sust...,3/14/2025
3,"There's a snowstorm out so naturally, I decide...",3/14/2025
4,@unclebobcrypto @Stablzone Invest in a currenc...,3/14/2025


In [80]:
# Row-level cleanup that runs before the regex-based text cleaning:
#   1. discard rows whose created_at cell literally contains "created_at"
#      (stray header rows inside the data),
#   2. parse created_at into datetimes, turning unparseable values into NaT,
#   3. keep only rows whose timestamp parsed successfully,
#   4. remove rows that are exact duplicates of an earlier row,
#   5. remove rows still missing a tweet or a timestamp.
df = (
    df.loc[lambda frame: ~frame['created_at'].astype(str).str.contains("created_at", na=False)]
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at'], errors='coerce'))
    .loc[lambda frame: frame['created_at'].notna()]
    .drop_duplicates()
    .dropna(subset=['Tweet', 'created_at'])
)

# Patterns are compiled once, at definition time, because the cleaner is applied
# to every row of the DataFrame.
_URL_RE = re.compile(r"http\S+|\bwww\.\S+", re.IGNORECASE)  # http(s) links in any case, plus bare www. links
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
_NON_WORD_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a raw tweet into lowercase text without URLs, mentions, hashtags or punctuation.

    Steps, in order: remove URLs, remove @mentions, remove #hashtags (the tag
    word is removed together with the '#'), delete every character that is
    neither a word character nor whitespace, collapse whitespace runs to a
    single space, then trim and lowercase.

    Parameters
    ----------
    text : object
        The raw tweet. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input or when nothing
        is left after cleaning.
    """
    # Missing values would otherwise be stringified to the words "None"/"nan".
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    # URLs go first so their punctuation-stripped remains never reach the output.
    text = _URL_RE.sub("", text)
    text = _MENTION_RE.sub("", text)
    text = _HASHTAG_RE.sub("", text)
    # Punctuation is deleted, not replaced by a space: "there's" -> "theres".
    text = _NON_WORD_RE.sub("", text)
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip().lower()

# Clean every tweet, then keep only the first row for each distinct cleaned text.
# Tweets that consisted solely of URLs, mentions, hashtags or punctuation clean
# down to an empty string; those rows carry no text and are dropped as well.
cleaned_tweets = df['Tweet'].apply(clean_text)
keep_row = (~cleaned_tweets.duplicated()) & cleaned_tweets.ne("")

# Replace the raw text with the cleaned text (column position is preserved) and
# build a new frame instead of mutating the existing one in place.
df = df.assign(Tweet=cleaned_tweets).loc[keep_row]

df.head()

Unnamed: 0,Tweet,created_at
0,invest in a currency that is designed to offer...,2025-03-14
1,while having all the hope around chatgpt lets ...,2025-03-14
2,join the movement towards a more sustainable a...,2025-03-14
3,theres a snowstorm out so naturally i decided ...,2025-03-14
4,invest in a currency that offers a more secure...,2025-03-14


In [82]:
# Dimensions of df as (number of rows, number of columns).
df.shape

(123746, 2)

In [84]:
# Per-column count of missing values.
na_counts = df.isnull().sum()
print(na_counts)


Tweet         0
created_at    0
dtype: int64


In [86]:
# Number of rows that exactly repeat an earlier row (every column equal).
duplicate_counts = df.duplicated().sum()
duplicate_counts

0

In [98]:
# Store into MongoDB
from pymongo import MongoClient

# Connect to the local MongoDB server and select the database and collection.
client = MongoClient('mongodb://localhost:27017/')
db = client['ai_tweets_db']
collection = db['tweets']

# Convert the DataFrame into a list of dicts (one per row) for insertion.
data_dict = df.to_dict("records")

# Collect the tweet texts already stored so that re-running this cell does not
# insert them again. Only the Tweet field is requested (_id is excluded from the
# projection), and documents without a Tweet field are skipped instead of
# raising KeyError.
existing_tweets = {
    doc['Tweet']
    for doc in collection.find({}, {'Tweet': 1, '_id': 0})
    if 'Tweet' in doc
}

# Keep only the rows whose text is not in the collection yet.
new_tweets = [doc for doc in data_dict if doc['Tweet'] not in existing_tweets]

# insert_many is only called with a non-empty list.
if new_tweets:
    collection.insert_many(new_tweets)
    print(f"{len(new_tweets)} new tweets inserted into MongoDB")
else:
    print("No new tweets to insert")

No new tweets to insert


In [69]:
# Count the documents in the MongoDB collection; the empty filter {} matches
# every document.
count = collection.count_documents({})
print(f"num. of docs.: {count}")

num. of docs.: 123746


In [96]:
# Print up to five documents from the collection as a quick look at what is stored.
preview_cursor = collection.find().limit(5)
for doc in preview_cursor:
    print(doc)

{'_id': ObjectId('680cfd33938747c9e17a5a88'), 'Tweet': 'invest in a currency that is designed to offer greater flexibility and customization for personal financial strategies and goals with this coin', 'created_at': datetime.datetime(2025, 3, 14, 0, 0), 'vader_score': 0.5994, 'sentiment': 'Positive'}
{'_id': ObjectId('680cfd33938747c9e17a5a89'), 'Tweet': 'while having all the hope around chatgpt lets not forget these tools in other domains too', 'created_at': datetime.datetime(2025, 3, 14, 0, 0), 'vader_score': 0.5523, 'sentiment': 'Positive'}
{'_id': ObjectId('680cfd33938747c9e17a5a8a'), 'Tweet': 'join the movement towards a more sustainable and responsible financial system that prioritizes longterm growth and development 0x73706a7d4c34b3c70a1cd35030b847a0e11403e0', 'created_at': datetime.datetime(2025, 3, 14, 0, 0), 'vader_score': 0.7479, 'sentiment': 'Positive'}
{'_id': ObjectId('680cfd33938747c9e17a5a8b'), 'Tweet': 'theres a snowstorm out so naturally i decided to ask about ð', 'cr