In [18]:
import sqlite3

In [19]:
# Open the SQLite database file and create a cursor on that connection.
# NOTE(review): neither `connect` nor `cur` is referenced by the cells visible
# in this notebook — confirm they are still needed, and close the connection
# once it is no longer used.
connect = sqlite3.connect('database.sqlite')
cur = connect.cursor()

In [20]:
!pip install pandas



# Task 4.1

In [21]:
!pip install gensim



In [22]:
import sqlite3, pandas as pd, re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Task 4.1: fit a 10-topic LDA model on every post and comment and print the
# top 10 terms of each topic.

# Read both tables, then release the SQLite handle even if a query fails.
con = sqlite3.connect("database.sqlite")
try:
    posts = pd.read_sql_query("SELECT content FROM posts", con)
    comments = pd.read_sql_query("SELECT content FROM comments", con)
finally:
    con.close()

# One corpus of posts followed by comments; missing content becomes an empty
# string so the vectorizer only ever receives str values.
texts = pd.concat([posts["content"], comments["content"]], ignore_index=True).fillna("").astype(str)

# Bag-of-words counts with English stop words removed. Terms appearing in more
# than 40% of documents (max_df) or in fewer than 5 documents (min_df) are dropped.
vec = CountVectorizer(lowercase=True, stop_words='english', max_df=0.40, min_df=5)
X = vec.fit_transform(texts)

# random_state is fixed so repeated runs produce the same topics.
lda = LatentDirichletAllocation(n_components=10, learning_method='batch', random_state=42, max_iter=20)
lda.fit(X)

# For each topic, list the 10 vocabulary terms with the largest component weight.
vocab = vec.get_feature_names_out()
for k, comp in enumerate(lda.components_):
    top = comp.argsort()[::-1][:10]
    print(f"Topic {k}: " + ", ".join(vocab[i] for i in top))



Topic 0: like, just, feels, wow, feel, people, hard, seriously, really, trying
Topic 1: just, people, let, real, like, fashion, need, seriously, big, hype
Topic 2: love, amazing, remember, new, similar, curious, year, perspective, change, like
Topic 3: think, just, let, maybe, real, people, isn, need, bit, important
Topic 4: like, just, time, haha, new, perfect, sounds, good, day, best
Topic 5: post, really, sharing, thanks, amazing, reading, makes, hit, truly, little
Topic 6: like, remember, nature, just, tried, ended, time, sounds, hey, day
Topic 7: music, new, exploring, vegan, just, hidden, mind, time, food, like
Topic 8: great, community, project, diy, remember, garden, pushing, energy, ve, just
Topic 9: just, like, maybe, book, time, try, feeling, oh, sounds, seriously


# Task 4.2


In [23]:
import sqlite3
import re
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# --- setup ---
# Download the "vader_lexicon" NLTK resource; quiet=True suppresses the download log.
nltk.download("vader_lexicon", quiet=True)
# Path of the SQLite database file read further down in this cell.
dbPath = "database.sqlite"

def cleanText(t: str) -> str:
    """Normalise a piece of text for scoring.

    Steps, in order: lowercase the input; blank out URLs starting with
    ``http``, ``@``-mentions and ``#``-hashtags; blank out every character
    that is not a lowercase ASCII letter, whitespace or an apostrophe;
    collapse whitespace runs to single spaces and trim both ends.

    Args:
        t: Raw text.

    Returns:
        The cleaned text (possibly an empty string).
    """
    lowered = t.lower()
    # Links, mentions and hashtags are replaced by a space, not deleted,
    # so neighbouring words never get glued together.
    without_refs = re.sub(r"http\S+|@\S+|#[\w-]+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s']", " ", without_refs)
    collapsed = re.sub(r"\s+", " ", letters_only)
    return collapsed.strip()

def toBucket(c: float) -> str:
    """Map a compound sentiment score to a coarse label.

    Args:
        c: Compound score.

    Returns:
        ``"pos"`` when ``c >= 0.05``, ``"neg"`` when ``c <= -0.05``,
        otherwise ``"neu"``.
    """
    if c <= -0.05:
        return "neg"
    # Anything that reaches neither threshold falls through to neutral.
    return "pos" if c >= 0.05 else "neu"

# Task 4.2: score every post and comment with VADER and summarise the overall tone.

# Load both tables. A sqlite3 connection used as a context manager only commits
# or rolls back the transaction — it does not close the connection — so the
# handle is closed explicitly here, even if a query fails.
conn = sqlite3.connect(dbPath)
try:
    posts = pd.read_sql("SELECT id, content FROM posts", conn)
    comments = pd.read_sql("SELECT id, content FROM comments", conn)
finally:
    conn.close()

# Stack posts and comments into one frame; only the text column is kept and a
# `kind` label records which table each row came from.
df = pd.concat(
    [
        posts.rename(columns={"content": "text"})[["text"]].assign(kind="post"),
        comments.rename(columns={"content": "text"})[["text"]].assign(kind="comment"),
    ],
    ignore_index=True,
)

# Drop rows without text, then normalise the remaining text with cleanText.
df = df.dropna(subset=["text"]).copy()
df["text"] = df["text"].astype(str).apply(cleanText)

# NOTE(review): cleanText lowercases and strips punctuation before scoring;
# VADER presumably uses capitalisation and punctuation as intensity cues, so
# scoring the raw text may give different results — confirm this is intended.
sia = SentimentIntensityAnalyzer()
df["compound"] = df["text"].apply(lambda t: sia.polarity_scores(t)["compound"])
df["bucket"] = df["compound"].apply(toBucket)

#  tone: mean compound score and the percentage share of each bucket
overallMean = df["compound"].mean()
mix = df["bucket"].value_counts(normalize=True).mul(100)

print("\n>>>>>>>>>Overall Platform Tone<<<<<<<<<<<<< ")
print(f"Docs analyzed: {len(df)}")
print(f"Average sentiment (compound): {overallMean:.3f}")
# mix.get(..., 0) reports 0% for a bucket that never occurs.
print(
    f"Positive: {mix.get('pos', 0):.1f}%  "
    f"Neutral: {mix.get('neu', 0):.1f}%  "
    f"Negative: {mix.get('neg', 0):.1f}%"
)





>>>>>>>>>Overall Platform Tone<<<<<<<<<<<<< 
Docs analyzed: 7107
Average sentiment (compound): 0.407
Positive: 74.9%  Neutral: 7.7%  Negative: 17.3%


# Task 4.3

(Answer provided in the Word homework coursebook.)

# Task 4.4

(Answer provided in the Word homework coursebook.)