In [3]:
!pip install praw
import praw
import pandas as pd

# 🔑 Your credentials
reddit = praw.Reddit(
    client_id="wzx3QK7ygFrKPSQeXhzeWQ",
    client_secret="_aZlEAeqVWeVpZTW7Rjr40YCXjIAhA",
    user_agent ="my user agent"
)

# Search EV-related posts (example: r/electricvehicles, r/cars, r/technology)
subreddits = ["electricvehicles", "cars", "technology"]
posts = []

for sub in subreddits:
    for submission in reddit.subreddit(sub).search("EV OR Electric Vehicle OR Tesla", limit=500):
        posts.append({
            "id": submission.id,
            "title": submission.title,
            "selftext": submission.selftext,
            "score": submission.score,
            "created_utc": submission.created_utc,
            "subreddit": submission.subreddit.display_name,
            "url": submission.url
        })

df = pd.DataFrame(posts)
df.to_csv("ev_reddit.csv", index=False)
print("Saved ev_reddit.csv with", len(df), "posts")




It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Saved ev_reddit.csv with 686 posts


In [10]:
%%writefile app_ev_reddit.py
# -*- coding: utf-8 -*-
"""
Reddit EV Text Analysis — Research Upgrade
- Live Reddit scraping (PRAW)
- Sentiment: VADER or Transformer (RoBERTa)
- Topics: LDA or BERTopic
- Wordcloud, time-trends, downloads
"""

import os, numpy as np, pandas as pd, streamlit as st, matplotlib.pyplot as plt
import praw
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# --- Optional, modern NLP ---
_HAVE_TRANSFORMERS = False
_HAVE_BERTOPIC = False
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
    _HAVE_TRANSFORMERS = True
except Exception:
    _HAVE_TRANSFORMERS = False

try:
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    _HAVE_BERTOPIC = True
except Exception:
    _HAVE_BERTOPIC = False

# Ensure VADER resource
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

st.set_page_config(page_title="Reddit EV Text Analysis (NLP+)", layout="wide")
st.title("🔋 Reddit Text Analysis: Electric Vehicles (EVs) — Research Upgrade")

# =========================
# Sidebar: API + Query
# =========================
st.sidebar.header("⚙️ Reddit API")
with st.sidebar.expander("Credentials", expanded=True):
    client_id     = st.text_input("Client ID")
    client_secret = st.text_input("Client Secret", type="password")
    user_agent    = st.text_input("User Agent", value="ev_analysis_app")

st.sidebar.header("🔎 Query")
subreddit_name = st.sidebar.text_input("Subreddit", value="electricvehicles")
search_query   = st.sidebar.text_input("Search Query", value="EV OR Tesla OR Electric Vehicle")
limit          = st.sidebar.slider("Number of posts", 50, 1000, 300, 50)
text_source    = st.sidebar.selectbox("Text to analyze", ["title", "selftext", "title + selftext"])

st.sidebar.header("🧠 Models")
sent_choice = st.sidebar.selectbox(
    "Sentiment model",
    ["Transformer (RoBERTa)", "VADER"],
    index=0 if _HAVE_TRANSFORMERS else 1,
    help="Falls back to VADER if Transformers not installed"
)
topic_choice = st.sidebar.selectbox(
    "Topic model",
    ["BERTopic", "LDA"],
    index=0 if _HAVE_BERTOPIC else 1,
    help="Falls back to LDA if BERTopic not installed"
)

fetch_btn = st.sidebar.button("🔎 Fetch Reddit Data")

# =========================
# Helpers
# =========================
@st.cache_data(show_spinner=False)
def fetch_reddit(client_id, client_secret, user_agent, subreddit, query, limit):
    reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
    sub = reddit.subreddit(subreddit)
    rows = []
    for s in sub.search(query, limit=limit):
        rows.append({
            "id": s.id,
            "subreddit": getattr(s.subreddit, "display_name", None),
            "title": s.title,
            "selftext": s.selftext,
            "score": int(getattr(s, "score", 0) or 0),
            "num_comments": int(getattr(s, "num_comments", 0) or 0),
            "created_utc": float(getattr(s, "created_utc", np.nan)),
            "permalink": f"https://www.reddit.com{s.permalink}" if getattr(s, "permalink", None) else None,
            "url": getattr(s, "url", None),
        })
    df = pd.DataFrame(rows)
    if "created_utc" in df.columns:
        df["created_dt"] = pd.to_datetime(df["created_utc"], unit="s", utc=True).dt.tz_convert("UTC")
        df["month"] = df["created_dt"].dt.to_period("M").dt.to_timestamp()
    return df

def compose_text(df: pd.DataFrame, source: str) -> pd.Series:
    if source == "title":
        return df["title"].fillna("")
    if source == "selftext":
        return df["selftext"].fillna("")
    return (df["title"].fillna("") + " " + df["selftext"].fillna("")).str.strip()

def clean_text(s: pd.Series) -> pd.Series:
    return (s.astype(str)
            .str.replace(r"http\S+|www\.\S+", " ", regex=True)
            .str.replace(r"&amp;|&lt;|&gt;", " ", regex=True)
            .str.replace(r"[^\w\s\-']", " ", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip())

# ---------- Sentiment ----------
@st.cache_resource(show_spinner=False)
def load_transformer_sentiment():
    # Lightweight, well-known model for general sentiment
    model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    clf = pipeline("sentiment-analysis", model=model_id, tokenizer=model_id, truncation=True)
    return clf

def sentiment_vader(texts: list[str]) -> tuple[np.ndarray, list[str]]:
    sia = SentimentIntensityAnalyzer()
    scores = np.array([sia.polarity_scores(t)["compound"] for t in texts], dtype=float)
    labels = np.where(scores > 0.05, "positive", np.where(scores < -0.05, "negative", "neutral")).tolist()
    return scores, labels

def sentiment_transformer(texts: list[str]) -> tuple[np.ndarray, list[str]]:
    clf = load_transformer_sentiment()
    # batched inference to be safe
    preds, scores = [], []
    batch = 32
    for i in range(0, len(texts), batch):
        out = clf(texts[i:i+batch])
        for r in out:
            lab = r["label"].lower()
            # Map to [-1,1] like compound (rough heuristic)
            if "pos" in lab:
                preds.append("positive"); scores.append(0.7 if "neutral" not in lab else 0.0)
            elif "neu" in lab:
                preds.append("neutral"); scores.append(0.0)
            else:
                preds.append("negative"); scores.append(-0.7)
    return np.array(scores, dtype=float), preds

# ---------- Topics ----------
def topics_lda(texts: list[str], n_topics=6, max_features=4000):
    vec = CountVectorizer(stop_words="english", max_features=max_features)
    X = vec.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, learning_method="batch")
    lda.fit(X)
    vocab = vec.get_feature_names_out()
    top_terms = []
    for comp in lda.components_:
        idx = comp.argsort()[-12:]
        top_terms.append(", ".join(vocab[idx]))
    return top_terms

def topics_bertopic(texts: list[str], nr_topics=None):
    # sentence-transformers is auto-detected by BERTopic
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    topic_model = BERTopic(embedding_model=embedder, nr_topics=nr_topics, verbose=False)
    topics, _ = topic_model.fit_transform(texts)
    info = topic_model.get_topic_info()
    # get top terms for first N non-outlier topics
    terms = []
    for topic_id in info[info.Topic >= 0].Topic.head(10):
        words = [w for w, _ in topic_model.get_topic(topic_id)]
        terms.append(", ".join(words[:12]))
    return terms

# =========================
# Fetch
# =========================
df = None
if fetch_btn:
    if not (client_id and client_secret and user_agent):
        st.error("⚠️ Enter Reddit API credentials first.")
    else:
        with st.spinner("Fetching posts from Reddit..."):
            df = fetch_reddit(client_id, client_secret, user_agent, subreddit_name, search_query, limit)
        if df.empty:
            st.warning("No posts returned. Try a different query/subreddit.")
        else:
            st.success(f"✅ Retrieved {len(df)} posts from r/{subreddit_name}")
            st.dataframe(df.head(12), use_container_width=True)

# =========================
# Analysis
# =========================
if df is not None and not df.empty:
    # Prepare text
    df["text"] = clean_text(compose_text(df, text_source))
    df = df[df["text"].str.len() > 0].copy()

    # ----- Sentiment -----
    st.markdown("## 📊 Sentiment Analysis")
    use_transformer = (sent_choice.startswith("Transformer") and _HAVE_TRANSFORMERS)
    if sent_choice.startswith("Transformer") and not _HAVE_TRANSFORMERS:
        st.info("Transformers not installed — falling back to VADER.")
    with st.spinner("Scoring sentiment..."):
        if use_transformer:
            scores, labels = sentiment_transformer(df["text"].tolist())
        else:
            scores, labels = sentiment_vader(df["text"].tolist())
    df["sentiment"] = scores
    df["sent_label"] = labels

    c1, c2 = st.columns([1,2])
    with c1:
        st.write("**Class balance**")
        st.bar_chart(df["sent_label"].value_counts().reindex(["negative","neutral","positive"]).fillna(0))
    with c2:
        st.write("**Score distribution**")
        fig, ax = plt.subplots()
        ax.hist(df["sentiment"].values, bins=30)
        ax.set_xlabel("sentiment score"); ax.set_ylabel("count")
        st.pyplot(fig)

    if df["month"].notna().any():
        st.write("**Average sentiment per month**")
        monthly = df.dropna(subset=["month"]).groupby("month")["sentiment"].mean()
        st.line_chart(monthly)

    st.write("**Examples**")
    for _, r in df.sample(min(3, len(df)), random_state=42).iterrows():
        st.markdown(f"- **{r['sent_label'].capitalize()}** — {r['title'] or '[no title]'}")
        if r.get("permalink"):
            st.caption(r["permalink"])

    # ----- Wordcloud -----
    st.markdown("## ☁️ Word Cloud")
    text_blob = " ".join(df["text"].tolist())[:3_000_000]
    wc = WordCloud(width=1100, height=450, background_color="white").generate(text_blob)
    fig, ax = plt.subplots(figsize=(11,4))
    ax.imshow(wc, interpolation="bilinear"); ax.axis("off")
    st.pyplot(fig)

    # ----- Topics -----
    st.markdown("## 🧩 Topic Modeling")
    if topic_choice == "BERTopic" and not _HAVE_BERTOPIC:
        st.info("BERTopic not installed — falling back to LDA.")

    if topic_choice == "BERTopic" and _HAVE_BERTOPIC:
        nr_topics = st.slider("Number of topics (approx, BERTopic may merge)", 4, 20, 10, 1)
        with st.spinner("Fitting BERTopic (embeddings + clustering)..."):
            try:
                top_terms = topics_bertopic(df["text"].tolist(), nr_topics)
                st.write("**Top topics (first 10):**")
                for i, terms in enumerate(top_terms, 1):
                    st.markdown(f"- **Topic {i}:** {terms}")
            except Exception as e:
                st.warning(f"BERTopic failed: {e}. Falling back to LDA.")
                topic_choice = "LDA"  # force fallback
    if topic_choice == "LDA" or not _HAVE_BERTOPIC:
        n_topics     = st.slider("Number of topics (LDA)", 3, 12, 6, 1)
        max_features = st.slider("Max features (BoW)", 1000, 10000, 4000, 500)
        with st.spinner("Fitting LDA..."):
            try:
                top_terms = topics_lda(df["text"].tolist(), n_topics=n_topics, max_features=max_features)
                st.write("**Top terms per topic**")
                for i, terms in enumerate(top_terms, start=1):
                    st.markdown(f"- **Topic {i}:** {terms}")
            except Exception as e:
                st.error(f"LDA failed: {e}")

    # ----- Downloads -----
    st.markdown("## ⬇️ Save Results")
    enriched = df[[
        "id","subreddit","title","selftext","score","num_comments","created_utc","created_dt",
        "permalink","url","text","sentiment","sent_label"
    ]].copy()
    st.download_button("Download enriched CSV", enriched.to_csv(index=False), file_name="ev_reddit_enriched.csv")
    st.download_button("Download enriched JSON", enriched.to_json(orient="records"), file_name="ev_reddit_enriched.json")

else:
    st.info("Enter credentials, set your query, then click **Fetch Reddit Data** to begin.")


Overwriting app_ev_reddit.py


In [11]:
!pip -q install streamlit praw wordcloud scikit-learn nltk matplotlib pyngrok \
  transformers torch bertopic umap-learn hdbscan sentence-transformers

!pkill -f streamlit || true
!streamlit run app_ev_reddit.py --server.port 8501 &>/content/logs.txt &

from pyngrok import ngrok
ngrok.set_auth_token("2tZ6mqHFZ9n2B4HsTOzAPVA3Jnw_6qB1RFncPLxV8kcYUxNcJ")
print("🌍 Public URL:", ngrok.connect(8501, "http").public_url)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h^C
🌍 Public URL: https://64f8e6eabe3b.ngrok-free.app
