# **Computational Analysis of Academic Use of ChatGPT**
This notebook explores patterns in Reddit discussions related to academic use of ChatGPT using computational text analysis methods.

The analysis focuses on:
- Keyword-based data collection
- Text preprocessing
- Topic modelling using BERTopic

The goal is to examine recurring themes and structural patterns in discussions, rather than to make normative or causal claims.

# **Importing Libraries**

In [None]:
!pip -q install bertopic sentence-transformers umap-learn hdbscan scikit-learn
!pip install praw
import re
from datetime import datetime, timezone

import pandas as pd
import praw
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# **Fetching Posts Based on Selected Keywords and Date Window**

In [None]:
# Search terms used to find academically-related posts; each keyword is
# issued as a separate subreddit search below.
keywords = [
    "essay",
    "assignment",
    "homework",
    "exam",
    "grading",
    "plagiarism",
    "cheating",
    "student",
    "university",
    "professor",
    "ai detection",
    "academic integrity"
]

# Upper bound on the number of unique posts to keep.
target_total = 6500
# Maximum number of results requested per keyword search.
# NOTE: 12 keywords x 400 results is at most 4800 posts, so `target_total`
# cannot actually be reached with these settings; it only acts as a cap.
PER_KEYWORD_LIMIT = 400

# Column layout of the resulting frame; passing it explicitly keeps the
# frame well-formed (and `drop_duplicates` valid) even if nothing is collected.
POST_COLUMNS = ["post_id", "title", "selftext", "created_utc", "text"]

data = []
seen_ids = set()

# NOTE(review): `reddit` is not created anywhere in the visible notebook, so
# this cell fails with NameError on a fresh kernel unless an earlier cell
# builds the praw.Reddit client. Load its credentials from the environment;
# never hardcode them in the notebook.
subreddit = reddit.subreddit("ChatGPT")

# Inclusive calendar-day window (interpreted in UTC) for post creation time.
start_date = datetime(2025, 7, 1)
end_date   = datetime(2025, 12, 31)

for keyword in keywords:
    if len(data) >= target_total:
        break

    for submission in subreddit.search(
        query=keyword,
        sort="new",
        time_filter="all",
        limit=PER_KEYWORD_LIMIT
    ):
        if len(data) >= target_total:
            break

        # The same post can match several keywords; keep it only once.
        if submission.id in seen_ids:
            continue

        # `created_utc` is an epoch timestamp in UTC. Convert it with an
        # explicit UTC timezone so the window does not depend on the machine's
        # local timezone, and compare calendar days so that posts made at any
        # time on `end_date` are included.
        created_time = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
        if not (start_date.date() <= created_time.date() <= end_date.date()):
            continue

        text = f"{submission.title} {submission.selftext}".strip()
        if not text:
            continue

        seen_ids.add(submission.id)
        data.append({
            "post_id": submission.id,
            "title": submission.title,
            "selftext": submission.selftext,
            "created_utc": submission.created_utc,
            # Combined title + body, consumed by the preprocessing step.
            "text": text,
        })

df = pd.DataFrame(data, columns=POST_COLUMNS).drop_duplicates(subset="post_id")
print(f"Total Collected Posts {len(df)}")

df.head()


# **Preprocessing and BERTopic Modelling**

In [None]:
def clean(text):
    """Normalise a post's text for embedding and topic modelling.

    Lower-cases the text, removes URLs, and collapses every run of
    whitespace into a single space.

    Parameters
    ----------
    text : Any
        Raw text. Non-string values are converted with ``str``; ``None`` and
        float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    # Missing values would otherwise become the literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Only strip real URLs: require the scheme separator ("http://",
    # "https://") or a leading "www." so ordinary words such as "http-based"
    # or "awwww" are left intact.
    text = re.sub(r"https?://\S+|\bwww\.\S+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Minimum length (in characters) a cleaned post must have to be modelled.
MIN_CLEAN_CHARS = 30

# Work on a copy so the collected frame `df` stays untouched.
df_model = df.copy()

# The collected frame may not carry a combined "text" column; rebuild it from
# the title and body so the lookup below cannot fail with KeyError.
if "text" not in df_model.columns:
    df_model["text"] = (
        df_model["title"].fillna("").astype(str)
        + " "
        + df_model["selftext"].fillna("").astype(str)
    ).str.strip()

df_model["clean_text"] = df_model["text"].apply(clean)
# Drop posts that are too short to carry a usable topic signal.
df_model = df_model[df_model["clean_text"].str.len() >= MIN_CLEAN_CHARS].reset_index(drop=True)
docs = df_model["clean_text"].tolist()

print(f"Cleaned Posts: {len(docs)}")

if not docs:
    raise ValueError("No documents left after cleaning; check the collection step.")


# Term extraction for topic representations: English stop words removed,
# unigrams and bigrams, terms must occur in at least 5 documents.
# NOTE(review): BERTopic presumably applies this vectorizer to per-topic
# aggregated documents, so `min_df=5` may be strict when only a dozen topics
# remain — TODO confirm against the BERTopic documentation.
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5
)

# Pre-compute sentence embeddings once and hand them to BERTopic.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(docs, show_progress_bar=True)

# NOTE(review): no random seed is set here, so topic assignments may differ
# between runs — TODO confirm and pin a seed if reproducibility is required.
topic_model = BERTopic(
    language="english",
    vectorizer_model=vectorizer,
    nr_topics="auto",
    verbose=True
)

# Keep the automatically reduced assignment for comparison with the final one.
topics_raw, _ = topic_model.fit_transform(docs, embeddings)
df_model["topic_raw"] = topics_raw


# Number of topics requested from the explicit reduction step below.
TARGET_TOPICS = 12

topic_model.reduce_topics(
    docs,
    nr_topics=TARGET_TOPICS
)

# `topics_` is read after `reduce_topics`, so it holds the post-reduction labels.
df_model["topic"] = topic_model.topics_
df_model.to_csv(
    "Posts_With_Topics.csv",
    index=False,
    encoding="utf-8-sig"
)


print("File Saved")




# **Visualisation**

In [None]:
# Show the top terms per topic; reuse TARGET_TOPICS so the chart stays in
# step with the number of topics requested from the reduction step.
topic_model.visualize_barchart(top_n_topics=TARGET_TOPICS)