<a href="https://colab.research.google.com/github/ElisaMisu/KinkyBERT/blob/main/gwstories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Correct installation of necessary packages
!pip install zstandard bertopic sentence-transformers nltk

# Import necessary libraries
from google.colab import drive
import os
import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import zstandard as zstd

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Mount Google Drive
drive.mount('/content/drive')

# Define the base directory and the target file
base_dir = '/content/drive/MyDrive/subreddits23'
target_file = 'gonewildstories_submissions.json'

# Ensure the target file is in the directory
if target_file not in os.listdir(base_dir):
    print(f"{target_file} not found in the directory.")
else:
    print(f"Found target file: {target_file}")

#define file path

file_path = os.path.join(base_dir, target_file)

# Set environment variable to disable parallelism in tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Function to decompress a .zst file and save it
#def decompress_and_save(file_path, output_path):
   # try:
    #    with open(file_path, 'rb') as compressed_file:
     #       dctx = zstd.ZstdDecompressor()
      #      with open(output_path, 'wb') as decompressed_file:
       #         dctx.copy_stream(compressed_file, decompressed_file)
        #print(f"Decompressed and saved: {output_path}")
        #return output_path
    #except Exception as e:
     #   print(f"Error processing {file_path}: {e}")
      #  return None

# Decompress the target file
#compressed_file_path = os.path.join(base_dir, target_file)
#ecompressed_file_path = os.path.join(base_dir, target_file.replace('.zst', '.json'))
#output_path = decompress_and_save(compressed_file_path, decompressed_file_path)

#if output_path is not None:
   # print(f"Decompressed file: {output_path}")




Collecting zstandard
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting umap-learn>=0.5.0 (from bertopic)
  Downloading

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive
Found target file: gonewildstories_submissions.json


In [2]:
# Function to preprocess text
reddit_terms = set([
    'upvote', 'downvote', 'OP', 'TL;DR', 'tldr', 'edit', 'mods', 'moderator',
    'AMA', 'ask me anything', 'crosspost', 'x-post', 'nsfw', 'flair', 'flairs',
    'karma', 'subreddit', 'thread', 'username', 'usernames', 'selfpost', 'self-post',
    'comment', 'comments', 'post', 'posts', 'reply', 'replies', 'vote', 'votes', 'deleted', 'delete',
    'acct', 'expired', 'account', 'verified', 'reported', 'report', 'remove', 'removed',
    'modmail', 'combined', 'submissions'
])

def preprocess_text(text):
    if isinstance(text, str):
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove special characters and digits
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        # Remove stopwords and common Reddit terms
        stop_words = set(stopwords.words('english'))
        text = ' '.join(word for word in text.split() if word not in stop_words and word not in reddit_terms)
        return text
    else:
        return ""

#Load the decompressed file into a DataFrame and preprocess
with open(file_path, 'r') as file:
    data = [json.loads(line) for line in file]
    df = pd.DataFrame(data)
    if 'selftext' in df.columns:
        df['processed_text'] = df['selftext'].apply(lambda x: preprocess_text(x) if pd.notnull(x) else "")
        # Filter out empty processed texts
        df = df[df['processed_text'].apply(lambda x: isinstance(x, str) and x.strip() != '')]
        print(f"Processed DataFrame:")
        print(df[['processed_text']].head())

# BERTopic Analysis
if not df.empty:
    # Load the SentenceTransformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for the processed text
    embeddings = model.encode(df['processed_text'].tolist(), show_progress_bar=True)

    # Perform BERTopic analysis
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(df['processed_text'], embeddings)


    # Display the topics
    print(topic_model.get_topic_info())

 # Save the BERTopic model to Google Drive
    model_path = "/content/drive/MyDrive/bertopic_model"
    topic_model.save(model_path)
    print(f"BERTopic model saved successfully at {model_path}.")
else:
    print("No data available for BERTopic analysis.")




Processed DataFrame:
                                      processed_text
0  hi everyone thanks joining im looking forward ...
1  girlfriend vacation hotel pool area large saun...
2  im mostly straight guy maybe one closest bestl...
4  purposes story changed names involved protect ...
5  actually wasnt completely sleeping dozing earl...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1969 [00:00<?, ?it/s]



     Topic  Count                                   Name  \
0       -1  44705                 -1_back_like_time_cock   
1        0   2947                 0_cock_clit_feel_pussy   
2        1   1118                1_office_desk_work_boss   
3        2    838                  2_cum_love_clit_pussy   
4        3    717              3_beach_water_pool_bikini   
..     ...    ...                                    ...   
324    323     10  323_pathetic_bestfriend_kinda_friends   
325    324     10         324_beth_booth_hole_permission   
326    325     10       325_submissive_skye_eva_dominant   
327    326     10                  326_tess_xb_mike_slap   
328    327     10                 327_ann_linda_amy_gabe   

                                        Representation  \
0    [back, like, time, cock, one, could, said, got...   
1    [cock, clit, feel, pussy, cum, tongue, mouth, ...   
2    [office, desk, work, boss, job, working, day, ...   
3    [cum, love, clit, pussy, wet, dildo, orgas

In [3]:
# Visualize the topics and other visualizations after loading the model
topic_model.visualize_hierarchy(top_n_topics=30)
topic_model.visualize_heatmap()

In [4]:
# Further reduce topics
topic_model.reduce_topics(df['processed_text'].tolist(), nr_topics=30)

# Access updated topics
topics = topic_model.topics_

# Display the topics
print(topic_model.get_topic_info())


    Topic  Count                                      Name  \
0      -1  44705                    -1_back_cock_like_time   
1       0  14875                    0_back_cock_time_could   
2       1   2340                    1_got_started_cum_time   
3       2    311              2_stories_story_write_anyone   
4       3     85                  3_fucking_said_like_time   
5       4     69                   4_peach_miss_peachs_ass   
6       5     68                    5_feet_foot_toes_razor   
7       6     63                6_hack_turtles_ask_adfasdf   
8       7     45                 7_dm_hdjd_question_hottie   
9       8     44  8_contest_theme_monthly_rgonewildstories   
10      9     43                9_henry_henrys_sophie_nick   
11     10     42                      10_desk_love_toy_joi   
12     11     29               11_annalise_know_us_caitlin   
13     12     26                12_pee_piss_bladder_peeing   
14     13     25        13_george_friends_brother_remember   
15     1

In [5]:
# Define titles and reduced_embeddings
titles = df['selftext'].tolist()
reduced_embeddings = topic_model.embeddings_


# Visualize and save the new model
visualizations = {
    "Documents": topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings),
    "Topics": topic_model.visualize_topics(),
    "Hierarchy": topic_model.visualize_hierarchy(),
    "Barchart": topic_model.visualize_barchart(),
    "Heatmap": topic_model.visualize_heatmap(),
    "Term Rank": topic_model.visualize_term_rank(),
    "Distribution": topic_model.visualize_distribution(probs),  # Use probs instead of df['processed_text']
    "Hierarchy Top 5": topic_model.visualize_hierarchy(top_n_topics=5)
}

# Save the BERTopic model to Google Drive
model_path = "/content/drive/MyDrive/bertopic_model_reduced"
topic_model.save(model_path)
print(f"BERTopic model with reduced topics saved successfully at {model_path}.")

# Save and display visualizations
for name, viz in visualizations.items():
    html_path = f"/content/drive/MyDrive/{name}_visualization.html"
    viz.write_html(html_path)
    print(f"{name} visualization saved to {html_path}")

# Display visualizations in Colab
from IPython.display import IFrame, display

for name in visualizations.keys():
    html_path = f"/content/drive/MyDrive/{name}_visualization.html"
    display(IFrame(src=html_path, width=900, height=800))




BERTopic model with reduced topics saved successfully at /content/drive/MyDrive/bertopic_model_reduced.
Documents visualization saved to /content/drive/MyDrive/Documents_visualization.html
Topics visualization saved to /content/drive/MyDrive/Topics_visualization.html
Hierarchy visualization saved to /content/drive/MyDrive/Hierarchy_visualization.html
Barchart visualization saved to /content/drive/MyDrive/Barchart_visualization.html
Heatmap visualization saved to /content/drive/MyDrive/Heatmap_visualization.html
Term Rank visualization saved to /content/drive/MyDrive/Term Rank_visualization.html
Distribution visualization saved to /content/drive/MyDrive/Distribution_visualization.html
Hierarchy Top 5 visualization saved to /content/drive/MyDrive/Hierarchy Top 5_visualization.html
