In [3]:
from pathlib import Path
import sys

ROOT = Path.cwd()
if (ROOT / "src").exists():
    sys.path.insert(0, str(ROOT))
elif (ROOT.parent / "src").exists():
    sys.path.insert(0, str(ROOT.parent))
else:
    raise RuntimeError("Could not find 'src' folder near notebook.")

RAW_DIR = Path("data/raw")
PROC_DIR = Path("data/processed")
PROC_DIR.mkdir(parents=True, exist_ok=True)

print("Dirs ready:", RAW_DIR.resolve(), PROC_DIR.resolve())

Dirs ready: /Users/baderrezek/Desktop/Projects/Personal/sims4-sentiment-analysis/notebooks/data/raw /Users/baderrezek/Desktop/Projects/Personal/sims4-sentiment-analysis/notebooks/data/processed


In [None]:
import sys
from pathlib import Path

try:
    ROOT = Path(__file__).resolve().parent.parent
except NameError:
    ROOT = Path.cwd().parent

sys.path.append(str(ROOT))   # makes "src" importable

In [8]:
from pathlib import Path

RAW_DIR = Path(ROOT) / "data" / "raw"
DB_PATH = RAW_DIR / "sims4.db"

print("DB path:", DB_PATH)
print("Exists?", DB_PATH.exists())

DB path: /Users/baderrezek/Desktop/Projects/Personal/sims4-sentiment-analysis/data/raw/sims4.db
Exists? True


In [12]:
# Data Cleaning
import sqlite3
import pandas as pd

conn = sqlite3.connect(DB_PATH)

In [None]:
# Before
print("Posts Table:")
df_posts = pd.read_sql_query("SELECT * FROM posts", conn)
display(df_posts.head(5))

print("\n\nComments Table:")
df_comments = pd.read_sql_query("SELECT * FROM comments", conn)
display(df_comments.head(5))

Unnamed: 0,post_id,created_utc,date,author,title,body,score,num_comments,permalink,subreddit,mode
0,1mqbu8g,b'\xaaC\x9eh\x00\x00\x00\x00',2025-08-14 20:14:34,the-rain-witch,the cat is under the covers with her are you k...,,b'=D\x00\x00\x00\x00\x00\x00',b'\xb6\x00\x00\x00\x00\x00\x00\x00',https://reddit.com/r/Sims4/comments/1mqbu8g/th...,Sims4,top
1,1mv3jnz,b'{<\xa5h\x00\x00\x00\x00',2025-08-20 03:09:47,LPhamster,Welp. My 3 year old turned off my computer. Lo...,I finished making another cake house and I lov...,b'\x8f:\x00\x00\x00\x00\x00\x00',b'\xf4\x00\x00\x00\x00\x00\x00\x00',https://reddit.com/r/Sims4/comments/1mv3jnz/we...,Sims4,top
2,1mtxbiy,b'\x1c\x8b\xa3h\x00\x00\x00\x00',2025-08-18 20:20:44,NewInitiative9498,I love everything about this ridiculously big ...,I was about to sit my male Sim at the computer...,b'@2\x00\x00\x00\x00\x00\x00',b'\xbf\x00\x00\x00\x00\x00\x00\x00',https://reddit.com/r/Sims4/comments/1mtxbiy/i_...,Sims4,top
3,1n2l1np,b'p\xa7\xb0h\x00\x00\x00\x00',2025-08-28 19:01:04,LPhamster,Does this look like a strawberry roll shortcak...,I want to continue my streak of making little ...,b'\xaa0\x00\x00\x00\x00\x00\x00',b'\xd0\x00\x00\x00\x00\x00\x00\x00',https://reddit.com/r/Sims4/comments/1n2l1np/do...,Sims4,top
4,1mn9ghp,b'\x91\xc7\x99h\x00\x00\x00\x00',2025-08-11 10:36:01,VanessaCardui93,I got tired of tracking down fruits and veg so...,Apart from a few debug items everything is hom...,b'\xac.\x00\x00\x00\x00\x00\x00',b'F\x02\x00\x00\x00\x00\x00\x00',https://reddit.com/r/Sims4/comments/1mn9ghp/i_...,Sims4,top


In [35]:
import sys
from pathlib import Path
ROOT = Path.cwd().parent
sys.path.append(str(ROOT))
import importlib
import src.preprocess as p_d

importlib.reload(p_d)

import pandas as pd
from src.preprocess import (
    clean_text_series, tokenize_series,
    fix_bytes_series, created_utc_to_date_series,
    serialize_list_columns
)

In [20]:
# Cleaning Post Data

post_df = pd.read_sql_query("SELECT * FROM posts", conn)

# fixing byte issues
post_df["score"] = fix_bytes_series(post_df["score"])
post_df["num_comments"] = fix_bytes_series(post_df["num_comments"])

# created_utc -> date (UTC)
post_df["created_utc"] = fix_bytes_series(post_df["created_utc"])
post_df["date"] = created_utc_to_date_series(post_df["created_utc"])

# clean title/body without touching author/permalink
post_df["title_clean"] = clean_text_series(post_df["title"], keep_numbers=True, remove_urls=True)
post_df["body_clean"]  = clean_text_series(post_df["body"],  keep_numbers=True, remove_urls=True)

# optional tokens
post_df["title_tokens"] = tokenize_series(post_df["title"], keep_numbers=True, remove_urls=True)
post_df["body_tokens"]  = tokenize_series(post_df["body"],  keep_numbers=True, remove_urls=True)

# Printing posts
display(post_df.head(5))

Unnamed: 0,post_id,created_utc,date,author,title,body,score,num_comments,permalink,subreddit,mode,title_clean,body_clean,title_tokens,body_tokens
0,1mqbu8g,1755202474,2025-08-14 20:14:34+00:00,the-rain-witch,the cat is under the covers with her are you k...,,17469,182,https://reddit.com/r/Sims4/comments/1mqbu8g/th...,Sims4,top,cat cover kid,,"[cat, cover, kid]",[]
1,1mv3jnz,1755659387,2025-08-20 03:09:47+00:00,LPhamster,Welp. My 3 year old turned off my computer. Lo...,I finished making another cake house and I lov...,14991,244,https://reddit.com/r/Sims4/comments/1mv3jnz/we...,Sims4,top,welp 3 year old turn computer lose build look ...,finish make another cake house love forgot cli...,"[welp, 3, year, old, turn, computer, lose, bui...","[finish, make, another, cake, house, love, for..."
2,1mtxbiy,1755548444,2025-08-18 20:20:44+00:00,NewInitiative9498,I love everything about this ridiculously big ...,I was about to sit my male Sim at the computer...,12864,191,https://reddit.com/r/Sims4/comments/1mtxbiy/i_...,Sims4,top,love everything ridiculously big dress,sit male sim computer busy notice wife get sho...,"[love, everything, ridiculously, big, dress]","[sit, male, sim, computer, busy, notice, wife,..."
3,1n2l1np,1756407664,2025-08-28 19:01:04+00:00,LPhamster,Does this look like a strawberry roll shortcak...,I want to continue my streak of making little ...,12458,208,https://reddit.com/r/Sims4/comments/1n2l1np/do...,Sims4,top,look like strawberry roll shortcake house,want continue streak make little pastry house ...,"[look, like, strawberry, roll, shortcake, house]","[want, continue, streak, make, little, pastry,..."
4,1mn9ghp,1754908561,2025-08-11 10:36:01+00:00,VanessaCardui93,I got tired of tracking down fruits and veg so...,Apart from a few debug items everything is hom...,11948,582,https://reddit.com/r/Sims4/comments/1mn9ghp/i_...,Sims4,top,get tired track fruit veg make farmer market,apart debug item everything homegrown sim home...,"[get, tired, track, fruit, veg, make, farmer, ...","[apart, debug, item, everything, homegrown, si..."


In [22]:
# Cleaning Comment Data

comment_df = pd.read_sql_query("SELECT * FROM comments", conn)

# fixing byte issues
comment_df["score"] = fix_bytes_series(comment_df["score"])

# created_utc -> date (UTC)
comment_df["created_utc"] = fix_bytes_series(comment_df["created_utc"])
comment_df["date"] = created_utc_to_date_series(comment_df["created_utc"])

# clean title/body without touching author/permalink
comment_df["body_clean"]  = clean_text_series(comment_df["body"],  keep_numbers=True, remove_urls=True)

# optional tokens
comment_df["body_tokens"]  = tokenize_series(comment_df["body"],  keep_numbers=True, remove_urls=True)

# Printing comments
display(comment_df.head(5))

Unnamed: 0,comment_id,post_id,subreddit,created_utc,date,author,body,score,parent_permalink,body_clean,body_tokens
0,fz1sdeb,hwoao3,,1595557766,2020-07-24 02:29:26+00:00,vukette,Okay now can we bring back bonehilda for plumb...,1370,https://reddit.com/r/thesims/comments/hwoao3/t...,okay bring back bonehilda plumbella plz,"[okay, bring, back, bonehilda, plumbella, plz]"
1,fz0vjeu,hwoao3,,1595540042,2020-07-23 21:34:02+00:00,akerwoods,Justice for cowplants finally,495,https://reddit.com/r/thesims/comments/hwoao3/t...,justice cowplant finally,"[justice, cowplant, finally]"
2,fz0yakx,hwoao3,,1595541400,2020-07-23 21:56:40+00:00,catdadsimmer,this is literally one of the cutest and wholes...,616,https://reddit.com/r/thesims/comments/hwoao3/t...,literally one cut wholesome moment sim,"[literally, one, cut, wholesome, moment, sim]"
3,fz16el9,hwoao3,,1595545544,2020-07-23 23:05:44+00:00,yoop1001,aww i'm so happy for kelly 💗,351,https://reddit.com/r/thesims/comments/hwoao3/t...,aww happy kelly,"[aww, happy, kelly]"
4,fz182vo,hwoao3,,1595546413,2020-07-23 23:20:13+00:00,taikodragonqueen,Seeing her joy at this was so very much wholes...,251,https://reddit.com/r/thesims/comments/hwoao3/t...,see joy much wholesome good bean,"[see, joy, much, wholesome, good, bean]"


In [28]:
print(f"Posts:")
display(post_df.columns)
print(f"\nComments:")
display(comment_df.columns)

Posts:


Index(['post_id', 'created_utc', 'date', 'author', 'title', 'body', 'score',
       'num_comments', 'permalink', 'subreddit', 'mode', 'title_clean',
       'body_clean', 'title_tokens', 'body_tokens'],
      dtype='object')


Comments:


Index(['comment_id', 'post_id', 'subreddit', 'created_utc', 'date', 'author',
       'body', 'score', 'parent_permalink', 'body_clean', 'body_tokens'],
      dtype='object')

In [36]:
"""
Make Preprocessed df under data/preprocessed:
"""

PROC_DIR = ROOT / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

db_path = PROC_DIR / "sims4_cleaned.db"
conn = sqlite3.connect(db_path)

post_df_sql = serialize_list_columns(post_df)
comment_df_sql = serialize_list_columns(comment_df)

# Save tables (replace to overwrite if rerun)
post_df_sql.to_sql("posts", conn, if_exists="replace", index=False)
comment_df_sql.to_sql("comments", conn, if_exists="replace", index=False)

conn.commit()
conn.close()

print("Relational DB ready at:", db_path)

Relational DB ready at: /Users/baderrezek/Desktop/Projects/Personal/sims4-sentiment-analysis/data/processed/sims4_cleaned.db


In [39]:
# Print table heads from the relational DB

PROCESSED_DIR = Path(ROOT) / "data" / "processed"
DB_CLEANED_PATH = PROCESSED_DIR / "sims4_cleaned.db"

print("DB path:", DB_CLEANED_PATH)
print("Exists?", DB_CLEANED_PATH.exists())

conn2 = sqlite3.connect(DB_CLEANED_PATH)

cleaned_df_posts = pd.read_sql_query("SELECT * FROM posts", conn2)
display(cleaned_df_posts.head(5))

cleaned_df_comments = pd.read_sql_query("SELECT * FROM comments", conn2)
display(cleaned_df_posts.head(5))

DB path: /Users/baderrezek/Desktop/Projects/Personal/sims4-sentiment-analysis/data/processed/sims4_cleaned.db
Exists? True


Unnamed: 0,post_id,created_utc,date,author,title,body,score,num_comments,permalink,subreddit,mode,title_clean,body_clean,title_tokens,body_tokens
0,1mqbu8g,1755202474,2025-08-14 20:14:34+00:00,the-rain-witch,the cat is under the covers with her are you k...,,17469,182,https://reddit.com/r/Sims4/comments/1mqbu8g/th...,Sims4,top,cat cover kid,,cat cover kid,
1,1mv3jnz,1755659387,2025-08-20 03:09:47+00:00,LPhamster,Welp. My 3 year old turned off my computer. Lo...,I finished making another cake house and I lov...,14991,244,https://reddit.com/r/Sims4/comments/1mv3jnz/we...,Sims4,top,welp 3 year old turn computer lose build look ...,finish make another cake house love forgot cli...,welp 3 year old turn computer lose build look ...,finish make another cake house love forgot cli...
2,1mtxbiy,1755548444,2025-08-18 20:20:44+00:00,NewInitiative9498,I love everything about this ridiculously big ...,I was about to sit my male Sim at the computer...,12864,191,https://reddit.com/r/Sims4/comments/1mtxbiy/i_...,Sims4,top,love everything ridiculously big dress,sit male sim computer busy notice wife get sho...,love everything ridiculously big dress,sit male sim computer busy notice wife get sho...
3,1n2l1np,1756407664,2025-08-28 19:01:04+00:00,LPhamster,Does this look like a strawberry roll shortcak...,I want to continue my streak of making little ...,12458,208,https://reddit.com/r/Sims4/comments/1n2l1np/do...,Sims4,top,look like strawberry roll shortcake house,want continue streak make little pastry house ...,look like strawberry roll shortcake house,want continue streak make little pastry house ...
4,1mn9ghp,1754908561,2025-08-11 10:36:01+00:00,VanessaCardui93,I got tired of tracking down fruits and veg so...,Apart from a few debug items everything is hom...,11948,582,https://reddit.com/r/Sims4/comments/1mn9ghp/i_...,Sims4,top,get tired track fruit veg make farmer market,apart debug item everything homegrown sim home...,get tired track fruit veg make farmer market,apart debug item everything homegrown sim home...


Unnamed: 0,post_id,created_utc,date,author,title,body,score,num_comments,permalink,subreddit,mode,title_clean,body_clean,title_tokens,body_tokens
0,1mqbu8g,1755202474,2025-08-14 20:14:34+00:00,the-rain-witch,the cat is under the covers with her are you k...,,17469,182,https://reddit.com/r/Sims4/comments/1mqbu8g/th...,Sims4,top,cat cover kid,,cat cover kid,
1,1mv3jnz,1755659387,2025-08-20 03:09:47+00:00,LPhamster,Welp. My 3 year old turned off my computer. Lo...,I finished making another cake house and I lov...,14991,244,https://reddit.com/r/Sims4/comments/1mv3jnz/we...,Sims4,top,welp 3 year old turn computer lose build look ...,finish make another cake house love forgot cli...,welp 3 year old turn computer lose build look ...,finish make another cake house love forgot cli...
2,1mtxbiy,1755548444,2025-08-18 20:20:44+00:00,NewInitiative9498,I love everything about this ridiculously big ...,I was about to sit my male Sim at the computer...,12864,191,https://reddit.com/r/Sims4/comments/1mtxbiy/i_...,Sims4,top,love everything ridiculously big dress,sit male sim computer busy notice wife get sho...,love everything ridiculously big dress,sit male sim computer busy notice wife get sho...
3,1n2l1np,1756407664,2025-08-28 19:01:04+00:00,LPhamster,Does this look like a strawberry roll shortcak...,I want to continue my streak of making little ...,12458,208,https://reddit.com/r/Sims4/comments/1n2l1np/do...,Sims4,top,look like strawberry roll shortcake house,want continue streak make little pastry house ...,look like strawberry roll shortcake house,want continue streak make little pastry house ...
4,1mn9ghp,1754908561,2025-08-11 10:36:01+00:00,VanessaCardui93,I got tired of tracking down fruits and veg so...,Apart from a few debug items everything is hom...,11948,582,https://reddit.com/r/Sims4/comments/1mn9ghp/i_...,Sims4,top,get tired track fruit veg make farmer market,apart debug item everything homegrown sim home...,get tired track fruit veg make farmer market,apart debug item everything homegrown sim home...
