In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import os
import pandas as pd
from pandas.api.types import is_integer_dtype, is_float_dtype

In [32]:
import os
import pandas as pd
from pandas.api.types import is_integer_dtype, is_float_dtype

BASE = "/content/drive/MyDrive/Pushshift"

# фінальні файли
clean_path = f"{BASE}/final_dataset_clean_all_v2.csv"
small_path = f"{BASE}/final_dataset_small_all_v2.csv"

topics = [
    "anxiety",
    "depression",
    "offmychest",
    "covid19",
    "coronavirus",
    "covid19_support",
    "mentalhealthsupport"
]

clean_path, small_path

('/content/drive/MyDrive/Pushshift/final_dataset_clean_all_v2.csv',
 '/content/drive/MyDrive/Pushshift/final_dataset_small_all_v2.csv')

In [33]:
def clean_chunk_all(chunk: pd.DataFrame) -> pd.DataFrame:
    # 1. created_utc → datetime
    if 'created_utc' not in chunk.columns:
        # немає часу — такий шматок нам не допоможе
        return pd.DataFrame()

    # якщо timestamp числовий
    if is_integer_dtype(chunk['created_utc']) or is_float_dtype(chunk['created_utc']):
        chunk['created_utc'] = pd.to_datetime(
            chunk['created_utc'],
            unit='s',
            utc=True,
            errors='coerce'
        )
    else:
        # якщо вже str
        chunk['created_utc'] = pd.to_datetime(
            chunk['created_utc'],
            utc=True,
            errors='coerce'
        )

    # прибираємо рядки без дати
    chunk = chunk.dropna(subset=['created_utc'])
    if chunk.empty:
        return pd.DataFrame()

    # 2. month
    chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)

    # 3. вибір текстової колонки
    text_col = None
    for c in ['body', 'selftext', 'text']:
        if c in chunk.columns:
            text_col = c
            break

    if text_col is None:
        return pd.DataFrame()

    chunk['text'] = chunk[text_col].astype('string')
    chunk['text_len'] = chunk['text'].str.len()

    # 4. score → Int64
    if 'score' in chunk.columns:
        chunk['score'] = pd.to_numeric(chunk['score'], errors='coerce')
    else:
        chunk['score'] = pd.NA
    chunk['score'] = chunk['score'].astype('Int64')

    # 5. subreddit, author, id — як string
    if 'subreddit' in chunk.columns:
        chunk['subreddit'] = chunk['subreddit'].astype('string')
    if 'author' in chunk.columns:
        chunk['author'] = chunk['author'].astype('string')
    if 'id' in chunk.columns:
        chunk['id'] = chunk['id'].astype('string')

    # 6. залишаємо потрібні колонки
    keep = ['id', 'author', 'subreddit', 'created_utc',
            'month', 'text', 'text_len', 'score']

    existing = [c for c in keep if c in chunk.columns]
    chunk = chunk[existing]

    return chunk

In [34]:
for p in [clean_path, small_path]:
    if os.path.exists(p):
        os.remove(p)
        print("Removed:", p)

In [36]:
chunksize = 200_000
first_clean = True
first_small = True

total_in = 0
total_out = 0
total_small = 0

for topic in topics:
    topic_dir = os.path.join(BASE, topic)
    if not os.path.isdir(topic_dir):
        print(f"Directory not found: {topic_dir}, skipping")
        continue

    print(f"\n Topic: {topic}")
    files = sorted(f for f in os.listdir(topic_dir) if f.endswith(".csv"))
    print(f"Found {len(files)} files in {topic_dir}")

    for fname in files:
        fpath = os.path.join(topic_dir, fname)
        print(f"  Processing file: {fname}")

        reader = pd.read_csv(
            fpath,
            chunksize=chunksize,
            engine="python",
            on_bad_lines="skip"
        )

        for i, raw_chunk in enumerate(reader):
            total_in += len(raw_chunk)

            cleaned = clean_chunk_all(raw_chunk)
            if cleaned.empty:
                print(f"    Chunk {i}: empty after cleaning, skipped")
                continue

            total_out += len(cleaned)


            mode = "w" if first_clean else "a"
            cleaned.to_csv(clean_path, mode=mode, header=first_clean, index=False)
            first_clean = False


            sample_frac = 0.05
            sample = cleaned.sample(frac=sample_frac, random_state=42) if len(cleaned) > 0 else cleaned
            if not sample.empty:
                mode_s = "w" if first_small else "a"
                sample.to_csv(small_path, mode=mode_s, header=first_small, index=False)
                first_small = False
                total_small += len(sample)

            print(f"    Chunk {i}: raw={len(raw_chunk)}, cleaned={len(cleaned)}, total_cleaned={total_out}, total_small={total_small}")

print("Total raw rows seen:", total_in)
print("Total cleaned rows:", total_out)
print("Total sample rows:", total_small)
print("Clean file saved to:", clean_path)
print("Small file saved to:", small_path)


 Topic: anxiety
Found 8 files in /content/drive/MyDrive/Pushshift/anxiety
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=26402, cleaned=26402, total_cleaned=26402, total_small=1320
  Processing file: RC_2020-03.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=44439, cleaned=44439, total_cleaned=70841, total_small=3542
  Processing file: RC_2020-04.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=41104, cleaned=41104, total_cleaned=111945, total_small=5597
  Processing file: RC_2020-05.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=36243, cleaned=36243, total_cleaned=148188, total_small=7409
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=33293, cleaned=33293, total_cleaned=181481, total_small=9074
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=34120, cleaned=34120, total_cleaned=215601, total_small=10780
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=31716, cleaned=31716, total_cleaned=247317, total_small=12366
  Processing file: RC_2020-09.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=29077, cleaned=29077, total_cleaned=276394, total_small=13820

 Topic: depression
Found 8 files in /content/drive/MyDrive/Pushshift/depression
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=69660, cleaned=69660, total_cleaned=346054, total_small=17303
  Processing file: RC_2020-03.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=64987, cleaned=64987, total_cleaned=411041, total_small=20552
  Processing file: RC_2020-04 (1).csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=60620, cleaned=60620, total_cleaned=471661, total_small=23583
  Processing file: RC_2020-05 (1).csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=61420, cleaned=61420, total_cleaned=533081, total_small=26654
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=64539, cleaned=64539, total_cleaned=597620, total_small=29881
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=64212, cleaned=64212, total_cleaned=661832, total_small=33092
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=55660, cleaned=55660, total_cleaned=717492, total_small=35875
  Processing file: RC_2020-09.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=50989, cleaned=50989, total_cleaned=768481, total_small=38424

 Topic: offmychest
Found 8 files in /content/drive/MyDrive/Pushshift/offmychest
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=52547, cleaned=52547, total_cleaned=821028, total_small=41051
  Processing file: RC_2020-03.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=68701, cleaned=68701, total_cleaned=889729, total_small=44486
  Processing file: RC_2020-04.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=76495, cleaned=76495, total_cleaned=966224, total_small=48311
  Processing file: RC_2020-05.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=76844, cleaned=76844, total_cleaned=1043068, total_small=52153
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=77700, cleaned=77700, total_cleaned=1120768, total_small=56038
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=83361, cleaned=83361, total_cleaned=1204129, total_small=60206
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=72687, cleaned=72687, total_cleaned=1276816, total_small=63840
  Processing file: RC_2020-09.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=74295, cleaned=74295, total_cleaned=1351111, total_small=67555

 Topic: covid19
Found 8 files in /content/drive/MyDrive/Pushshift/covid19
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=14680, cleaned=14680, total_cleaned=1365791, total_small=68289
  Processing file: RC_2020-03.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=85714, cleaned=85714, total_cleaned=1451505, total_small=72575
  Processing file: RC_2020-04.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=104406, cleaned=104406, total_cleaned=1555911, total_small=77795
  Processing file: RC_2020-05.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=55501, cleaned=55501, total_cleaned=1611412, total_small=80570
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=23510, cleaned=23510, total_cleaned=1634922, total_small=81746
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=24971, cleaned=24971, total_cleaned=1659893, total_small=82995
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=17781, cleaned=17781, total_cleaned=1677674, total_small=83884
  Processing file: RC_2020-09.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=15145, cleaned=15145, total_cleaned=1692819, total_small=84641

 Topic: coronavirus
Found 7 files in /content/drive/MyDrive/Pushshift/coronavirus
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=1892819, total_small=94641


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=200000, cleaned=200000, total_cleaned=2092819, total_small=104641


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 2: raw=77761, cleaned=77761, total_cleaned=2170580, total_small=108529
  Processing file: RC_2020-04.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=2370580, total_small=118529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=200000, cleaned=200000, total_cleaned=2570580, total_small=128529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 2: raw=200000, cleaned=200000, total_cleaned=2770580, total_small=138529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 3: raw=200000, cleaned=200000, total_cleaned=2970580, total_small=148529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 4: raw=200000, cleaned=200000, total_cleaned=3170580, total_small=158529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 5: raw=200000, cleaned=200000, total_cleaned=3370580, total_small=168529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 6: raw=200000, cleaned=200000, total_cleaned=3570580, total_small=178529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 7: raw=200000, cleaned=200000, total_cleaned=3770580, total_small=188529


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 8: raw=127820, cleaned=127820, total_cleaned=3898400, total_small=194920
  Processing file: RC_2020-05.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=4098400, total_small=204920


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=200000, cleaned=200000, total_cleaned=4298400, total_small=214920


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 2: raw=200000, cleaned=200000, total_cleaned=4498400, total_small=224920


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 3: raw=200000, cleaned=200000, total_cleaned=4698400, total_small=234920


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 4: raw=115366, cleaned=115366, total_cleaned=4813766, total_small=240688
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=5013766, total_small=250688


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=200000, cleaned=200000, total_cleaned=5213766, total_small=260688


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 2: raw=157152, cleaned=157152, total_cleaned=5370918, total_small=268546
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=5570918, total_small=278546


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=200000, cleaned=200000, total_cleaned=5770918, total_small=288546


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 2: raw=200000, cleaned=200000, total_cleaned=5970918, total_small=298546


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 3: raw=199262, cleaned=199262, total_cleaned=6170180, total_small=308509
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=6370180, total_small=318509


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=200000, cleaned=200000, total_cleaned=6570180, total_small=328509


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 2: raw=62256, cleaned=62256, total_cleaned=6632436, total_small=331622
  Processing file: RC_2020-09.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=200000, cleaned=200000, total_cleaned=6832436, total_small=341622


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 1: raw=108285, cleaned=108285, total_cleaned=6940721, total_small=347036

 Topic: covid19_support
Found 8 files in /content/drive/MyDrive/Pushshift/covid19_support
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=366, cleaned=366, total_cleaned=6941087, total_small=347054
  Processing file: RC_2020-03.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=20382, cleaned=20382, total_cleaned=6961469, total_small=348073
  Processing file: RC_2020-04.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=13959, cleaned=13959, total_cleaned=6975428, total_small=348771
  Processing file: RC_2020-05.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=8717, cleaned=8717, total_cleaned=6984145, total_small=349207
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=5563, cleaned=5563, total_cleaned=6989708, total_small=349485
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=7727, cleaned=7727, total_cleaned=6997435, total_small=349871
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=7363, cleaned=7363, total_cleaned=7004798, total_small=350239
  Processing file: RC_2020-09.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=6759, cleaned=6759, total_cleaned=7011557, total_small=350577

 Topic: mentalhealthsupport
Found 8 files in /content/drive/MyDrive/Pushshift/mentalhealthsupport
  Processing file: RC_2020-02.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=514, cleaned=514, total_cleaned=7012071, total_small=350603
  Processing file: RC_2020-03.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=640, cleaned=640, total_cleaned=7012711, total_small=350635
  Processing file: RC_2020-04.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=867, cleaned=867, total_cleaned=7013578, total_small=350678
  Processing file: RC_2020-05.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=895, cleaned=895, total_cleaned=7014473, total_small=350723
  Processing file: RC_2020-06.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=1110, cleaned=1110, total_cleaned=7015583, total_small=350779
  Processing file: RC_2020-07.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=1206, cleaned=1206, total_cleaned=7016789, total_small=350839
  Processing file: RC_2020-08.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


    Chunk 0: raw=1232, cleaned=1232, total_cleaned=7018021, total_small=350901
  Processing file: RC_2020-09.csv
    Chunk 0: raw=1295, cleaned=1295, total_cleaned=7019316, total_small=350966
Total raw rows seen: 7019316
Total cleaned rows: 7019316
Total sample rows: 350966
Clean file saved to: /content/drive/MyDrive/Pushshift/final_dataset_clean_all_v2.csv
Small file saved to: /content/drive/MyDrive/Pushshift/final_dataset_small_all_v2.csv


  chunk['month'] = chunk['created_utc'].dt.to_period('M').astype(str)


In [37]:
df_small = pd.read_csv(small_path)

df_small['created_utc'] = pd.to_datetime(df_small['created_utc'], utc=True, errors='coerce')
df_small['month'] = df_small['created_utc'].dt.to_period('M').astype(str)

print("Shape:", df_small.shape)
print("\nMonths:")
print(df_small['month'].value_counts().sort_index())

print("\nSubreddits:")
print(df_small['subreddit'].value_counts())

  df_small['month'] = df_small['created_utc'].dt.to_period('M').astype(str)


Shape: (350966, 8)

Months:
month
2020-02     32096
2020-03     14243
2020-04    101263
2020-05     57749
2020-06     38145
2020-07     50743
2020-08     32435
2020-09     24292
Name: count, dtype: int64

Subreddits:
subreddit
Coronavirus            262395
offmychest              29131
depression              24604
COVID19                 17086
Anxiety                 13820
COVID19_support          3541
MentalHealthSupport       389
Name: count, dtype: int64
