In [1]:
import os
import pandas as pd
import random

from tqdm import tqdm
import re

In [2]:
output_folder = "../dataset/twitter100m_tweets_filtered/data"
os.makedirs(output_folder, exist_ok=True)

In [3]:

keywords = [
   
    "climatechange", "climatechangeisreal", "actonclimate", "globalwarming",
    "savetheplanet", "climatejustice", "climateaction", "climatecrisis", "climatemarch",
    "renewableenergy", "fossilfree", "netzero", "sustainablefuture", "climatepolicy",
    "greennewdeal", "environmentaljustice", "keepitintheground", "climateemergency",
    
   
    "climatechangehoax", "climatedeniers", "climatechangeisfalse", "globalwarminghoax",
    "climatechangenotreal", "climatealarmism", "climatescammers", "globalcooling",
    "carbonhoax", "climatescam", "greenhoax", "warmingmyth", "fakenewsclimate",
    
    
    "greenhouse gases", "carbon footprint", "extreme weather", "sea level rise",
    "melting ice caps", "global temperature", "carbon dioxide emissions", "Paris Agreement",
    "IPCC report", "biodiversity loss", "deforestation", "renewable resources",
    "climate adaptation", "climate mitigation", "ecological disaster", "climate economy",
    
   
    "hurricane", "wildfires", "drought", "floods", "polar vortex", "heatwave",
]


pattern = re.compile(r"\b(" + "|".join(keywords) + r")\b", flags=re.IGNORECASE)


def filter_relevant_tweets(tweet):
    if pattern.search(tweet):
        return True
    return False



In [5]:

files = [file for file in os.listdir(output_folder) if file.endswith(".parquet")]


for file in tqdm(files, desc="Processing Parquet Files", ncols=100, bar_format="{l_bar}{bar} [Elapsed: {elapsed} | Remaining: {remaining}]"):
    file_path = os.path.join(output_folder, file)
    
    
    df = pd.read_parquet(file_path)
    
   
    relevant_tweets = df[df['tweet'].apply(filter_relevant_tweets)]
    
   
    print(f"\nFile: {file} | Total tweets: {len(df)} | Relevant tweets: {len(relevant_tweets)}")
    
    
    relevant_file_path = os.path.join(output_folder, f"relevant_{file}")
    relevant_tweets.to_parquet(relevant_file_path, index=False)
    print(f"Relevant tweets saved: {relevant_file_path}")

Processing Parquet Files:   2%|▌                                 [Elapsed: 00:38 | Remaining: 36:33]


File: filtered_train-00000-of-00041-3f49db2da17edd5a.parquet | Total tweets: 1453627 | Relevant tweets: 3053
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00000-of-00041-3f49db2da17edd5a.parquet


Processing Parquet Files:   3%|█▏                                [Elapsed: 01:16 | Remaining: 35:46]


File: filtered_train-00001-of-00041-4491d6daeecba187.parquet | Total tweets: 1477237 | Relevant tweets: 2227
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00001-of-00041-4491d6daeecba187.parquet


Processing Parquet Files:   5%|█▋                                [Elapsed: 01:56 | Remaining: 35:37]


File: filtered_train-00002-of-00041-6f8b4e61b330413a.parquet | Total tweets: 1491880 | Relevant tweets: 2345
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00002-of-00041-6f8b4e61b330413a.parquet


Processing Parquet Files:   7%|██▎                               [Elapsed: 02:35 | Remaining: 35:08]


File: filtered_train-00003-of-00041-0697f448acaad204.parquet | Total tweets: 1487113 | Relevant tweets: 2076
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00003-of-00041-0697f448acaad204.parquet


Processing Parquet Files:   9%|██▊                               [Elapsed: 03:16 | Remaining: 34:58]


File: filtered_train-00004-of-00041-93670d3f00e7702a.parquet | Total tweets: 1498448 | Relevant tweets: 2302
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00004-of-00041-93670d3f00e7702a.parquet


Processing Parquet Files:  10%|███▍                              [Elapsed: 03:53 | Remaining: 33:42]


File: filtered_train-00005-of-00041-bd0a45674af16df9.parquet | Total tweets: 1446150 | Relevant tweets: 2277
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00005-of-00041-bd0a45674af16df9.parquet


Processing Parquet Files:  12%|███▉                              [Elapsed: 04:33 | Remaining: 33:13]


File: filtered_train-00006-of-00041-1eba3fa272d4cc5a.parquet | Total tweets: 1494555 | Relevant tweets: 2143
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00006-of-00041-1eba3fa272d4cc5a.parquet


Processing Parquet Files:  14%|████▌                             [Elapsed: 05:10 | Remaining: 32:01]


File: filtered_train-00007-of-00041-83618ac42672cdb2.parquet | Total tweets: 1464380 | Relevant tweets: 2440
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00007-of-00041-83618ac42672cdb2.parquet


Processing Parquet Files:  16%|█████                             [Elapsed: 05:47 | Remaining: 31:00]


File: filtered_train-00008-of-00041-13bc872663c28c2c.parquet | Total tweets: 1462986 | Relevant tweets: 2403
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00008-of-00041-13bc872663c28c2c.parquet


Processing Parquet Files:  17%|█████▋                            [Elapsed: 06:24 | Remaining: 30:14]


File: filtered_train-00009-of-00041-832bcbc6df883d94.parquet | Total tweets: 1446194 | Relevant tweets: 1830
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00009-of-00041-832bcbc6df883d94.parquet


Processing Parquet Files:  19%|██████▎                           [Elapsed: 07:00 | Remaining: 29:12]


File: filtered_train-00010-of-00041-39a461f69a92fa95.parquet | Total tweets: 1454482 | Relevant tweets: 2171
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00010-of-00041-39a461f69a92fa95.parquet


Processing Parquet Files:  21%|██████▊                           [Elapsed: 07:36 | Remaining: 28:17]


File: filtered_train-00011-of-00041-d348ab50c204b5fe.parquet | Total tweets: 1466034 | Relevant tweets: 3365
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00011-of-00041-d348ab50c204b5fe.parquet


Processing Parquet Files:  22%|███████▍                          [Elapsed: 08:13 | Remaining: 27:38]


File: filtered_train-00012-of-00041-6e54d5bc32e33932.parquet | Total tweets: 1507293 | Relevant tweets: 2257
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00012-of-00041-6e54d5bc32e33932.parquet


Processing Parquet Files:  24%|███████▉                          [Elapsed: 08:50 | Remaining: 27:01]


File: filtered_train-00013-of-00041-0d2774f64ea97a49.parquet | Total tweets: 1489580 | Relevant tweets: 2499
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00013-of-00041-0d2774f64ea97a49.parquet


Processing Parquet Files:  26%|████████▌                         [Elapsed: 09:26 | Remaining: 26:16]


File: filtered_train-00014-of-00041-f7aba38896b21cac.parquet | Total tweets: 1450605 | Relevant tweets: 3064
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00014-of-00041-f7aba38896b21cac.parquet


Processing Parquet Files:  28%|█████████                         [Elapsed: 10:03 | Remaining: 25:37]


File: filtered_train-00015-of-00041-74ec967e210e13de.parquet | Total tweets: 1449995 | Relevant tweets: 2922
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00015-of-00041-74ec967e210e13de.parquet


Processing Parquet Files:  29%|█████████▋                        [Elapsed: 10:38 | Remaining: 24:50]


File: filtered_train-00016-of-00041-98ae0cf4e8a10ffa.parquet | Total tweets: 1449255 | Relevant tweets: 2210
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00016-of-00041-98ae0cf4e8a10ffa.parquet


Processing Parquet Files:  31%|██████████▏                       [Elapsed: 11:14 | Remaining: 24:04]


File: filtered_train-00017-of-00041-a3582ea51ad90339.parquet | Total tweets: 1441643 | Relevant tweets: 2449
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00017-of-00041-a3582ea51ad90339.parquet


Processing Parquet Files:  33%|██████████▊                       [Elapsed: 11:51 | Remaining: 23:43]


File: filtered_train-00018-of-00041-64af2b13d642caac.parquet | Total tweets: 1482168 | Relevant tweets: 2548
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00018-of-00041-64af2b13d642caac.parquet


Processing Parquet Files:  34%|███████████▍                      [Elapsed: 12:27 | Remaining: 22:54]


File: filtered_train-00019-of-00041-294207d1b5c8e9e4.parquet | Total tweets: 1445759 | Relevant tweets: 2268
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00019-of-00041-294207d1b5c8e9e4.parquet


Processing Parquet Files:  36%|███████████▉                      [Elapsed: 13:07 | Remaining: 23:00]


File: filtered_train-00020-of-00041-cb49f7a05f4c5137.parquet | Total tweets: 1492458 | Relevant tweets: 3958
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00020-of-00041-cb49f7a05f4c5137.parquet


Processing Parquet Files:  38%|████████████▌                     [Elapsed: 13:45 | Remaining: 22:30]


File: filtered_train-00021-of-00041-8349c19216ba168c.parquet | Total tweets: 1461163 | Relevant tweets: 2606
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00021-of-00041-8349c19216ba168c.parquet


Processing Parquet Files:  40%|█████████████                     [Elapsed: 14:23 | Remaining: 22:04]


File: filtered_train-00022-of-00041-35bee338f1d42d15.parquet | Total tweets: 1474783 | Relevant tweets: 2870
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00022-of-00041-35bee338f1d42d15.parquet


Processing Parquet Files:  41%|█████████████▋                    [Elapsed: 15:00 | Remaining: 21:18]


File: filtered_train-00023-of-00041-51d644a4e2bc84cc.parquet | Total tweets: 1433393 | Relevant tweets: 2275
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00023-of-00041-51d644a4e2bc84cc.parquet


Processing Parquet Files:  43%|██████████████▏                   [Elapsed: 15:38 | Remaining: 20:47]


File: filtered_train-00024-of-00041-dc45f0762d8ed082.parquet | Total tweets: 1458181 | Relevant tweets: 2916
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00024-of-00041-dc45f0762d8ed082.parquet


Processing Parquet Files:  45%|██████████████▊                   [Elapsed: 16:17 | Remaining: 20:16]


File: filtered_train-00025-of-00041-e40d409e31b7a7fd.parquet | Total tweets: 1479044 | Relevant tweets: 2254
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00025-of-00041-e40d409e31b7a7fd.parquet


Processing Parquet Files:  47%|███████████████▎                  [Elapsed: 16:56 | Remaining: 19:52]


File: filtered_train-00026-of-00041-5accc8f1a8490c8c.parquet | Total tweets: 1493491 | Relevant tweets: 2152
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00026-of-00041-5accc8f1a8490c8c.parquet


Processing Parquet Files:  48%|███████████████▉                  [Elapsed: 17:32 | Remaining: 18:50]


File: filtered_train-00027-of-00041-fe8fe3a7157e75a9.parquet | Total tweets: 1446775 | Relevant tweets: 3073
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00027-of-00041-fe8fe3a7157e75a9.parquet


Processing Parquet Files:  50%|████████████████▌                 [Elapsed: 18:11 | Remaining: 18:20]


File: filtered_train-00028-of-00041-97cccb9f87e490bf.parquet | Total tweets: 1463477 | Relevant tweets: 1940
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00028-of-00041-97cccb9f87e490bf.parquet


Processing Parquet Files:  52%|█████████████████                 [Elapsed: 18:47 | Remaining: 17:24]


File: filtered_train-00029-of-00041-4f5d24be1d2cba2f.parquet | Total tweets: 1426950 | Relevant tweets: 2384
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00029-of-00041-4f5d24be1d2cba2f.parquet


Processing Parquet Files:  53%|█████████████████▋                [Elapsed: 19:24 | Remaining: 16:45]


File: filtered_train-00030-of-00041-9712df7d5c3d07f0.parquet | Total tweets: 1491851 | Relevant tweets: 2401
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00030-of-00041-9712df7d5c3d07f0.parquet


Processing Parquet Files:  55%|██████████████████▏               [Elapsed: 20:01 | Remaining: 16:09]


File: filtered_train-00031-of-00041-14095d507331f9db.parquet | Total tweets: 1472817 | Relevant tweets: 2876
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00031-of-00041-14095d507331f9db.parquet


Processing Parquet Files:  57%|██████████████████▊               [Elapsed: 20:39 | Remaining: 15:34]


File: filtered_train-00032-of-00041-c948e5d06f7cc0d5.parquet | Total tweets: 1474152 | Relevant tweets: 3222
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00032-of-00041-c948e5d06f7cc0d5.parquet


Processing Parquet Files:  59%|███████████████████▎              [Elapsed: 21:15 | Remaining: 14:49]


File: filtered_train-00033-of-00041-0ddd38722e42faea.parquet | Total tweets: 1468271 | Relevant tweets: 3116
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00033-of-00041-0ddd38722e42faea.parquet


Processing Parquet Files:  60%|███████████████████▉              [Elapsed: 21:51 | Remaining: 14:06]


File: filtered_train-00034-of-00041-261953080646e6a9.parquet | Total tweets: 1435950 | Relevant tweets: 2428
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00034-of-00041-261953080646e6a9.parquet


Processing Parquet Files:  62%|████████████████████▍             [Elapsed: 22:28 | Remaining: 13:28]


File: filtered_train-00035-of-00041-811009fd57e6e0d0.parquet | Total tweets: 1453873 | Relevant tweets: 2138
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00035-of-00041-811009fd57e6e0d0.parquet


Processing Parquet Files:  64%|█████████████████████             [Elapsed: 23:04 | Remaining: 12:50]


File: filtered_train-00036-of-00041-98ebfbfe9826272d.parquet | Total tweets: 1469448 | Relevant tweets: 2872
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00036-of-00041-98ebfbfe9826272d.parquet


Processing Parquet Files:  66%|█████████████████████▌            [Elapsed: 23:42 | Remaining: 12:16]


File: filtered_train-00037-of-00041-02c67001499091d9.parquet | Total tweets: 1493047 | Relevant tweets: 2082
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00037-of-00041-02c67001499091d9.parquet


Processing Parquet Files:  67%|██████████████████████▏           [Elapsed: 24:19 | Remaining: 11:41]


File: filtered_train-00038-of-00041-08d9366060cda96a.parquet | Total tweets: 1461921 | Relevant tweets: 2507
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00038-of-00041-08d9366060cda96a.parquet


Processing Parquet Files:  69%|██████████████████████▊           [Elapsed: 24:56 | Remaining: 11:07]


File: filtered_train-00039-of-00041-d3dbd276c36d5b35.parquet | Total tweets: 1482133 | Relevant tweets: 3251
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00039-of-00041-d3dbd276c36d5b35.parquet


Processing Parquet Files:  71%|███████████████████████▎          [Elapsed: 25:32 | Remaining: 10:35]


File: filtered_train-00040-of-00041-9a723429a2a70e30.parquet | Total tweets: 1456145 | Relevant tweets: 2375
Relevant tweets saved: ../dataset/twitter100m_tweets_filtered/data\relevant_filtered_train-00040-of-00041-9a723429a2a70e30.parquet





FileNotFoundError: [Errno 2] No such file or directory: '../dataset/twitter100m_tweets_filtered/data\\processed_filtered_train-00000-of-00041-3f49db2da17edd5a.parquet'