https://www.reddit.com/r/pushshift/comments/ajmcc0/comment/ef012vk/

In [11]:
import csv
import os
import requests
import ujson
import zstandard as zstd

# Define the base URL
url = "https://files.pushshift.io/reddit/submissions/"

# Define the range of years and months to download, the start_month is only used for the first year, after that it will start from January.
start_year = 2021
end_year = 2022
start_month = 12
end_month = 12

current_year = start_year

# Create a directory to store the dumpfiles
if not os.path.exists("dumpfiles"):
    os.makedirs("dumpfiles")
    

# Create the CSV file

with open('../all_raw_csv/trippinthroughtime_data_12_2021.csv', mode='w', newline='') as csv_file:
    fieldnames = ['score', "author", 'total_awards_received', 'created_utc', 'num_comments', 'selftext', 'title', 'url', "domain", "permalink", "id", "subreddit_subscribers","num_crossposts", "relative_path"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
        # Rest of the code as is

    # Set the total number of files to download
    total_files = (end_year - start_year + 1) * 12
    processed_files = 0
    # Iterate through the years and months
    for year in range(start_year, end_year+1):
        if current_year == year:
            month_start = start_month
        else:
            month_start = 1
        for month in range(month_start, end_month+1):
            # Create the filename
            filename = f"RS_{year}-{month:02d}.zst"
            file_url = url + filename
            # Check if the file already exists in the dumpfiles directory
            if os.path.exists(f"dumpfiles/{filename}"):
                print(f"{filename} already exists, skipping download.")
            else:
            # Build the request
                r = requests.get(file_url, stream=True)
                # Check if the request is successful
                if r.status_code != 200:
                    print(f"{filename} not found.")
                    continue
            # Write the file to disk
                with open(f"dumpfiles/{filename}", 'wb') as f:
                    file_size = int(r.headers.get("Content-Length", 0))
                    downloaded = 0
                    for data in r.iter_content(4096):
                        downloaded += len(data)
                        f.write(data)
                        done = int(50 * downloaded / file_size)
                        print(f"\r{filename} [{'=' * done}{' ' * (50-done)}] {downloaded/1048576:.2f}/{file_size/1048576:.2f} MB", end="")
                print(f"\n{filename} downloaded.")
            with open(f"dumpfiles/{filename}", 'rb') as fh:
                dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
                with dctx.stream_reader(fh) as reader:
                    previous_line = ""
                    while True:
                        chunk = reader.read(2**24)  # 16mb chunks
                        if not chunk:
                            break
                        try:
                            string_data = chunk.decode('utf-8')
                        except:
                            string_data = chunk.decode('latin-1')
                        lines = string_data.split("\n")

                        for i, line in enumerate(lines[:-1]):
                            if i == 0:
                                line = previous_line + line
                            try:
                                object = ujson.loads(line)
                                if object["subreddit"] == 'trippinthroughtime':
                                    writer.writerow({'score': object.get('score',''), 'author': object.get('author',''), 'domain': object.get('domain',''), 'num_crossposts': object.get('num_crossposts',''), 'total_awards_received': object.get('total_awards_received',''), 'created_utc': object.get('created_utc',''), 'num_comments': object.get('num_comments',''), 'selftext': object.get('selftext',''), 'title': object.get('title',''), 'url': object.get('url',''), 'permalink': object.get('permalink',''), 'id': object.get('id',''), 'subreddit_subscribers': object.get('subreddit_subscribers',''),})   
                            except KeyError:
                                pass
                            except ValueError as e:
                                
                                pass
                # delete the file after it was used
                os.remove(f"dumpfiles/{filename}")
                processed_files += 1
                print(f"Processed files: {processed_files}/{total_files}")

                            



RS_2021-12.zst downloaded.
Processed files: 1/24
RS_2022-01.zst downloaded.
Processed files: 2/24
RS_2022-02.zst downloaded.
Processed files: 3/24
RS_2022-03.zst downloaded.
Processed files: 4/24
RS_2022-04.zst downloaded.
Processed files: 5/24
RS_2022-05.zst downloaded.
Processed files: 6/24
RS_2022-06.zst downloaded.
Processed files: 7/24
RS_2022-07.zst downloaded.
Processed files: 8/24
RS_2022-08.zst downloaded.
Processed files: 9/24
RS_2022-09.zst downloaded.
Processed files: 10/24
RS_2022-10.zst downloaded.
Processed files: 11/24
RS_2022-11.zst downloaded.
Processed files: 12/24
RS_2022-12.zst downloaded.
Processed files: 13/24


In [2]:
import pandas as pd

df_alex = pd.read_csv('../all_raw_csv/trippinthroughtime_data_all.csv', encoding='latin-1')
df_2015 = pd.read_csv('../all_raw_csv/trippinthroughtime_data2015.csv', encoding='latin-1')
df_2016 = pd.read_csv('../all_raw_csv/trippinthroughtime_data2016-2018.csv', encoding='latin-1')

In [3]:
print(df_alex.shape)
print(df_2015.shape)
print(df_2016.shape)

(40366, 14)
(1025, 14)
(14061, 14)


In [7]:
df_concat = pd.concat([df_alex, df_2015, df_2016])

In [8]:
print(df_concat.shape)

(55452, 14)


In [9]:
df_concat.columns

Index(['score', 'author', 'total_awards_received', 'created_utc',
       'num_comments', 'selftext', 'title', 'url', 'domain', 'permalink', 'id',
       'subreddit_subscribers', 'num_crossposts', 'relative_path'],
      dtype='object')

In [None]:
df_concat.to_csv('../all_raw_csv/trippinthroughtime_data_all_years.csv')