In [15]:
#pip install chardet


In [26]:
import chardet

In [27]:
import pandas as pd

In [28]:


# Read a small portion of the file to detect encoding
with open("steemit_tsv.zip", "rb") as f:
    raw_data = f.read(10000)  # Read first 10KB of data
result = chardet.detect(raw_data)
print(result)  # Output will contain the detected encoding


{'encoding': None, 'confidence': 0.0, 'language': None}


In [37]:
import pandas as pd
import os
import time

# Measure the total processing time
start_time = time.time()

# Input folder containing the .tsv files
input_folder = "steemit_tsv.zip_extracted"

# Output folder for filtered results
output_folder = "steemit_tsv_filtered_output"
os.makedirs(output_folder, exist_ok=True)  # Create the output folder if it doesn't exist

# Process files one by one
files = [f for f in os.listdir(input_folder) if f.endswith(".tsv")]

for i, file_name in enumerate(files):
    file_path = os.path.join(input_folder, file_name)
    try:
        print(f"Processing file {i + 1}/{len(files)}: {file_name}")

        # Read the file, skipping invalid lines
        df = pd.read_csv(file_path, sep="\t", header=None, on_bad_lines="skip", encoding="utf-8")

        # Rename columns for easier access
        df.columns = [f"index{i}" for i in range(df.shape[1])]

        # Filter rows where index2 is "comment"
        filtered_df = df[df["index2"].str.lower() == "comment"]

        # Select relevant columns: index1 (timestamp), index9 (title), index10 (input)
        selected_columns = filtered_df[["index1", "index9", "index10"]].copy()

        # Rename columns
        selected_columns.rename(
            columns={"index1": "timestamp", "index9": "title", "index10": "input"},
            inplace=True,
        )

        # Replace NaN values in "title" and "input" for safe concatenation
        selected_columns["title"] = selected_columns["title"].fillna("N/A")
        selected_columns["input"] = selected_columns["input"].fillna("N/A")

        # Create the "text" column by concatenating title and input
        selected_columns["text"] = selected_columns["title"] + ". " + selected_columns["input"]

        # Create the "concatenated_text" column
        selected_columns["concatenated_text"] = selected_columns["title"] + " . " + selected_columns["text"]

        # Replace NaN values in the original columns for clarity
        selected_columns = selected_columns.fillna("N/A")

        # Rearrange columns for clarity
        output_df = selected_columns[["timestamp", "title", "text", "concatenated_text"]]

        # Save the output to a CSV file in the output folder
        output_file = os.path.join(output_folder, f"filtered_{file_name.replace('.tsv', '.csv')}")
        output_df.to_csv(output_file, index=False)
        print(f"Filtered comments saved to: {output_file}")

        # Print a sample of the first file (first 10 rows only)
        if i == 0:
            print("\nSample output for the first file:")
            print(output_df.head(10).to_string(index=False))
            print()

    except FileNotFoundError:
        print(f"File '{file_name}' not found in the folder '{input_folder}'. Skipping...")
    except pd.errors.ParserError as e:
        print(f"Error parsing file '{file_name}': {e}. Skipping...")
    except Exception as e:
        print(f"Unexpected error with file '{file_name}': {e}. Skipping...")

end_time = time.time()
total_time = end_time - start_time

print(f"\nTotal time to process all files: {total_time:.2f} seconds.")



Processing file 1/263: steemit_2024-07-21.tsv
Filtered comments saved to: steemit_tsv_filtered_output/filtered_steemit_2024-07-21.csv

Sample output for the first file:
 timestamp                                            title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    