In [None]:

#PREPROCESS STEEMIT DATA SET TO GENERATE TARGET COLUMNS

In [3]:
import pandas as pd
import os
import concurrent.futures
from tqdm import tqdm

# Where the cleaned CSV files are written, and where the raw TSV files are read from
batch_one_folder = "Batch_one"
input_folder = "steemit_tsv.zip_extracted"

# Create the output directory up front; an already existing one is not an error
os.makedirs(name=batch_one_folder, exist_ok=True)

def clean_date(timestamp):
    """Convert a Unix timestamp in seconds to a ``YYYY-MM`` string.

    Args:
        timestamp: Any value ``float()`` accepts, e.g. a number or a
            numeric string.

    Returns:
        The year and month of the timestamp formatted as ``YYYY-MM``, or
        ``"unknown"`` when the value is missing, not numeric, or cannot be
        represented as a pandas timestamp.
    """
    try:
        parsed = pd.to_datetime(float(timestamp), unit='s')
    except (ValueError, TypeError, OverflowError):
        # float() raises ValueError for non-numeric text and TypeError for
        # None / non-numeric objects; values pandas cannot represent are
        # reported as an error here as well.
        return "unknown"
    # A NaN input converts to NaT, which cannot be formatted.
    if pd.isna(parsed):
        return "unknown"
    return parsed.strftime("%Y-%m")

def process_file(file_name):
    """Extract the "comment" rows of one TSV file and save them as CSV.

    The file is read from ``input_folder`` in chunks without a header row;
    columns are named ``index0``, ``index1``, ... by position. Rows whose
    ``index2`` equals "comment" (case-insensitive) are kept, and columns
    ``index1``, ``index9`` and ``index10`` become ``timestamp`` (converted
    with ``clean_date``), ``title`` and ``text``. Rows with a missing title
    or text are dropped, and a ``concatenated_text`` column
    (``title + " . " + text``) is added. The result is written to
    ``batch_one_folder`` as ``filtered_<name>.csv``.

    Args:
        file_name: Name of the TSV file inside ``input_folder``.

    Returns:
        The output path on success, ``None`` when no chunk had the required
        columns (nothing is written), or an ``"Error processing ..."``
        string when an exception occurred.
    """
    file_path = os.path.join(input_folder, file_name)
    output_path = os.path.join(batch_one_folder, f"filtered_{file_name.replace('.tsv', '.csv')}")
    required_columns = ("index1", "index2", "index9", "index10")

    try:
        processed_chunks = []
        # dtype=str keeps every column textual so the .str accessor and the
        # string concatenation below cannot fail on columns that would
        # otherwise be inferred as numeric. The context manager closes the
        # file even if a chunk raises.
        with pd.read_csv(
            file_path,
            sep="\t",
            header=None,
            dtype=str,
            on_bad_lines="skip",
            encoding="utf-8",
            low_memory=True,
            chunksize=10000,
        ) as reader:
            for chunk in reader:
                chunk.columns = [f"index{i}" for i in range(chunk.shape[1])]

                # Skip chunks that lack any column used below.
                if any(column not in chunk.columns for column in required_columns):
                    continue

                filtered_chunk = chunk[chunk["index2"].str.lower() == "comment"]
                selected_columns = filtered_chunk[["index1", "index9", "index10"]].copy()

                selected_columns.rename(columns={"index1": "timestamp", "index9": "title", "index10": "text"}, inplace=True)
                # Drop rows with a missing title or text directly, rather than
                # filling a placeholder and substring-matching it afterwards,
                # which also discarded rows that merely mention "N/A".
                selected_columns = selected_columns.dropna(subset=["title", "text"]).copy()
                selected_columns["timestamp"] = selected_columns["timestamp"].apply(clean_date)
                selected_columns["concatenated_text"] = selected_columns["title"] + " . " + selected_columns["text"]
                processed_chunks.append(selected_columns)

        if processed_chunks:
            final_df = pd.concat(processed_chunks, ignore_index=True)
            final_df.to_csv(output_path, index=False)
            return output_path
        return None
    except Exception as e:
        # Best-effort batch job: report the failure to the caller as a
        # string so one bad file does not abort the whole run.
        return f"Error processing {file_name}: {e}"

# Process all files in parallel
files = [f for f in os.listdir(input_folder) if f.endswith('.tsv')]
print(f"Processing {len(files)} files...")

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_file, files), total=len(files), desc="Processing files"))

# process_file returns failures as "Error processing ..." strings instead of
# raising, so report them here rather than discarding them silently.
failures = [r for r in results if isinstance(r, str) and r.startswith("Error processing ")]
if failures:
    print(f"\n{len(failures)} file(s) failed:")
    for message in failures:
        print(message)

# Display a sample processed DataFrame. Only CSV files are considered, in
# sorted order, and an empty output folder is handled instead of raising
# IndexError.
output_files = sorted(f for f in os.listdir(batch_one_folder) if f.endswith(".csv"))
if output_files:
    sample_file = os.path.join(batch_one_folder, output_files[0])
    sample_df = pd.read_csv(sample_file, nrows=5)
    print("\nSample Processed DataFrame:")
    print(sample_df.head())
else:
    print("\nNo processed files were written.")


Processing 263 files...


Processing files: 100%|██████████| 263/263 [24:36<00:00,  5.62s/it]  


Sample Processed DataFrame:
  timestamp                                              title  \
0   2024-07  SEC-S19 / W2 | Terminología culinaria y uso ad...   
1   2024-07  Cuando voy a Maracaibo...... Que molleja!!. Me...   
2   2024-07                                      빅워크에서 걸음 적립 중   
3   2024-07                                             손바닥 지압   
4   2024-07  The Diary Game | Lunes 15-07-2024 | Soy yo otr...   

                                                text  \
0  |![Picsart_24-07-16_19-56-35-552.jpg](https://...   
1  <hr>\n<hr>\n\n|![maracaibo-96833_1280.jpg](htt...   
2  ![](https://cdn.steemitimages.com/DQmTSMVJEkCq...   
3  며칠전 부터 양쪽 손가락이 아프고 살짝 붓는 느낌이 있어 마사지를 받고나니 좀 부드...   
4  <div class="text-justify">\n\n\n\n### <center>...   

                                   concatenated_text  
0  SEC-S19 / W2 | Terminología culinaria y uso ad...  
1  Cuando voy a Maracaibo...... Que molleja!!. Me...  
2  빅워크에서 걸음 적립 중 . ![](https://cdn.steemitimages....  
3  손바닥 지압 . 며칠전 부


