In [67]:
#STEP 1. Load and confirm if files exist and are being loaded properly
import os

directory = 'Batch_one'
files = [f for f in os.listdir(directory) if f.endswith('.csv')]
print(f"Found {len(files)} CSV files: {files[:5]}")  # Print first 5 files


Found 263 CSV files: ['filtered_steemit_2024-07-16.csv', 'filtered_steemit_2024-08-13.csv', 'filtered_steemit_2024-03-20.csv', 'filtered_steemit_2024-05-23.csv', 'filtered_steemit_2024-03-14.csv']


In [52]:
# STEP 2> Load the lexicon
def load_lexicon(lexicon_path):
    with open(lexicon_path, 'r') as f:
        lexicon = set(line.strip().split('\t')[0].lower() for line in f.readlines())  # Keep only words

    print(f"\n📖 Cleaned Lexicon Sample: {list(lexicon)[:10]}")  # Debug output
    return lexicon

lexicon_path = 'lexicon.txt'
lexicon = load_lexicon(lexicon_path)

print(f"Loaded {len(lexicon)} words from lexicon. Sample: {list(lexicon)[:10]}")  # Show 10 words



📖 Cleaned Lexicon Sample: ['lga', 'concomitants', 'ellas', 'remuneration', 'colds', 'cheke', 'platitudes', 'ehrhardt', 'eph', 'mackensen']
Loaded 97565 words from lexicon. Sample: ['lga', 'concomitants', 'ellas', 'remuneration', 'colds', 'cheke', 'platitudes', 'ehrhardt', 'eph', 'mackensen']


In [53]:
#STEP 3. read and display a sample CSV file
import pandas as pd

sample_file = os.path.join(directory, files[0])  # Pick the first CSV
df = pd.read_csv(sample_file)

print(df.head())  # Display first few rows
print(df.columns)  # Show column names


  timestamp                                              title  \
0   2024-07  SEC-S19 / W2 | Terminología culinaria y uso ad...   
1   2024-07  Cuando voy a Maracaibo...... Que molleja!!. Me...   
2   2024-07                                      빅워크에서 걸음 적립 중   
3   2024-07                                             손바닥 지압   
4   2024-07  The Diary Game | Lunes 15-07-2024 | Soy yo otr...   

                                                text  \
0  |![Picsart_24-07-16_19-56-35-552.jpg](https://...   
1  <hr>\n<hr>\n\n|![maracaibo-96833_1280.jpg](htt...   
2  ![](https://cdn.steemitimages.com/DQmTSMVJEkCq...   
3  며칠전 부터 양쪽 손가락이 아프고 살짝 붓는 느낌이 있어 마사지를 받고나니 좀 부드...   
4  <div class="text-justify">\n\n\n\n### <center>...   

                                   concatenated_text  
0  SEC-S19 / W2 | Terminología culinaria y uso ad...  
1  Cuando voy a Maracaibo...... Que molleja!!. Me...  
2  빅워크에서 걸음 적립 중 . ![](https://cdn.steemitimages....  
3  손바닥 지압 . 며칠전 부터 양쪽 손가락이 아프고 살짝 붓는 느낌이 있어 마사

In [54]:
import re

def clean_text(text):
    """Removes URLs, non-alphabetic characters, and extra spaces."""
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# 🔹 Test Step 1
sample_text = "The quick brown fox jumps over the lazy dog. Visit www.example.com for details!"
cleaned_text = clean_text(sample_text)
print(f"✅ Cleaned Text: {cleaned_text}")


✅ Cleaned Text: The quick brown fox jumps over the lazy dog Visit for details


In [55]:
import spacy

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

def apply_pos_tagging(text):
    """Applies POS tagging using spaCy and returns tokens with POS tags."""
    doc = nlp(text)
    return [(token.lemma_.lower(), token.pos_) for token in doc]

# 🔹 Test Step 2
tokens_with_pos = apply_pos_tagging(cleaned_text)
print(f"📝 Tokens with POS: {tokens_with_pos}")




📝 Tokens with POS: [('the', 'DET'), ('quick', 'ADJ'), ('brown', 'ADJ'), ('fox', 'NOUN'), ('jump', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN'), ('visit', 'NOUN'), ('for', 'ADP'), ('detail', 'NOUN')]


In [56]:
def filter_with_lexicon(tokens, lexicon):
    """Filters tokens based on the lexicon and POS constraints."""
    valid_words = [
        word for word, pos in tokens
        if word in lexicon and pos in ['NOUN', 'VERB', 'ADJ', 'ADV']
    ]
    return valid_words

# 🔹 Example Lexicon
lexicon = {"quick", "brown", "fox", "jumps", "lazy", "dog"}  # Sample words

# 🔹 Test Step 3
filtered_words = filter_with_lexicon(tokens_with_pos, lexicon)
print(f"✅ Filtered Words: {filtered_words}")


✅ Filtered Words: ['quick', 'brown', 'fox', 'lazy', 'dog']


In [57]:
def process_text_pipeline(text, lexicon):
    """Full pipeline: Clean text -> POS tagging -> Filter with lexicon."""
    text = clean_text(text)
    print(f"\n🔹 Cleaned Text: {text}")  # Debug output

    tokens_with_pos = apply_pos_tagging(text)
    print(f"\n📝 Tokens with POS: {tokens_with_pos}")  # Debug output

    filtered_words = filter_with_lexicon(tokens_with_pos, lexicon)
    print(f"\n✅ Filtered Words: {filtered_words}")  # Debug output

    return filtered_words

# 🔹 Test Full Pipeline
process_text_pipeline(sample_text, lexicon)



🔹 Cleaned Text: The quick brown fox jumps over the lazy dog Visit for details

📝 Tokens with POS: [('the', 'DET'), ('quick', 'ADJ'), ('brown', 'ADJ'), ('fox', 'NOUN'), ('jump', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN'), ('visit', 'NOUN'), ('for', 'ADP'), ('detail', 'NOUN')]

✅ Filtered Words: ['quick', 'brown', 'fox', 'lazy', 'dog']


['quick', 'brown', 'fox', 'lazy', 'dog']

In [59]:
import pandas as pd
from collections import defaultdict

def process_chunk(file, lexicon):
    """Processes a CSV file: Clean text -> POS tagging -> Lexicon filtering -> Word mapping."""
    df = pd.read_csv(file)
    
    word_freq = defaultdict(int)  # Store word frequencies
    word_map = []  # Store word-to-timestamp mapping
    
    for idx, row in df.iterrows():
        # Extract text and timestamp
        timestamp = row['timestamp']
        text = row['concatenated_text']

        # Process the text through the updated pipeline
        filtered_words = process_text_pipeline(text, lexicon)  # Uses our verified function
        
        # Debugging output
        print(f"\n📌 Processed row {idx+1} in {file}: {filtered_words}")

        # Update word frequency & mapping
        for word in filtered_words:
            word_freq[word] += 1
            word_map.append((word, timestamp, row['title']))

    return word_freq, word_map


In [None]:
sample_file = "Batch_one/filtered_steemit_2024-03-11.csv"  # Change this to an actual CSV in your folder
word_freq, word_map = process_chunk(sample_file, lexicon)

print("\n✅ Word Frequencies:", word_freq)
print("\n✅ Word Map Sample:", word_map[:5])  # Print first 5 mappings



🔹 Cleaned Text: melakukan gotong royong bersama membersihkan tempat ibadah mesjid stemmanianselamat pagi semoga kita semua dalam keadaan baik sajandilancarkan segala aktivitas yang kita kerjakan nnpada kesempatan kali ini saya ingin memposting kegaiatan rutin kami di saat menjelang bulan ramadhan yaitu melakukan gotong royong bersama membersihkan mesjid nnsudah menjadi kebiasaa warga desa kami sebelum bulan suci ramadhan tiba kami akan membersihkan tempat ibadah kami agar nyaman di saat kami melakukan ibadah di bulan ramdhan nnkami membersihakan lantai lantai mesjid membersihkan halman mesjid dan juga membersihkan karpet karpet mesjid nncukup demikian postingan saya pada hari yang berbahagia ininsemoga hari hari kita menyenangkan salam stemmaniaa ncobrannWhereIn Android

📝 Tokens with POS: [('melakukan', 'NOUN'), ('gotong', 'NOUN'), ('royong', 'NOUN'), ('bersama', 'NOUN'), ('membersihkan', 'NOUN'), ('tempat', 'VERB'), ('ibadah', 'NOUN'), ('mesjid', 'NOUN'), ('stemmanianselamat', 'NOUN

In [66]:
# Convert lexicon to lowercase for consistency
lexicon = {word.lower() for word in lexicon}

# Process and clean the text, then convert to lowercase
processed_text = clean_text(text).lower()

# Split into words and filter only those in the lexicon
words_in_text = set(processed_text.split())
filtered_words = words_in_text & lexicon  # Intersection of words in text and lexicon

# Print results
print("\nTotal Words in Text:", len(words_in_text))
print("\nTotal Filtered Words (Matching Lexicon):", len(filtered_words))
print("\nFiltered Words:\n", filtered_words)



Total Words in Text: 170

Total Filtered Words (Matching Lexicon): 0

Filtered Words:
 set()
