In [None]:
import pandas as pd
import os
import numpy as np
import pyarrow.parquet as pq

# Paths
input_folder = "/replace_with_folder_path"
output_folder = "Merged_Parquet"
output_file = os.path.join(output_folder, "merged_sample.parquet")
os.makedirs(output_folder, exist_ok=True)  # Ensure output folder exists

# Initialize list to store sampled data
all_samples = []

# Step 1: Sample 25% from each file
for file in os.listdir(input_folder):
    if file.endswith(".parquet"):
        file_path = os.path.join(input_folder, file)
        
        # Read Parquet file
        df = pd.read_parquet(file_path)
        
        # Randomly sample 25% of rows
        df_sampled = df.sample(frac=0.25, random_state=42)
        
        all_samples.append(df_sampled)

# Step 2: Merge all sampled data
merged_df = pd.concat(all_samples, ignore_index=True)

# Step 3: Reduce dataset if it exceeds 50MB
max_size = 50 * 1024 * 1024  # 50MB in bytes
temp_output = os.path.join(output_folder, "temp.parquet")

# Save temp file to check size
merged_df.to_parquet(temp_output, compression="zstd", index=False)
file_size = os.path.getsize(temp_output)

# If file is larger than 50MB, downsample further
if file_size > max_size:
    print(f"‚ö†Ô∏è File too large ({file_size / (1024 * 1024):.2f}MB), reducing size...")
    
    # Calculate necessary fraction to stay under 50MB
    fraction = max_size / file_size
    merged_df = merged_df.sample(frac=fraction, random_state=42)  # Reduce further

# Step 4: Sort by timestamp (if needed)
if "timestamp" in merged_df.columns:
    merged_df = merged_df.sort_values(by="timestamp").reset_index(drop=True)

# Step 5: Save final Parquet file
merged_df.to_parquet(output_file, compression="zstd", index=False)

# Remove temp file
os.remove(temp_output)

print(f"‚úÖ Final dataset saved: {output_file} ({os.path.getsize(output_file) / (1024 * 1024):.2f}MB, {len(merged_df)} rows)")


‚ö†Ô∏è File too large (572.59MB), reducing size...
‚úÖ Final dataset saved: Merged_Parquet/merged_sample.parquet (49.97MB, 21538 rows)


In [2]:
import os
import pandas as pd
import pyarrow.parquet as pq

In [None]:

parquet_file = ('/replace_with _path_to_file/merged_sample.parquet')
    # Load Parquet file metadata
table = pq.read_table(parquet_file)
print("Schema:", table.schema)
print("Number of Rows:", table.num_rows)

    # Load into pandas and show a shuffled preview
df = pd.read_parquet(parquet_file)
print("\nüîç Sample Data (Shuffled):")
print(df.sample(min(5, len(df)), random_state=42))  # Show up to 10 rows, shuffled


Schema: timestamp: string
title: string
text: string
concatenated_text: string
source_file: string
year_month: string
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 787
Number of Rows: 21538

üîç Sample Data (Shuffled):
      timestamp                                              title  \
2041    2024-04                                         1915Âπ¥4Êúà24Êó•   
5701    2024-05  WEEKLY REPORT - N.194 - Italy Community Update...   
11942   2024-07                                        crystal liu   
2150    2024-04                                        crystal liu   
16778   2024-09  Daily top posts in category: poetry on 2024-09-01   

                                                    text  \
2041   1915Âπ¥4Êúà24Êó•ÔºåËëóÂêçÊº´ÁîªÂÆ∂ÂçéÂêõÊ≠¶Âá∫Áîü„ÄÇ\nÂçéÂêõÊ≠¶Ôºà1915Âπ¥4Êúà24Êó•Ôºç2010Âπ¥6Êúà...   
5701   ![2 (75).jpg](https://cdn.steemitimages.com/DQ...   
11942        https://www.youtube.com/watch?v=MMyK6AW6u8Y   
2150      

In [16]:
df.drop(columns=['title', 'text', 'source_file', 'year_month'], errors='ignore')

Unnamed: 0,timestamp,concatenated_text
0,2024-03,"Diary Game for Monday, 18th March, 2024 | Upl..."
1,2024-03,Upvote . Upvoted. Thank You for sending some o...
2,2024-03,Check my latest fight ! redwarrior vs The Gov...
3,2024-03,The Diary Game (27-03-24) De guardia en C.O üòÅ ...
4,2024-03,Upvote . Upvoted. Thank You for sending some o...
...,...,...
21533,2024-11,Me est√°n esperando . <div class=text-justify> ...
21534,2024-11,"–ê bambuka-—Ç–æ –±—ã–ª –ø—Ä–∞–≤ . @@ -942,16 +942,17 @@\..."
21535,2024-11,Î≥¥Îäî Ïû¨ÎØ∏ . ÏΩîÏù∏ÏãúÏû•Ïù¥ ÌôúÍ∏∞Î•º Î≥¥Ïù¥Í≥† ÏûàÏñ¥ Íµ¨Í≤ΩÌïòÎäî Ïû¨ÎØ∏Í∞Ä ÏûàÎÑ§Ïöî\nÌäπÌûà ÎèÑÏßÄÏΩîÏù∏...
21536,2024-11,Israel Increases Pressure On Syria . ![Israel_...


In [17]:
random_rows = df.sample(5)
for index, row in random_rows.iterrows():
    print(row['concatenated_text'])
    print("------------------------------")

Tony Hawk‚Äôs American Wasteland Save File (PC Game Data) . <center><img decoding="async" src="https://gamedl.download/savepics/tony-hawks-american-wasteland-cover.jpg"/></center><p>If you want to simply download and install the Tony Hawk‚Äôs American Wasteland Save File (PC Game Data) for the game, just click the button bellow, run the exe file and your save data should be automatically loaded. You can know more about the game and the save file information bellow.</p>\n<center><a href="https://gamedl.download/game-save-download"><img src="https://gamedl.download/wp-content/uploads/2024/05/download.png"/></a></center><p></p>\n<ul>\n<li>Developers: Neversoft</li>\n<li>Release Date: October 18, 2005</li>\n<li>Genres: Sports, Skateboarding</li>\n<li>Platforms: PlayStation 2, Xbox, Xbox 360, GameCube, PC</li>\n<li>Publisher: Activision</li>\n</ul>\n<h2>About Tony Hawk‚Äôs American Wasteland</h2>\n<p>Tony Hawk‚Äôs American Wasteland is a skateboarding video game that marks a significant ent

In [1]:
import gc



In [None]:
import fasttext
model_path = '/Replace_with_path_to/cc.en.300.bin'


model = fasttext.load_model('/Replace_with_your_model_path/cc.en.300.bin')

print(model.words[:10])  # Show first 10 words in the vocabulary


[',', 'the', '.', 'and', 'to', 'of', 'a', '</s>', 'in', 'is']


In [3]:
import os
import re

import pandas as pd



In [None]:
# Define input and output folders
input_folder = "/Replace_with_your_path/Merged_Parquet"
output_folder = "Cleaned_Files"
os.makedirs(output_folder, exist_ok=True)


In [5]:


# Function to clean text with FastText
def process_text(text):
    """Removes unwanted characters and keeps only words in FastText vocabulary."""
    if not isinstance(text, str):
        return ""  # Handle non-string values

    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    words = text.split()
    filtered_words = [word for word in words if word in model.words]  # Keep only known words
    return " ".join(filtered_words)

# Process each Parquet file
for parquet_file in os.listdir(input_folder):
    if parquet_file.endswith(".parquet"):
        input_path = os.path.join(input_folder, parquet_file)
        output_path = os.path.join(output_folder, parquet_file)  # Save with same name

        # Read the dataset
        df = pd.read_parquet(input_path, columns=["timestamp", "concatenated_text"])

        # Apply text cleaning
        df["processed_text"] = df["concatenated_text"].apply(process_text)

        # Drop the original column
        df = df[["timestamp", "processed_text"]]

        # Save cleaned data
        df.to_parquet(output_path, index=False)

        # Free memory
        del df
        gc.collect()

        print(f"‚úÖ Processed: {parquet_file}")

print("üéâ FastText cleaning completed!")


‚úÖ Processed: merged_sample.parquet
üéâ FastText cleaning completed!


In [None]:
df1 = pd.read_parquet('/Replace_with_your_path/merged_sample.parquet')
df1.head()

Unnamed: 0,timestamp,processed_text
0,2024-03,Diary Game for Monday th March Uploaded my boo...
1,2024-03,Upvote Upvoted Thank You for sending some of y...
2,2024-03,Check my latest fight vs The Goverment a href src
3,2024-03,The Diary Game De guardia en CO Steemit Y los ...
4,2024-03,Upvote Upvoted Thank You for sending some of y...


In [11]:

rows_count = df1.shape[:1]
rows_count

(21538,)

In [13]:
random_rows = df1.sample(5)
for index, row in random_rows.iterrows():
    print(row['processed_text'])
    print("------------------------------")

Descargar Farming Simulator PC Gratis Directo y Torrent Farming Simulator PC juego gratis en espaol directamente o a de Torrent La del juego est para una descarga e y que los tiempos de ms posibles Puedes acceder a Farming Simulator a de los enlaces directos o torrent que te a todos ellos seguros libres de cracks y sin tipo de virus Slo tienes que pulsar el botn de descarga el archivo de y seguir las instrucciones que se postid src de agricultura de de noviembre de href de descarga src del Simulator te permite en un moderno construir y tu granja en tres lugares diferentes de Amrica y Europa El juego tiene muchas actividades como cultivar cuidar animales y talar Ahora puedes experimentar diferentes estaciones en el ms de y herramientas de ms de marcas reales como John Deere CLAAS y New Holland Puedes cultivar como trigo maz patatas y Nuevos tipos de y hacen que el juego sea an ms divertido e puedes jugar con amigos en el modo Hay muchas por la comunidad que puedes agregar al juego Farmi

In [45]:
import textwrap
text = df1.loc[21537, 'processed_text']
wrapped_text = textwrap.fill(text, width=80)
print(wrapped_text)

My Different Baby pa href img src Different Baby owned a block of cheese that
they would sometimes embrace This seems a fairly peculiar undertaking to many
but not to the Baby who had come to the conclusion that the idea was miraculous
Bizarrely a block of cheese having that done to Different Baby desperately
searched for a Harry Potter wand that they liked to hug This might appear to be
a fairly exciting activity to many but not to the Baby who expected that this
idea is wonderful Who would have imagined a Harry Potter wand being the thing
opted Different Baby had a pen to hug It is considered to be a surprising
exercise to everybody you know but not to the Baby who felt that the idea was
the most awesome idea You wouldnt have imagined a pen being the chosen Different
Baby always had with them a pen that they sometimes cuddle It was considered to
be an original operation to my children but not to the Baby who had decided that
the idea was miraculous A pen having that done to Different

In [44]:
import textwrap
text = df.loc[21537, 'concatenated_text']
wrapped_text = textwrap.fill(text, width=80)
print(wrapped_text)

My Different Baby . <p><a href=https://cdn.pixabay.com/photo/2021/04/27/16/12/su
cculents-6211878_960_720.png ><img src=https://cdn.pixabay.com/photo/2021/04/27/
16/12/succulents-6211878_960_720.png /></a></p><p>My Different Baby owned a
block of cheese that they would sometimes embrace. This seems a fairly peculiar
undertaking, to many, but not to the  Baby, who  had come to the conclusion that
the idea was miraculous. Bizarrely, a block of cheese having that done to
it</p><p>My Different Baby desperately searched for a Harry Potter wand that
they liked to hug. This might appear to be a fairly exciting activity, to many,
but not to the  Baby, who  expected that this idea is wonderful. Who would have
imagined, a Harry Potter wand being the thing opted for.</p><p>My Different Baby
had a pen to hug. It is considered to be a surprising exercise, to everybody you
know, but not to the  Baby, who  felt that the idea was the most awesome idea.
You wouldnt have imagined, a pen being the chosen

5. ### SENTIMENT ANALYSIS AND ATTACH SENTIMENT SCORES AND SENTIMENT CATEGORIES

The goal in this stage is to have topics that can have some meaning as the predicted topics will be evaluated against the sentiment scores 
abd lebels. The topics are also to be defined and compared based on the sentiment characteristics.

In [46]:
import os
import gc
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader

In [47]:
# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
torch.manual_seed(44)

Using device: cpu


<torch._C.Generator at 0x1340b1830>

In [48]:
# Load sentiment analysis model and tokenizer
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = BertForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:



# Define input/output directories
input_folder = "/Users/Replace_with_your_path/Cleaned_Files"
output_folder = "Bert_Sentiment_Results"
os.makedirs(output_folder, exist_ok=True)

In [50]:
# Custom Dataset Class
class SentimentDataset(Dataset):
    """Dataset class for sentiment analysis."""
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['processed_text']
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

# Function to analyze sentiment
def analyze_sentiment_batch(batch):
    """Analyze sentiment for a batch of text inputs."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
    return predictions

In [51]:

# Function to process each cleaned Parquet file
def process_parquet_file(parquet_path):
    output_path = os.path.join(output_folder, os.path.basename(parquet_path).replace(".parquet", "_sentiment.csv"))
    
    df = pd.read_parquet(parquet_path)

    chunk_size = 1000  # Adjust based on memory capacity
    first_chunk = True

    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start+chunk_size]

        dataset = SentimentDataset(chunk, tokenizer)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=lambda x: {
            'input_ids': torch.stack([item['input_ids'] for item in x]),
            'attention_mask': torch.stack([item['attention_mask'] for item in x])
        })

        predictions = []
        for batch in tqdm(dataloader, desc="Processing batch"):
            batch_predictions = analyze_sentiment_batch(batch)
            predictions.extend(batch_predictions)

        chunk.loc[:, 'sentiment_score'] = predictions  # Use .loc to avoid the warning
        chunk.loc[:, 'sentiment_category'] = chunk['sentiment_score'].apply(lambda x: {
    0: 'negative',
    1: 'negative',
    2: 'neutral',
    3: 'positive',
    4: 'positive'
}.get(x))
        
        if first_chunk:
            chunk.to_csv(output_path, mode='w', index=False)
            first_chunk = False
        else:
            chunk.to_csv(output_path, mode='a', index=False, header=False)

        del chunk
        gc.collect()

    print(f"Processed {parquet_path} ‚Üí Saved to {output_path}")

# Process all cleaned text files
for parquet_file in os.listdir(input_folder):
    if parquet_file.endswith(".parquet"):
        process_parquet_file(os.path.join(input_folder, parquet_file))

print("‚úÖ Sentiment analysis completed.")


Processing batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [25:16<00:00, 24.07s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.loc[:, 'sentiment_category'] = chunk['sentiment_score'].apply(lambda x: {
Processing batch: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [25:32<00:00, 24.32s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

Processed /Users/samsonbobo/Desktop/Research Topic/Thesis/Cleaned_Files/merged_sample.parquet ‚Üí Saved to Bert_Sentiment_Results/merged_sample_sentiment.csv
‚úÖ Sentiment analysis completed.


In [None]:
df2 = pd.read_csv('/Users/Replace_with_your_path/merged_sample_sentiment.csv')
df2.head()

Unnamed: 0,timestamp,processed_text,sentiment_score,sentiment_category
0,2024-03,Diary Game for Monday th March Uploaded my boo...,4,positive
1,2024-03,Upvote Upvoted Thank You for sending some of y...,0,negative
2,2024-03,Check my latest fight vs The Goverment a href src,4,positive
3,2024-03,The Diary Game De guardia en CO Steemit Y los ...,4,positive
4,2024-03,Upvote Upvoted Thank You for sending some of y...,0,negative
