In [26]:
import re
import html
import pandas as pd

# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

In [27]:
# Load the labeled posts; the 'id' column is used as the DataFrame index.
data = pd.read_csv(
    '../data/labeled_post.csv',
    index_col='id',
)

In [28]:
# Drop the listed columns from `data` in place.
data.drop(
    columns=[
        'url',
        'created_datetime',
        'query_category',
        'time_to_mvp',
        'business_model',
        'has_been_labeled',
    ],
    inplace=True,
)

In [29]:
def check_and_convert_text(text):
    """Coerce an arbitrary value to a plain string.

    Args:
        text: Any value, typically a single DataFrame cell.

    Returns:
        ``text`` unchanged when it is already a ``str``; ``""`` when it is a
        missing scalar (``None``, ``NaN``, ``NaT``, ...); otherwise
        ``str(text)``.
    """
    if isinstance(text, str):
        return text
    # pd.isna() works element-wise on list/array input and returns an array,
    # whose truth value is ambiguous (ValueError for more than one element).
    # Only ask "is this missing?" for scalars; containers fall through to str().
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    return str(text)


def clean_html_entities(text):
    """Decode HTML character references (e.g. ``&amp;`` becomes ``&``)."""
    normalized = check_and_convert_text(text)
    decoded = html.unescape(normalized)
    return decoded


def clean_escaped_chars(text):
    """Strip literal backslash escape sequences left over in the text.

    Literal two-character sequences such as backslash + ``n`` become a space;
    escaped markdown/quote characters and doubled backslashes are deleted.
    """
    cleaned = check_and_convert_text(text)
    # Applied strictly in this order: each replacement sees the output of the
    # previous one.
    substitutions = (
        ('\\n', ' '),
        ('\\r', ' '),
        ('\\t', ' '),
        ('\\*', ''),
        ('\\\\', ''),
        ('\\"', ''),
        ("\\'", ''),
    )
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned


def remove_urls(text):
    """Remove URLs, both scheme/www-prefixed ones and bare domains.

    Args:
        text: Value to clean; coerced to ``str`` via ``check_and_convert_text``.

    Returns:
        The text with every URL-like token deleted.
    """
    text = check_and_convert_text(text)
    # URLs that start with http://, https:// or www.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Bare domains such as "example.com/path". The \b after the TLD is what
    # keeps ordinary tokens like "idea.Consider" or "app.component" from being
    # deleted just because ".co"/".com" is a prefix of the following word.
    text = re.sub(
        r'\S+\.(?:com|org|net|io|edu|gov|co)\b\S*',
        '',
        text,
        flags=re.IGNORECASE,
    )
    return text


def remove_reddit_elements(text):
    """Replace subreddit (``r/name``) and user (``u/name``) references with a space."""
    stripped = check_and_convert_text(text)
    # Two separate passes, subreddits first and then usernames; a reference is
    # only matched at the start of the string or right after whitespace.
    for reference_pattern in (r'(^|\s)/?r/\w+', r'(^|\s)/?u/\w+'):
        stripped = re.sub(reference_pattern, ' ', stripped)
    return stripped


def remove_markdown(text):
    """Strip markdown syntax (links, headers, emphasis) while keeping the visible text."""
    result = check_and_convert_text(text)
    # (pattern, replacement, flags), applied top to bottom.
    rules = (
        (r'\[([^\]]+)\]\([^)]+\)', r'\1', 0),   # [label](target) -> label
        (r'^#+\s+', '', re.MULTILINE),          # leading "# " header markers
        (r'\*\*([^*]+)\*\*', r'\1', 0),         # **bold**
        (r'\*([^*]+)\*', r'\1', 0),             # *italic*
        (r'~~([^~]+)~~', r'\1', 0),             # ~~strikethrough~~
        (r'__([^_]+)__', r'\1', 0),             # __bold__
        (r'_([^_]+)_', r'\1', 0),               # _italic_
    )
    for pattern, replacement, flags in rules:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result


def remove_quotes_and_code(text):
    """Remove quote lines, fenced code blocks and inline code spans.

    Args:
        text: Value to clean; coerced to ``str`` via ``check_and_convert_text``.

    Returns:
        The text with ``>``-prefixed lines emptied and all backtick-delimited
        code (fenced and inline) deleted.
    """
    text = check_and_convert_text(text)
    # Blank out quote lines (lines starting with ">").
    text = re.sub(r'^>.*$', '', text, flags=re.MULTILINE)
    # Fenced blocks must go first: the inline pattern below would otherwise
    # pair up the fence backticks itself, and a fenced block that contains
    # inline backticks would leak part of its content into the output.
    text = re.sub(r'```[\s\S]*?```', '', text)
    # Remaining single-backtick inline code spans.
    text = re.sub(r'`[^`]*`', '', text)
    return text


def clean_special_elements(text):
    """Normalize list markers, drop the ``#`` of hashtags and remove ``:shortcode:`` markers."""
    result = check_and_convert_text(text)
    # (pattern, replacement, flags), applied top to bottom.
    rules = (
        # "*", "+" or "-" list markers at the start of a line
        (r'^\s*[\*\+\-]\s+', '• ', re.MULTILINE),
        # "1." style numbered-list markers at the start of a line
        (r'^\s*\d+\.\s+', '• ', re.MULTILINE),
        # "#tag" -> "tag"
        (r'#(\w+)', r'\1', 0),
        # ":smile:"-style shortcodes (lowercase letters and underscores only)
        (r':[a-z_]+:', '', 0),
    )
    for pattern, replacement, flags in rules:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result


def clean_special_chars(text):
    """Blank out characters outside a small whitelist and collapse punctuation runs."""
    raw = check_and_convert_text(text)
    # Anything that is not a word character, whitespace, or one of . , ! ? - '
    # is replaced by a space.
    filtered = re.sub(r'[^\w\s\.\,\!\?\-\']', ' ', raw)
    # A run of two or more punctuation marks is reduced to a single mark; the
    # group holds the last character of the run, so that is the one kept.
    collapsed = re.sub(r'([.!?,;]){2,}', r'\1', filtered)
    return collapsed


def convert_to_lowercase(text):
    """Return the lowercased text, as expected by uncased models."""
    return check_and_convert_text(text).lower()


def normalize_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', check_and_convert_text(text))
    return collapsed.strip()


def truncate_for_bert(text, max_words=450):
    """Keep at most ``max_words`` whitespace-separated words.

    The word count is used as an approximation of BERT's token limit. Text
    that is already within the limit is returned as-is; longer text is
    rebuilt from its first ``max_words`` words joined by single spaces.
    """
    source = check_and_convert_text(text)
    tokens = source.split()
    if len(tokens) <= max_words:
        return source
    return ' '.join(tokens[:max_words])


def clean_text_for_bert_uncased(text):
    """Run the complete cleaning pipeline on one piece of text.

    The value is first coerced to ``str`` and then passed through each
    cleaning step in a fixed order; the output of one step is the input of
    the next.
    """
    # Order is significant: later steps operate on what earlier ones left.
    pipeline = (
        clean_html_entities,
        clean_escaped_chars,
        remove_urls,
        remove_reddit_elements,
        remove_markdown,
        remove_quotes_and_code,
        clean_special_elements,
        clean_special_chars,
        convert_to_lowercase,
        normalize_whitespace,
        truncate_for_bert,
    )

    cleaned = check_and_convert_text(text)
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned

In [30]:
# Print a summary of the frame: columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498 entries, 0 to 497
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  498 non-null    object 
 1   body                   498 non-null    object 
 2   subreddit              498 non-null    object 
 3   score                  498 non-null    int64  
 4   num_comments           498 non-null    int64  
 5   upvote_ratio           498 non-null    float64
 6   created_utc            498 non-null    float64
 7   author                 498 non-null    object 
 8   is_original_content    498 non-null    bool   
 9   edited                 498 non-null    bool   
 10  total_awards_received  498 non-null    int64  
 11  gilded                 498 non-null    int64  
 12  search_query           498 non-null    object 
 13  title_length           498 non-null    int64  
 14  body_length            498 non-null    int64  
 15  title_word_

In [33]:
# Drop the author column in place.
data.drop('author', axis=1, inplace=True)
# Encode market_viability as 1/0; any value not listed in the mapping becomes NaN.
data['market_viability'] = data['market_viability'].map({'Viable': 1, 'Not Viable': 0})
# Encode weekday names as consecutive integers, Monday=0 ... Sunday=6 (the same
# convention pandas uses for dayofweek). Sunday was previously mapped to 7,
# which left a gap at 6.
data['post_day'] = data['post_day'].map(
    {
        'monday': 0,
        'tuesday': 1,
        'wednesday': 2,
        'thursday': 3,
        'friday': 4,
        'saturday': 5,
        'sunday': 6,
    }
)

In [34]:
# Count literal question marks in the title and body. str.count() takes a
# regex, so "?" has to be escaped; the raw string keeps the backslash from
# being parsed as an (invalid) string escape sequence.
data['title_num_of_question_marks'] = data['title'].str.count(r'\?')
data['body_num_of_question_marks'] = data['body'].str.count(r'\?')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498 entries, 0 to 497
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   title                        498 non-null    object 
 1   body                         498 non-null    object 
 2   subreddit                    498 non-null    object 
 3   score                        498 non-null    int64  
 4   num_comments                 498 non-null    int64  
 5   upvote_ratio                 498 non-null    float64
 6   created_utc                  498 non-null    float64
 7   is_original_content          498 non-null    bool   
 8   edited                       498 non-null    bool   
 9   total_awards_received        498 non-null    int64  
 10  gilded                       498 non-null    int64  
 11  search_query                 498 non-null    object 
 12  title_length                 498 non-null    int64  
 13  body_length              

In [35]:
# Write the processed frame to disk; the index is written as the first column
# (to_csv's default).
data.to_csv('../data/labeled_post_cleaned.csv')