In [40]:
import html
from pathlib import Path

import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

In [41]:
# File that auto_save() rewrites after every labeling click.
LABELED_PATH = Path('../data/labeled_post.csv')
SAMPLE_SIZE = 3000

raw_data = pd.read_csv('../data/raw_data.csv')

if LABELED_PATH.exists():
    # Resume the previous session. Re-sampling here would reset every label to
    # 0, and the first click afterwards would overwrite the saved file with
    # that blank state. Delete the file to start labeling from scratch.
    # NOTE(review): the saved file is written with index=False, so the resumed
    # frame has a fresh RangeIndex rather than the raw_data row labels.
    data = pd.read_csv(LABELED_PATH)
else:
    # Fixed seed keeps the sample identical across runs; cap n so a raw file
    # with fewer rows than SAMPLE_SIZE does not raise.
    data = raw_data.sample(n=min(SAMPLE_SIZE, len(raw_data)), random_state=42)
    data['has_been_labeled'] = 0
    data['market_viability'] = 0
    data = data.drop(columns=['link_flair_text'])
    # Posts without a body cannot be displayed for labeling.
    data = data.dropna(subset=['body'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2953 entries, 4871 to 3647
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  2953 non-null   object 
 1   body                   2953 non-null   object 
 2   subreddit              2953 non-null   object 
 3   url                    2953 non-null   object 
 4   score                  2953 non-null   int64  
 5   num_comments           2953 non-null   int64  
 6   upvote_ratio           2953 non-null   float64
 7   created_utc            2953 non-null   float64
 8   author                 2953 non-null   object 
 9   is_original_content    2953 non-null   bool   
 10  edited                 2953 non-null   bool   
 11  total_awards_received  2953 non-null   int64  
 12  gilded                 2953 non-null   int64  
 13  search_query           2953 non-null   object 
 14  has_been_labeled       2953 non-null   int64  
 15  market

In [44]:

# HTML panes that the callbacks below write into: one for the post text,
# one for the progress summary.
post_output, status_output = widgets.HTML(), widgets.HTML()

# Position of the displayed post within the unlabeled subset (not a label of
# the DataFrame index).
current_index = 0

def create_post_html(post, max_chars=1500):
    """Build the HTML snippet shown for a single post.

    Args:
        post: Mapping-like row exposing ``'title'`` and ``'body'``.
        max_chars: Maximum number of body characters to show; longer bodies
            are cut and followed by ``...``.

    Returns:
        An HTML string with the post's position among the unlabeled posts,
        its title and its (possibly truncated) body.
    """
    # Coerce once so slicing and the length check agree even for non-str values.
    body = str(post['body'])
    title = str(post['title'])
    ellipsis = "..." if len(body) > max_chars else ""

    # Title and body are scraped user content: escape them so any markup in a
    # post is shown as text instead of being interpreted by the HTML widget.
    safe_title = html.escape(title)
    safe_body = html.escape(body[:max_chars])

    return f"""
    <div style='margin-bottom: 20px'>
        <h3>Post {current_index + 1} of {len(filtered_data())}</h3>
        <h4>Title:</h4> {safe_title}
        <h4>Body:</h4> {safe_body}{ellipsis}
    </div>
    """

def filtered_data():
    """Return the rows of ``data`` whose ``has_been_labeled`` flag is not 1."""
    awaiting_label = data['has_been_labeled'] != 1
    return data.loc[awaiting_label]

def update_status():
    """Write labeled / remaining / viable counts from ``data`` to the status pane."""
    is_labeled = data['has_been_labeled'] == 1
    n_labeled = is_labeled.sum()

    # The viability column is optional here; report zero when it is absent.
    if 'market_viability' in data.columns:
        n_viable = (data['market_viability'] == 1).sum()
    else:
        n_viable = 0

    n_remaining = len(data) - n_labeled
    status_output.value = f"""
    <p>Progress: {n_labeled} labeled, {n_remaining} remaining
       <br>Viable: {n_viable} identified
    """

def show_post():
    """Render the post at ``current_index`` of the unlabeled subset, then refresh the status pane.

    When no unlabeled posts exist, a "No posts" placeholder is shown instead.
    """
    if len(filtered_data()) == 0:
        post_output.value = "<h3>No posts</h3>"
    else:
        # Positional lookup: current_index counts within the unlabeled rows.
        selected_post = filtered_data().iloc[current_index]
        post_output.value = create_post_html(selected_post)
    update_status()

def on_next_clicked(b):
    """Store the selected label for the displayed post, save, and show the next one.

    The chosen ``market_viability`` value is written to the displayed row,
    the row is flagged as labeled, and the data is saved via ``auto_save()``.

    Args:
        b: The clicked button (passed by ipywidgets; unused).
    """
    global current_index

    remaining = filtered_data()
    if len(remaining) == 0:
        # Nothing left to label; indexing the empty view would raise IndexError.
        post_output.value = "<h3>No more unlabeled posts!</h3>"
        update_status()
        return

    # Keep the position inside the current unlabeled view.
    position = max(0, min(current_index, len(remaining) - 1))
    current_post_idx = remaining.index[position]
    data.at[current_post_idx, 'market_viability'] = market_viability.value
    data.at[current_post_idx, 'has_been_labeled'] = 1

    auto_save()

    remaining = filtered_data()
    if len(remaining) == 0:
        current_index = 0
        # Do not call show_post() here: it would replace this message with
        # its own "No posts" placeholder.
        post_output.value = "<h3>No more unlabeled posts!</h3>"
        return

    # The row just labeled has dropped out of the unlabeled view, so the
    # following post now sits at the same position. Advancing the index
    # would skip it; only clamp in case the last row was labeled.
    current_index = min(position, len(remaining) - 1)
    show_post()

def on_prev_clicked(b):
    """Step back one position in the unlabeled subset and redraw.

    Does nothing when the first position is already displayed.

    Args:
        b: The clicked button (passed by ipywidgets; unused).
    """
    global current_index
    if current_index <= 0:
        return
    current_index -= 1
    show_post()

def on_label_change(change):
    """Placeholder observer for the label dropdown; intentionally does nothing.

    Args:
        change: Change notification passed by the widget's ``observe`` hook (unused).
    """
    return None

def auto_save():
    """Write the whole ``data`` frame to the labeled-posts CSV, then refresh the status pane.

    The DataFrame index is not written to the file.
    """
    output_path = '../data/labeled_post.csv'
    data.to_csv(output_path, index=False)
    update_status()

# Create widgets
prev_button = widgets.Button(
    description='Previous',
    button_style='info',
    layout=widgets.Layout(width='100px')
)
next_button = widgets.Button(
    description='Next',
    button_style='info',
    layout=widgets.Layout(width='100px')
)

# Label selector: its current value is what the Next handler stores for the
# displayed post.
market_viability = widgets.Dropdown(
    options=[0, 1],
    description='Market Viability:',
    # The default description width is fixed and cuts this long label off;
    # 'initial' lets the label take the width it needs.
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='300px')
)


# Register callbacks
prev_button.on_click(on_prev_clicked)
next_button.on_click(on_next_clicked)
# on_label_change is currently a no-op; kept as a hook for reacting to selections.
market_viability.observe(on_label_change, names='value')

# Create the layout: post text, progress summary, label selector, navigation.
navigation = widgets.HBox([prev_button, next_button])
controls = widgets.VBox([market_viability])
layout = widgets.VBox([post_output, status_output, controls, navigation])

# Initial display
show_post()
display(layout)

VBox(children=(HTML(value="\n    <div style='margin-bottom: 20px'>\n        <h3>Post 1 of 2953</h3>\n        <…