# Data collection activities
All data collection activities are automated through user-defined functions, which are available in the `scripts` folder.

In [11]:
import re
import time
from bs4 import BeautifulSoup
import requests as requests
from scripts import item 
from scripts import discussion 
from scripts import link
from scripts import comment
from scripts import user
from tqdm import tqdm
import sqlite3
import pandas as pd

### Fix wrongly retrieved rows
Some posts and comments were retrieved incorrectly: they are now missing the author and the amount of sats stacked.
To fix the error, a new scraping session is needed.
The goal is:
- Collect the wrongfully scraped items
- Scrape them again with the proper setup and correct html tags

#### Comments

In [2]:
# Spot the wrongfully retrieved rows for the Comments table.
# A row counts as wrong when Author holds the literal text 'None' and the
# Sats column holds text starting with '@'.
wrong_comments = """
SELECT *
FROM comments
WHERE Author=='None' AND Sats LIKE '@%';
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
try:
    # read_sql already returns a DataFrame, so no extra wrapping is needed
    retrieve_wrong_comments = pd.read_sql(wrong_comments, conn)
finally:
    # Release the database handle even when the query fails
    conn.close()

In [3]:
# Preview the item codes of the comments selected by the query above
retrieve_wrong_comments['ItemCode']

0           28
1           37
2           64
3         1312
4         1708
         ...  
1695    269426
1696    269543
1697    269544
1698    269546
1699    269547
Name: ItemCode, Length: 1700, dtype: object

#### Posts

In [4]:
# Spot the wrongfully retrieved rows for the Post table.
# A row counts as wrong when Author holds the literal text 'None' and the
# Sats column holds text starting with '@'.
wrong_posts = """
SELECT *
FROM post
WHERE Author=='None' AND Sats LIKE '@%';
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
try:
    # read_sql already returns a DataFrame, so no extra wrapping is needed
    retrieve_wrong_posts = pd.read_sql(wrong_posts, conn)
finally:
    # Release the database handle even when the query fails
    conn.close()

In [5]:
# Preview the item codes of the posts selected by the query above
retrieve_wrong_posts['ItemCode']

0           31
1           34
2           35
3           36
4           92
         ...  
1126    268075
1127    268337
1128    268523
1129    268678
1130    269031
Name: ItemCode, Length: 1131, dtype: object

#### General list of items that must be scraped again

In [6]:
# Item codes to scrape again: every wrong post first, then every wrong comment
retrieve = [
    *retrieve_wrong_posts['ItemCode'],
    *retrieve_wrong_comments['ItemCode'],
]

In [7]:
# Queries for entry insertion in tables
#
# Parameterized insert for one row of `comments`: the eight `?` placeholders
# are bound positionally, in the column order listed in the statement.
# `INSERT OR IGNORE` makes SQLite skip a row that would violate a constraint
# instead of raising an error.
# NOTE(review): the schema of `comments` is not visible here. If ItemCode is a
# unique key, re-inserting an item that is already stored (wrongly) is a no-op
# and the bad row is kept. Confirm whether the bad rows are deleted first or
# whether `INSERT OR REPLACE` is what is wanted.
insert_comment = """
INSERT OR IGNORE INTO comments (
    ItemCode,
    Sats,
    Boost,
    Comments,
    Author,
    Tag,
    Timestamp,
    CommentsItemCode
    ) values (?, ?, ?, ?, ?, ?, ?, ?)
"""

# Parameterized insert for one row of `post`: the thirteen `?` placeholders
# are bound positionally, in the column order listed in the statement.
# `INSERT OR IGNORE` makes SQLite skip a row that would violate a constraint
# instead of raising an error.
# NOTE(review): the schema of `post` is not visible here. If ItemCode is a
# unique key, re-inserting an item that is already stored (wrongly) is a no-op
# and the bad row is kept. Confirm whether the bad rows are deleted first or
# whether `INSERT OR REPLACE` is what is wanted.
insert_post = """
INSERT OR IGNORE INTO post (
    Title,
    Category,
    ItemCode,
    Sats,
    Boost,
    Comments,
    Author,
    Tag,
    Timestamp,
    MainLink,
    BodyLinks,
    SatsReceivedComments,
    CommentsItemCode
    ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Parameterized insert for one row of `exceptions`: the three `?` placeholders
# are bound positionally as (RequestResult, ItemCode, Soup).
# `INSERT OR IGNORE` makes SQLite skip a row that would violate a constraint
# instead of raising an error.
insert_exception = """
INSERT OR IGNORE INTO exceptions (
    RequestResult,
    ItemCode,
    Soup
    ) values (?, ?, ?)
"""

### Setup the fixing functions

In [None]:
NA = None

# The author name is the run of ASCII letters/digits that follows '@'
_USERNAME_PATTERN = re.compile(r'@([a-zA-Z0-9]+)')


def _classify_banner_spans(span_texts):
    """Sort the banner span texts into the 'sats', 'boost' and 'author' slots.

    A span that starts with '@' is always treated as the author span. Without
    that rule a username containing "sat" or "boost" (e.g. '@satoshisuncle')
    ends up in the sats/boost slot and the author stays empty.
    """
    fields = {'sats': NA, 'boost': NA, 'author': NA}

    for text in span_texts:
        if text.lstrip().startswith('@'):
            slot = 'author'
        elif 'boost' in text:
            slot = 'boost'
        elif 'sat' in text:  # matches both "sat" and "sats"
            slot = 'sats'
        elif '@' in text:
            slot = 'author'
        else:
            continue

        if slot == 'author':
            match = _USERNAME_PATTERN.search(text)
            # '@' followed by no letter/digit: leave the author untouched
            if match is not None:
                fields['author'] = match.group(1)
        else:
            fields[slot] = text

    return fields


def extract_banner(page):
    """Build a dict with the banner data of an item page.

    Parameters
    ----------
    page : object
        Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns
    -------
    dict
        Keys 'sats', 'boost', 'comments', 'author', 'tag' and 'timestamp'.
        A value that cannot be found on the page is left as ``NA`` (None).
    """
    # `find` returns None when the banner is absent; treat that as "no spans"
    banner = page.find('div', class_='item_other__MjgP3')
    if banner is None:
        span_texts = []
    else:
        span_texts = [span.text for span in banner.find_all('span')]

    classified = _classify_banner_spans(span_texts)

    final_banner = {'sats': classified['sats'],
                    'boost': classified['boost'],
                    'comments': NA,
                    'author': classified['author'],
                    'tag': NA,
                    'timestamp': NA,
                    }

    # Extract the data not extracted yet; both elements are optional
    comments_link = page.find('a', class_='text-reset position-relative')
    if comments_link is not None:
        final_banner['comments'] = comments_link.get_text()

    tag_badge = page.find('span', class_='item_newComment__HSNhq badge')
    if tag_badge is not None:
        final_banner['tag'] = tag_badge.get_text()

    # Per the original note, get_timedate does its own error handling
    final_banner['timestamp'] = item.get_timedate(page)

    return final_banner

In [10]:
#conn = sqlite3.connect('../data/stacker_news.sqlite')
#cur = conn.cursor()

# Dry run: each rebuilt row is printed rather than written, because the
# database connection above is still commented out.
# NOTE(review): the captured output of this cell still shows the author inside
# the sats field and Author == 'None'. Confirm that the `scripts` extractors
# used below are the corrected versions before enabling the inserts.
for i in tqdm(retrieve[1:3]):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object.
        # The timeout keeps one stalled request from blocking the whole run.
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Classify the item once and reuse the answer in every branch
        item_type = item.detect_item_type(i, soup)

        if item_type == 'comment':
            # Parse the banner once instead of once per field
            banner = comment.extract_banner(soup)
            # Row layout matches the column order of `insert_comment`
            entry = (str(i),
                     str(banner['sats']),
                     str(banner['boost']),
                     str(banner['comments']),
                     str(banner['author']),
                     str(banner['tag']),
                     str(banner['timestamp']),
                     str(comment.extract_comment_item_code(soup))
                     )
            print(entry)

        elif item_type == 'link':
            banner = link.extract_banner(soup)
            # Row layout matches the column order of `insert_post`
            entry = (str(link.extract_title(soup)),
                     str(item_type),
                     str(i),
                     str(banner['sats']),
                     str(banner['boost']),
                     str(banner['comments']),
                     str(banner['author']),
                     str(banner['tag']),
                     str(banner['timestamp']),
                     str(link.extract_link(soup)),
                     str(link.extract_body_links(soup)),
                     str(link.extract_comment_stacked(soup)),
                     str(link.extract_comment_item_code(soup))
                     )
            print(entry)

        elif item_type in ('discussion', 'poll', 'bounty'):
            banner = discussion.extract_banner(soup)
            # Row layout matches the column order of `insert_post`;
            # the MainLink slot is left empty for these item types
            entry = (str(discussion.extract_title(soup)),
                     str(item_type),
                     str(i),
                     str(banner['sats']),
                     str(banner['boost']),
                     str(banner['comments']),
                     str(banner['author']),
                     str(banner['tag']),
                     str(banner['timestamp']),
                     None,
                     str(discussion.extract_body_links(soup)),
                     str(discussion.extract_comment_stacked(soup)),
                     str(discussion.extract_comment_item_code(soup))
                     )
            print(entry)

        # Pause briefly on every item code that is a multiple of 1000.
        # Item codes may come back from the database as text, so convert
        # before taking the modulo.
        if int(i) % 1000 == 0:
            #conn.commit()
            time.sleep(0.5)
    except Exception as exc:
        # Best effort: report the failure and move on to the next item.
        # KeyboardInterrupt is not caught, so the run can still be stopped.
        print(f'Error while processing item {i}: {exc!r}')

# Final commit
#conn.commit()

# Close connection to DB
#cur.close()
#conn.close()

 50%|█████     | 1/2 [00:00<00:00,  1.05it/s]

("Why Altcoins aren't copying Taproot. Bitcoin Tech Talk #244", 'link', '34', '@satoshisuncle  14 Jun 2021', 'None', '0 comments', 'None', 'bitcoin', '2021-06-14 18:17:21', 'None', "['https://jimmysong.substack.com/p/why-altcoins-arent-copying-taproot']", 'None', '[]')


100%|██████████| 2/2 [00:01<00:00,  1.16it/s]

("Bitrefill's Work in El Salvador", 'link', '35', '@satoshisuncle  14 Jun 2021', 'None', '0 comments', 'None', 'bitcoin', '2021-06-14 18:23:46', 'None', "['https://twitter.com/bitrefill/status/1402624057120641036']", 'None', '[]')





## Profile scraping

**NB**: this code must be run only after the whole scraping activity has finished, because the complete list of unique authors (`unique(author)`) is needed to scrape every user profile in the forum.

**The `unique(author)` must be the result of a `UNION ALL` between the tables.**

In [None]:
# Every distinct author name found in either the comments or the post table
query = """
SELECT DISTINCT Author
FROM (
    SELECT Author
    FROM comments
    UNION ALL
    SELECT Author
    FROM post
     );
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
try:
    sql_query = pd.read_sql(query, conn)
finally:
    # Release the database handle even when the query fails
    conn.close()

# NOTE(review): rows whose Author is the text 'None' are returned here as
# well — confirm whether they should be filtered out before profile scraping.
result = pd.DataFrame(sql_query,
                      columns=['Author'])


In [None]:
conn = sqlite3.connect('../data/stacker_news.sqlite')
cur = conn.cursor()

# WARNING: destructive. Running this cell drops any existing `user` table,
# together with its rows, before creating an empty one.
# Every column is stored as TEXT; `User` is the primary key.
sql_user = """
DROP TABLE IF EXISTS user;
CREATE TABLE user (
    User TEXT,
    TotalStacked TEXT,
    FirstItem TEXT,
    HatStreak TEXT,
    NumItems TEXT,
    PRIMARY KEY (User))
"""

try:
    cur.executescript(sql_user)
    conn.commit()
finally:
    # Release the database handle even when the script fails
    conn.close()

In [None]:
# Parameterized insert for one row of `user`: the five `?` placeholders are
# bound positionally as (User, TotalStacked, FirstItem, HatStreak, NumItems).
# This is a plain INSERT (no `OR IGNORE`): since `user` declares
# PRIMARY KEY (User), inserting a user that is already stored raises an error.
insert_user = """
INSERT INTO user (
    User,
    TotalStacked,
    FirstItem,
    HatStreak,
    NumItems
    ) values (?, ?, ?, ?, ?)
"""

In [None]:
conn = sqlite3.connect('../data/stacker_news.sqlite')
cur = conn.cursor()

# Authors whose profile could not be scraped or stored, with the reason
failed_users = []

try:
    for i in tqdm(result['Author']):
        try:
            profile_data = user.get_profile(i)
            # The first five profile fields, stringified, in the column
            # order expected by `insert_user`
            entry = tuple(str(profile_data[field]) for field in range(5))
            cur.execute(insert_user, entry)
        except Exception as exc:
            # Best effort: remember the failure and move on to the next
            # author. KeyboardInterrupt is not caught, so the run can
            # still be stopped by hand.
            failed_users.append((i, repr(exc)))
finally:
    # Keep whatever was inserted so far and always release the database
    # handle, even when the loop is interrupted
    conn.commit()
    cur.close()
    conn.close()

if failed_users:
    print(f'{len(failed_users)} profiles could not be stored; see `failed_users`')