# Data collection activities
All data collection activities are automated with user-defined functions, which are available in the `scripts` folder.

In [6]:
from bs4 import BeautifulSoup
import requests as requests
from scripts import user, item, discussion, link, poll, bounty, comment
import csv
from tqdm import tqdm
import sqlite3

# Item scraping
The following code saves item data into CSV files, given a set of item codes fixed by the operator.

First of all we need to initialize all the files for data collection

## Initialization of csv files

In [None]:
# Post items: create (or truncate) the post CSV file and write its header.
file_path_post = "../data/post.csv"

# Column names of the post table.
row_head_post = [
    "Title",
    "Category",
    "Item code",
    "Banner data",
    "Main link",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# 'utf_8_sig' prefixes the file with a BOM; newline="" leaves line endings
# to the csv module.
with open(file_path_post, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csv.writer(csvfile).writerow(row_head_post)

In [None]:
# Comment items: create (or truncate) the comment CSV file and write its
# header.
file_path_comment = "../data/comment.csv"

# Column names of the comment table.
row_head_comment = [
    "Item code",
    "Banner data",
    "Comments item code",
]

# 'utf_8_sig' prefixes the file with a BOM; newline="" leaves line endings
# to the csv module.
with open(file_path_comment, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csv.writer(csvfile).writerow(row_head_comment)

In [None]:
# Scrape a random sample of 200 distinct item codes rather than walking the
# codes in 'progressive item mode'.
from random import sample

# `sample` accepts a range directly, so the candidate codes
# (1 to 199999 inclusive) never have to be materialised as a list.
sampled_items = sample(range(1, 200000), 200)


In [None]:
# Display the sampled item codes.
print(sampled_items)

## Csv and data structure

### Columns to be used for all the post item scraping

- Title
- Category
- Item code
- Banner data
- Main link
- Body links
- Sats received by comments
- Comment item code

### Columns to be used for the comment item scraping
- Item code
- Banner data
- Comment item code

**NB**: the `comment item code` column in the comment item table could even be dropped; we keep it so that we can later reconstruct the relationship between a comment and the replies to that specific comment.

In [None]:
# for i in tqdm(sampled_items):
#     try:
#         # Provided a string returns a bs4.BeautifulSoup object
#         url_posts = f'https://stacker.news/items/{i}'
#         response = requests.get(url_posts)
#         response.raise_for_status()
#         soup = BeautifulSoup(response.text, 'html.parser')
#         
#         if item.detect_item_type(i, soup)=='comment':
#             entry = [str(i),
#                      comment.extract_banner(soup),
#                      comment.extract_comment_item_code(soup)
#                      ]
#             
#             # Appends every new profile to a csv file in the provided path
#             try:
#                 with open(file_path_comment, 'a', encoding='utf_8_sig', newline="") as csvfile:
#                     csvwriter = csv.writer(csvfile)
#                     csvwriter.writerow(entry)
#             except:
#                 print('Error while processing data')
# 
#         elif item.detect_item_type(i, soup)=='link':
#             entry = [link.extract_title(soup),
#                      item.detect_item_type(i,soup),
#                      str(i),
#                      link.extract_banner(soup),
#                      link.extract_link(soup),
#                      link.extract_body_links(soup),
#                      link.extract_comment_stacked(soup),
#                      link.extract_comment_item_code(soup)
#                      ]
#             # Appends every new profile to a csv file in the provided path
#             try:
#                 with open(file_path_post, 'a', encoding='utf_8_sig', newline="") as csvfile:
#                     csvwriter = csv.writer(csvfile)
#                     csvwriter.writerow(entry)
#             except:
#                 print('Error while processing data')
#         
#         elif item.detect_item_type(i, soup)=='discussion':
#             entry = [discussion.extract_title(soup),
#                      item.detect_item_type(i,soup),
#                      str(i),
#                      discussion.extract_banner(soup),
#                      None,
#                      discussion.extract_body_links(soup),
#                      discussion.extract_comment_stacked(soup),
#                      discussion.extract_comment_item_code(soup)
#                      ]
#             
#             # Appends every new profile to a csv file in the provided path
#             try:
#                 with open(file_path_post, 'a', encoding='utf_8_sig', newline="") as csvfile:
#                     csvwriter = csv.writer(csvfile)
#                     csvwriter.writerow(entry)
#             except:
#                 print('Error while processing data')
#                         
#         elif item.detect_item_type(i, soup)=='poll':
#             entry = [poll.extract_title(soup),
#                      item.detect_item_type(i, soup),
#                      str(i),
#                      poll.extract_banner(soup),
#                      None,
#                      poll.extract_body_links(soup),
#                      poll.extract_comment_stacked(soup),
#                      poll.extract_comment_item_code(soup)
#                      ]
#             # Appends every new profile to a csv file in the provided path
#             try:
#                 with open(file_path_post, 'a', encoding='utf_8_sig', newline="") as csvfile:
#                     csvwriter = csv.writer(csvfile)
#                     csvwriter.writerow(entry)
#             except:
#                 print('Error while processing data')
#         
#         elif item.detect_item_type(i, soup)=='bounty':
#             entry = [bounty.extract_title(soup),
#                      item.detect_item_type(i, soup),
#                      str(i),
#                      bounty.extract_banner(soup),
#                      None,
#                      bounty.extract_body_links(soup),
#                      bounty.extract_comment_stacked(soup),
#                      bounty.extract_comment_item_code(soup)
#                      ]
#             # Appends every new profile to a csv file in the provided path
#             try:
#                 with open(file_path_post, 'a', encoding='utf_8_sig', newline="") as csvfile:
#                     csvwriter = csv.writer(csvfile)
#                     csvwriter.writerow(entry)
#             except:
#                 print('Error while processing data')
#         
#     except:
#         continue


### Optimizing for the similarities between item post types

In the following chunk the extraction functions are grouped by similarity, leading to three blocks:
- Comment
- Link
- Discussion/poll/bounty


In [None]:
# Scrape every sampled item and append one row per item to the matching CSV
# file: comments go to `file_path_comment`; links, discussions, polls and
# bounties go to `file_path_post`. Items of any other type are ignored.

# Seconds to wait for the server on each request. Without a timeout
# `requests.get` may block forever on an unresponsive connection.
REQUEST_TIMEOUT = 30


def _append_csv_row(path, row):
    """Append `row` as one record to the CSV file at `path`.

    The file is opened and closed for every row, so rows that were already
    written are on disk even if the loop stops later. A failed write is
    reported and otherwise ignored, so a single bad row does not abort the
    scrape.
    """
    try:
        with open(path, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csv.writer(csvfile).writerow(row)
    except (OSError, csv.Error, UnicodeError):
        print('Error while processing data')


# (item code, exception) pairs for items that could not be fetched or parsed.
skipped_items = []

for i in tqdm(sampled_items):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object.
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Classify the page once and reuse the result for every branch and
        # for the "Category" column.
        # NOTE(review): assumes detect_item_type returns the same value when
        # called repeatedly on the same page - confirm in the `scripts` package.
        item_type = item.detect_item_type(i, soup)

        if item_type == 'comment':
            _append_csv_row(file_path_comment, [
                str(i),
                comment.extract_banner(soup),
                comment.extract_comment_item_code(soup),
            ])

        elif item_type == 'link':
            _append_csv_row(file_path_post, [
                link.extract_title(soup),
                item_type,
                str(i),
                link.extract_banner(soup),
                link.extract_link(soup),
                link.extract_body_links(soup),
                link.extract_comment_stacked(soup),
                link.extract_comment_item_code(soup),
            ])

        elif item_type in ('discussion', 'poll', 'bounty'):
            # Polls and bounties are scraped with the discussion extractors.
            # The "Main link" column is written as None, which the csv module
            # stores as an empty field.
            _append_csv_row(file_path_post, [
                discussion.extract_title(soup),
                item_type,
                str(i),
                discussion.extract_banner(soup),
                None,
                discussion.extract_body_links(soup),
                discussion.extract_comment_stacked(soup),
                discussion.extract_comment_item_code(soup),
            ])

    except Exception as err:
        # Best-effort scrape: a failed download or extraction skips the item.
        # `Exception` (not a bare `except`) is caught because the extractors'
        # failure modes are not visible here, while KeyboardInterrupt still
        # stops the loop.
        skipped_items.append((i, err))

if skipped_items:
    print(f'Skipped {len(skipped_items)} of {len(sampled_items)} items')


--------------------------------------------------------------------------------
## Saving with SQLite 

In [19]:
# Scrape a random sample of 40 distinct item codes rather than walking the
# codes in 'progressive item mode'.
from random import sample

# `sample` accepts a range directly, so the candidate codes
# (1 to 199999 inclusive) never have to be materialised as a list.
sampled_items = sample(range(1, 200000), 40)

In [20]:
# (Re)create the `comments` and `post` tables in the SQLite database.
# WARNING: both tables are dropped first, so re-running this cell erases any
# rows stored in them.

sql_comment = """
DROP TABLE IF EXISTS comments;
CREATE TABLE comments (
    ItemCode TEXT,
    BannerData TEXT,
    CommentsItemCode TEXT,
    PRIMARY KEY (ItemCode))
"""

sql_post = """
DROP TABLE IF EXISTS post;
CREATE TABLE post (
    Title TEXT,
    Category TEXT,
    ItemCode TEXT,
    BannerData TEXT,
    MainLink TEXT,
    BodyLinks TEXT,
    SatsReceivedComments TEXT,
    CommentsItemCode TEXT,
    PRIMARY KEY (ItemCode))
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
cur = conn.cursor()

try:
    cur.executescript(sql_comment)
    cur.executescript(sql_post)
    conn.commit()
finally:
    # Always release the cursor and the database file, even when one of the
    # scripts fails, so a failed run does not leave the connection open.
    cur.close()
    conn.close()

In [21]:
# Parameterised INSERT for the `comments` table: the three `?` placeholders
# are bound, in column order, to the tuple passed to `cur.execute`.
insert_comment = """
INSERT INTO comments (
    ItemCode,
    BannerData,
    CommentsItemCode
    ) values (?, ?, ?)
"""

# Parameterised INSERT for the `post` table: eight `?` placeholders, listed
# in the same column order as the table definition.
insert_post = """
INSERT INTO post (
    Title,
    Category,
    ItemCode,
    BannerData,
    MainLink,
    BodyLinks,
    SatsReceivedComments,
    CommentsItemCode
    ) values (?, ?, ?, ?, ?, ?, ?, ?)
"""

In [14]:
# conn = sqlite3.connect('../data/stacker_news.sqlite')
# cur = conn.cursor()
# 
# for i in tqdm(sampled_items):
#     try:
#         # Provided a string returns a bs4.BeautifulSoup object
#         url_posts = f'https://stacker.news/items/{i}'
#         response = requests.get(url_posts)
#         response.raise_for_status()
#         soup = BeautifulSoup(response.text, 'html.parser')
#         
#         if item.detect_item_type(i, soup)=='comment':
#             # Insert every new entry into a new row in the provided DB
#             entry = (str(i),
#                      str(comment.extract_banner(soup)),
#                      str(comment.extract_comment_item_code(soup))
#                      )
#             try:
#                 cur.execute(insert_comment, entry)
#             except:
#                 print('Error while processing data')
#     except:
#         continue
# 
# conn.commit()
# cur.close()
# conn.close()

100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


## SQLite process extended to the whole script

Note that every value bound to the SQL statements is converted to a string; only the `None` values (`NULL` in SQL) are left as is.

In [22]:
# Scrape every sampled item and insert it into the SQLite database: comments
# go to the `comments` table; links, discussions, polls and bounties go to
# the `post` table. Items of any other type are ignored.

# Seconds to wait for the server on each request. Without a timeout
# `requests.get` may block forever on an unresponsive connection.
REQUEST_TIMEOUT = 30

# (item code, exception) pairs for items that could not be fetched or parsed.
skipped_items = []

conn = sqlite3.connect('../data/stacker_news.sqlite')
cur = conn.cursor()

try:
    for i in tqdm(sampled_items):
        try:
            # Download the item page and parse it into a bs4.BeautifulSoup
            # object.
            url_posts = f'https://stacker.news/items/{i}'
            response = requests.get(url_posts, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Classify the page once and reuse the result for every branch
            # and for the Category column.
            # NOTE(review): assumes detect_item_type returns the same value
            # when called repeatedly on the same page - confirm in the
            # `scripts` package.
            item_type = item.detect_item_type(i, soup)

            # NOTE(review): str() turns a None returned by an extractor into
            # the text 'None', not SQL NULL; only the literal None used for
            # MainLink below is stored as NULL - confirm this is intended.
            if item_type == 'comment':
                statement = insert_comment
                entry = (
                    str(i),
                    str(comment.extract_banner(soup)),
                    str(comment.extract_comment_item_code(soup)),
                )

            elif item_type == 'link':
                statement = insert_post
                entry = (
                    str(link.extract_title(soup)),
                    str(item_type),
                    str(i),
                    str(link.extract_banner(soup)),
                    str(link.extract_link(soup)),
                    str(link.extract_body_links(soup)),
                    str(link.extract_comment_stacked(soup)),
                    str(link.extract_comment_item_code(soup)),
                )

            elif item_type in ('discussion', 'poll', 'bounty'):
                # Polls and bounties are scraped with the discussion
                # extractors; MainLink is stored as NULL for these types.
                statement = insert_post
                entry = (
                    str(discussion.extract_title(soup)),
                    str(item_type),
                    str(i),
                    str(discussion.extract_banner(soup)),
                    None,
                    str(discussion.extract_body_links(soup)),
                    str(discussion.extract_comment_stacked(soup)),
                    str(discussion.extract_comment_item_code(soup)),
                )

            else:
                continue

            # Insert the entry as a new row; a failed insert (e.g. a
            # duplicate ItemCode primary key) is reported and skipped.
            try:
                cur.execute(statement, entry)
            except sqlite3.Error:
                print('Error while processing item ', i)

        except Exception as err:
            # Best-effort scrape: a failed download or extraction skips the
            # item. `Exception` (not a bare `except`) is caught because the
            # extractors' failure modes are not visible here, while
            # KeyboardInterrupt still stops the loop.
            skipped_items.append((i, err))

    # Rows are persisted only once the whole sample has been processed.
    conn.commit()
finally:
    # Always release the cursor and the database file, even if the loop is
    # interrupted.
    cur.close()
    conn.close()

if skipped_items:
    print(f'Skipped {len(skipped_items)} of {len(sampled_items)} items')

100%|██████████| 40/40 [00:56<00:00,  1.42s/it]
