# Data collection activities
All data collection activities are automated through user-defined functions stored in the `scripts` folder.

In [17]:
import csv
import os

import requests as requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from scripts import user, item, discussion, link, poll, bounty, comment

# Item scraping
The following code saves item data into CSV files (one per item type) for a set of item codes chosen by the operator.

First of all, we need to initialize all the files used for data collection.

## Initialization of csv files

In [18]:
# Discussion items
file_path_discussion = "../data/discussion.csv"
row_head_discussion = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create the output folder on first use so that a fresh checkout without
# `../data` does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(file_path_discussion), exist_ok=True)

# Mode 'w' truncates any previous file: re-running this cell resets the file
# to the header row only.
with open(file_path_discussion, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_discussion)

In [19]:
# Link items
file_path_link = "../data/link.csv"
# Link items are the only ones with a "Main link" column.
row_head_link = [
    "Title",
    "Item code",
    "Banner data",
    "Main link",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create the output folder on first use so that a fresh checkout without
# `../data` does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(file_path_link), exist_ok=True)

# Mode 'w' truncates any previous file: re-running this cell resets the file
# to the header row only.
with open(file_path_link, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_link)


In [20]:
# Poll items
file_path_poll = "../data/poll.csv"
row_head_poll = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create the output folder on first use so that a fresh checkout without
# `../data` does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(file_path_poll), exist_ok=True)

# Mode 'w' truncates any previous file: re-running this cell resets the file
# to the header row only.
with open(file_path_poll, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_poll)


In [21]:
# Bounty items
file_path_bounty = "../data/bounty.csv"
row_head_bounty = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create the output folder on first use so that a fresh checkout without
# `../data` does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(file_path_bounty), exist_ok=True)

# Mode 'w' truncates any previous file: re-running this cell resets the file
# to the header row only.
with open(file_path_bounty, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_bounty)

In [22]:
# Comment items
file_path_comment = "../data/comment.csv"
# Same six columns as the other item files; the scraping loop writes None in
# the "Title" column of comment rows.
row_head_comment = [
    "Title",
    "Item code",
    "Banner data",
    "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Create the output folder on first use so that a fresh checkout without
# `../data` does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(file_path_comment), exist_ok=True)

# Mode 'w' truncates any previous file: re-running this cell resets the file
# to the header row only.
with open(file_path_comment, 'w', encoding='utf_8_sig', newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(row_head_comment)

In [23]:
# Try to scrape 200 distinct, randomly chosen items rather than in 'progressive item mode'
from random import sample, seed

SAMPLE_SEED = 42         # fixed seed: Restart & Run All draws the same item codes every time
ITEM_CODE_STOP = 200000  # exclusive upper bound of the item codes to draw from
N_SAMPLED_ITEMS = 200    # how many item codes to scrape

seed(SAMPLE_SEED)
# sample() accepts a range directly, so the candidate codes need not be copied into a list first.
sampled_items = sample(range(1, ITEM_CODE_STOP), N_SAMPLED_ITEMS)


In [24]:
# Show the sampled item codes that the scraping loops below iterate over
print(sampled_items)

[195186, 131987, 96228, 168486, 34084, 61040, 13642, 34997, 70000, 161797, 11857, 131108, 100009, 102030, 22342, 52330, 97406, 83214, 73305, 72146, 179040, 121389, 4015, 82212, 100508, 115818, 188635, 37610, 167215, 83205, 6755, 114225, 155042, 30247, 44014, 51285, 182029, 181169, 47373, 192326, 13699, 160947, 191113, 2236, 129422, 130605, 123085, 33306, 104316, 8843, 27591, 117498, 109648, 62999, 158888, 164721, 157447, 65616, 189758, 137285, 97859, 138452, 53172, 93220, 74164, 179856, 196339, 194106, 193659, 161757, 21166, 90805, 149857, 49875, 173230, 134527, 171880, 194973, 97058, 45918, 63601, 64447, 42739, 148356, 28329, 158244, 100576, 147319, 72696, 16178, 188402, 37412, 50732, 193589, 73401, 83306, 150111, 4658, 126444, 70217, 188921, 63667, 102548, 132079, 90343, 39999, 24475, 112846, 133053, 35520, 50306, 148246, 50433, 30547, 5528, 164571, 102380, 135640, 77478, 119182, 180472, 155062, 167194, 97836, 132406, 142470, 89112, 4587, 101936, 75502, 75923, 67109, 199127, 83248, 1

## First attempt

In [25]:
# Scrape every sampled item and append one row to the CSV file matching its type.

# Item type (as returned by item.detect_item_type) -> (scraper module, destination CSV).
# Items whose type is not listed here are skipped without writing anything.
item_targets = {
    'comment': (comment, file_path_comment),
    'discussion': (discussion, file_path_discussion),
    'link': (link, file_path_link),
    'poll': (poll, file_path_poll),
    'bounty': (bounty, file_path_bounty),
}
request_timeout = 30  # seconds; without a timeout a stalled connection would block the loop forever
failed_items = []     # (item code, error) pairs for the items that could not be scraped or saved

for i in tqdm(sampled_items):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Detect the item type once instead of once per branch
        item_type = item.detect_item_type(i, soup)
        if item_type not in item_targets:
            continue
        scraper, file_path = item_targets[item_type]

        # Comment rows are written with an empty "Title" column
        title = None if item_type == 'comment' else scraper.extract_title(soup)
        entry = [title, str(i), scraper.extract_banner(soup)]
        if item_type == 'link':
            # Only link rows carry the extra "Main link" column
            entry.append(scraper.extract_link(soup))
        entry.extend([
            scraper.extract_body_links(soup),
            scraper.extract_comment_stacked(soup),
            scraper.extract_comment_item_code(soup),
        ])
    except Exception as error:
        # `except Exception` instead of a bare `except`: KeyboardInterrupt now
        # stops the loop, while per-item failures are recorded rather than
        # silently dropped.
        failed_items.append((i, repr(error)))
        continue

    # Append the new item to the CSV file of its type
    try:
        with open(file_path, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(entry)
    except Exception as error:
        print('Error while processing data')
        failed_items.append((i, repr(error)))

# One summary line instead of a message per item keeps the cell output short
if failed_items:
    print(f'{len(failed_items)} of {len(sampled_items)} items could not be scraped or saved '
          '(details in `failed_items`)')


100%|██████████| 200/200 [02:17<00:00,  1.46it/s]


## Fixing the resulting csv and the data structure

### Columns to be used for all the post item scraping

- Title
- Item code
- Banner data
- Main link
- Body links
- Sats received by comments
- Comment item code

### Columns to be used for the comment item scraping
- Item code
- Banner data
- Comment item code

**NB**: the `Comment item code` column in the comment item table could even be dropped; we keep it so that, if needed, we can trace the relationship between a comment and the replies to that comment.

In [None]:
# Scrape every sampled item and append one row to the CSV file matching its type.
# NOTE(review): this cell appends to the same CSV files as the "First attempt"
# loop, so running both stores every item twice — confirm this is intended.

# Item type (as returned by item.detect_item_type) -> (scraper module, destination CSV).
# Items whose type is not listed here are skipped without writing anything.
item_targets = {
    'comment': (comment, file_path_comment),
    'discussion': (discussion, file_path_discussion),
    'link': (link, file_path_link),
    'poll': (poll, file_path_poll),
    'bounty': (bounty, file_path_bounty),
}
request_timeout = 30  # seconds; without a timeout a stalled connection would block the loop forever
failed_items = []     # (item code, error) pairs for the items that could not be scraped or saved

for i in tqdm(sampled_items):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Detect the item type once instead of once per branch
        item_type = item.detect_item_type(i, soup)
        if item_type not in item_targets:
            continue
        scraper, file_path = item_targets[item_type]

        # Comment rows are written with an empty "Title" column
        title = None if item_type == 'comment' else scraper.extract_title(soup)
        entry = [title, str(i), scraper.extract_banner(soup)]
        if item_type == 'link':
            # Only link rows carry the extra "Main link" column
            entry.append(scraper.extract_link(soup))
        entry.extend([
            scraper.extract_body_links(soup),
            scraper.extract_comment_stacked(soup),
            scraper.extract_comment_item_code(soup),
        ])
    except Exception as error:
        # `except Exception` instead of a bare `except`: KeyboardInterrupt now
        # stops the loop, while per-item failures are recorded rather than
        # silently dropped.
        failed_items.append((i, repr(error)))
        continue

    # Append the new item to the CSV file of its type
    try:
        with open(file_path, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(entry)
    except Exception as error:
        print('Error while processing data')
        failed_items.append((i, repr(error)))

# One summary line instead of a message per item keeps the cell output short
if failed_items:
    print(f'{len(failed_items)} of {len(sampled_items)} items could not be scraped or saved '
          '(details in `failed_items`)')
