# Data collection activities
All data collection activities are automated through user-defined functions, which can be found in the `scripts` folder.

In [1]:
from bs4 import BeautifulSoup
import requests as requests
from scripts import item 
from scripts import discussion 
from scripts import link
from scripts import comment
from scripts import user
import csv
from tqdm import tqdm
import sqlite3
import pandas as pd

In [None]:
# Smoke test: fetch one known item page and show the raw text of the <span>
# elements found inside its banner <div>.
url_posts = 'https://stacker.news/items/21'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url_posts, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# NOTE(review): 'item_other__MjgP3' looks like a build-generated class name and
# may change when the site is redeployed — confirm it is still current.
banner = soup.find('div', class_='item_other__MjgP3')

# soup.find returns None when nothing matches; fall back to an empty list
# instead of failing with AttributeError on `None.find_all`.
if banner is not None:
    partial_banner_data = [span.text for span in banner.find_all('span')]
else:
    partial_banner_data = []

print(partial_banner_data)

# Item scraping
The following code saves item data to a CSV file, given a range of item codes chosen by the operator.

First of all we need to initialize all the files for data collection

## Initialization of csv files

In [None]:
# Post items: create (or overwrite) the CSV file and write its header row
file_path_post = "../data/post.csv"

# Column names, in the order in which the fields of each post row are written
row_head_post = [
    "Title", "Category", "Item code",
    "Sats", "Boost", "Comments",
    "Author", "Tag", "Timestamp",
    "Main link", "Body links",
    "Sats received by comments",
    "Comments item code",
]

# Mode 'w' truncates an existing file; utf_8_sig writes a BOM before the header
with open(file_path_post, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csv.writer(csvfile).writerow(row_head_post)

In [None]:
# Comment items: create (or overwrite) the CSV file and write its header row
file_path_comment = "../data/comment.csv"

# Column names, in the order in which the fields of each comment row are written
row_head_comment = [
    "Item code",
    "Sats", "Boost", "Comments",
    "Author", "Tag", "Timestamp",
    "Comments item code",
]

# Mode 'w' truncates an existing file; utf_8_sig writes a BOM before the header
with open(file_path_comment, mode='w', encoding='utf_8_sig', newline="") as csvfile:
    csv.writer(csvfile).writerow(row_head_comment)

In [None]:
# Scrape a small random sample of item codes rather than walking through the
# codes in 'progressive item mode'.
from random import sample

# `sample` accepts any sequence, so the range is passed directly instead of
# first being copied into a list. Draws 10 distinct codes from 1..199999;
# no seed is set, so each run yields a different sample.
sampled_items = sample(range(1, 200000), 10)


In [None]:
# Show the sampled item codes that the scraping loop will request
print(sampled_items)

## Csv and data structure

### Columns to be used for all the post item scraping

- Title
- Category
- Item code
- Banner data
- Main link
- Body links
- Sats received by comments
- Comment item code

### Columns to be used for the comment item scraping
- Item code
- Banner data
- Comment item code

**NB**: the `comment item code` column in the comment item table could be dropped; we keep it so that the relationship between a comment and the replies to that comment can still be reconstructed if needed.

### Optimizing for the similarities between item post types

In the following chunk, the functions applied are grouped by similarity, leading to three blocks:
- Comment
- Link
- Discussion/poll/bounty


In [None]:
# Scrape every sampled item and append one row to the matching CSV file:
# comments go to `file_path_comment`, links and discussion/poll/bounty items
# go to `file_path_post`. Items that cannot be fetched or parsed are skipped
# and reported once at the end.

# Item codes that could not be fetched or parsed
failed_items = []

for i in tqdm(sampled_items):
    try:
        # Download the item page and parse it into a bs4.BeautifulSoup object.
        # The timeout prevents one unresponsive request from stalling the run.
        url_posts = f'https://stacker.news/items/{i}'
        response = requests.get(url_posts, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Classify the page and read its banner once per item.
        # assumes detect_item_type/extract_banner return the same value when
        # called repeatedly on the same soup — TODO confirm against `scripts`
        item_type = item.detect_item_type(i, soup)

        if item_type == 'comment':
            banner = comment.extract_banner(soup)
            entry = [str(i),
                     banner['sats'],
                     banner['boost'],
                     banner['comments'],
                     banner['author'],
                     banner['tag'],
                     banner['timestamp'],
                     comment.extract_comment_item_code(soup)
                     ]
            target_path = file_path_comment

        elif item_type == 'link':
            banner = link.extract_banner(soup)
            entry = [link.extract_title(soup),
                     item_type,
                     str(i),
                     banner['sats'],
                     banner['boost'],
                     banner['comments'],
                     banner['author'],
                     banner['tag'],
                     banner['timestamp'],
                     link.extract_link(soup),
                     link.extract_body_links(soup),
                     link.extract_comment_stacked(soup),
                     link.extract_comment_item_code(soup)
                     ]
            target_path = file_path_post

        elif item_type in ('discussion', 'poll', 'bounty'):
            banner = discussion.extract_banner(soup)
            entry = [discussion.extract_title(soup),
                     item_type,
                     str(i),
                     banner['sats'],
                     banner['boost'],
                     banner['comments'],
                     banner['author'],
                     banner['tag'],
                     banner['timestamp'],
                     None,  # these item types have no main link column value
                     discussion.extract_body_links(soup),
                     discussion.extract_comment_stacked(soup),
                     discussion.extract_comment_item_code(soup)
                     ]
            target_path = file_path_post

        else:
            # Any other detected type has no destination file: nothing to write
            continue

    except Exception:
        # Best effort: skip the item but remember it. `Exception` (rather than
        # a bare except) lets Ctrl-C / KeyboardInterrupt stop the loop.
        failed_items.append(i)
        continue

    # Append the new row to the CSV file selected above
    try:
        with open(target_path, 'a', encoding='utf_8_sig', newline="") as csvfile:
            csv.writer(csvfile).writerow(entry)
    except Exception:
        print('Error while processing data')

if failed_items:
    print(f'{len(failed_items)} item(s) skipped (request or parsing failed): {failed_items}')


--------------------------------------------------------------------------------
## Saving with SQLite 

In [None]:
# Scrape a random sample of item codes rather than walking through the codes
# in 'progressive item mode'.
from random import sample

# `sample` accepts any sequence, so the range is passed directly instead of
# first being copied into a list. Draws 100 distinct codes from 1..249999;
# no seed is set, so each run yields a different sample.
sampled_items = sample(range(1, 250000), 100)

In [None]:
# (Re)create the three tables used by the item scraper.
# WARNING: each script starts with DROP TABLE IF EXISTS, so running this cell
# again deletes everything previously stored in these tables.

# One row per comment item, keyed by its item code
sql_comment = """
DROP TABLE IF EXISTS comments;
CREATE TABLE comments (
    ItemCode TEXT,
    Sats TEXT,
    Boost TEXT,
    Comments TEXT,
    Author TEXT,
    Tag TEXT,
    Timestamp TEXT,
    CommentsItemCode TEXT,
    PRIMARY KEY (ItemCode))
"""

# One row per post item, keyed by its item code
sql_post = """
DROP TABLE IF EXISTS post;
CREATE TABLE post (
    Title TEXT,
    Category TEXT,
    ItemCode TEXT,
    Sats TEXT,
    Boost TEXT,
    Comments TEXT,
    Author TEXT,
    Tag TEXT,
    Timestamp TEXT,
    MainLink TEXT,
    BodyLinks TEXT,
    SatsReceivedComments TEXT,
    CommentsItemCode TEXT,
    PRIMARY KEY (ItemCode))
"""

# One row per item whose scraping failed, keyed by its item code
sql_exception = """
DROP TABLE IF EXISTS exceptions;
CREATE TABLE exceptions(
    RequestResult TEXT,
    ItemCode TEXT,
    Soup TEXT,
    PRIMARY KEY (ItemCode))
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
try:
    cur = conn.cursor()
    cur.executescript(sql_comment)
    cur.executescript(sql_post)
    cur.executescript(sql_exception)
    conn.commit()
finally:
    # Always release the database file, even if one of the scripts fails
    conn.close()

In [None]:
# Parameterised INSERT for the `comments` table: eight `?` placeholders, one
# per listed column. OR IGNORE makes SQLite skip a row that would violate a
# constraint (such as a duplicate primary key) instead of raising an error.
insert_comment = """
INSERT OR IGNORE INTO comments (
    ItemCode,
    Sats,
    Boost,
    Comments,
    Author,
    Tag,
    Timestamp,
    CommentsItemCode
    ) values (?, ?, ?, ?, ?, ?, ?, ?)
"""

# Parameterised INSERT for the `post` table: thirteen `?` placeholders, one
# per listed column; conflicting rows are skipped as above.
insert_post = """
INSERT OR IGNORE INTO post (
    Title,
    Category,
    ItemCode,
    Sats,
    Boost,
    Comments,
    Author,
    Tag,
    Timestamp,
    MainLink,
    BodyLinks,
    SatsReceivedComments,
    CommentsItemCode
    ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Parameterised INSERT for the `exceptions` table: three `?` placeholders, one
# per listed column; conflicting rows are skipped as above.
insert_exception = """
INSERT OR IGNORE INTO exceptions (
    RequestResult,
    ItemCode,
    Soup
    ) values (?, ?, ?)
"""

Note that every value passed to the SQL statements is converted to a string with `str()`; only literal `None` values (`NULL` in SQL) are passed through unchanged.

In [None]:
# Scrape every sampled item and insert it into the SQLite database: comments
# go to `comments`, links and discussion/poll/bounty items go to `post`, and
# items whose request or parsing fails are logged in `exceptions`.

# Commit after this many processed items, so that a failure late in a long
# run does not lose everything scraped so far.
commit_every = 5000
# Seconds to wait for the server before giving up on a single request.
request_timeout = 30

conn = sqlite3.connect('../data/stacker_news.sqlite')
cur = conn.cursor()

try:
    for n_processed, i in enumerate(tqdm(sampled_items), start=1):
        # Reset per-item state so that a failed request can never be logged
        # together with the response or soup of the previous item.
        response = None
        soup = None

        try:
            # Download the item page and parse it into a bs4.BeautifulSoup object
            url_posts = f'https://stacker.news/items/{i}'
            response = requests.get(url_posts, timeout=request_timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Classify the page and read its banner once per item.
            # assumes detect_item_type/extract_banner return the same value
            # when called repeatedly on the same soup — TODO confirm
            item_type = item.detect_item_type(i, soup)

            if item_type == 'comment':
                banner = comment.extract_banner(soup)
                entry = (str(i),
                         str(banner['sats']),
                         str(banner['boost']),
                         str(banner['comments']),
                         str(banner['author']),
                         str(banner['tag']),
                         str(banner['timestamp']),
                         str(comment.extract_comment_item_code(soup))
                         )
                statement, label = insert_comment, 'comment'

            elif item_type == 'link':
                banner = link.extract_banner(soup)
                entry = (str(link.extract_title(soup)),
                         str(item_type),
                         str(i),
                         str(banner['sats']),
                         str(banner['boost']),
                         str(banner['comments']),
                         str(banner['author']),
                         str(banner['tag']),
                         str(banner['timestamp']),
                         str(link.extract_link(soup)),
                         str(link.extract_body_links(soup)),
                         str(link.extract_comment_stacked(soup)),
                         str(link.extract_comment_item_code(soup))
                         )
                statement, label = insert_post, 'link'

            elif item_type in ('discussion', 'poll', 'bounty'):
                banner = discussion.extract_banner(soup)
                entry = (str(discussion.extract_title(soup)),
                         str(item_type),
                         str(i),
                         str(banner['sats']),
                         str(banner['boost']),
                         str(banner['comments']),
                         str(banner['author']),
                         str(banner['tag']),
                         str(banner['timestamp']),
                         None,  # MainLink is stored as NULL for these item types
                         str(discussion.extract_body_links(soup)),
                         str(discussion.extract_comment_stacked(soup)),
                         str(discussion.extract_comment_item_code(soup))
                         )
                statement, label = insert_post, 'post'

            else:
                # Any other detected type has no destination table
                statement = None

            if statement is not None:
                # Insert the new entry as a row of the selected table
                try:
                    cur.execute(statement, entry)
                except sqlite3.Error:
                    print(f'Error while inserting the {label} item {i} in the database')

        except Exception:
            # Best effort: record what is known about the failed item and move
            # on. `Exception` (rather than a bare except) lets Ctrl-C stop the
            # loop. Parts that were never obtained are stored as NULL.
            exception_entry = (
                None if response is None else str(response),
                str(i),
                None if soup is None else str(soup)
            )
            try:
                cur.execute(insert_exception, exception_entry)
            except sqlite3.Error:
                print(f'Error while logging the failed item {i} in the database')

        # Periodic commit based on how many items were processed; the item
        # codes themselves are random, so they cannot drive the schedule.
        if n_processed % commit_every == 0:
            conn.commit()
finally:
    # Keep whatever was scraped, then always release the database file
    try:
        conn.commit()
    finally:
        cur.close()
        conn.close()

Now let's save the SQL query results into a Python object for further manipulation.

# Profile scraping

**NB**: this code must be run after the whole scraping activity has finished, because a `unique(author)` list is needed in order to scrape all the user profiles in the forum.

**The `unique(author)` must be the result of a `UNION ALL` between the tables.**

In [2]:
# Collect every distinct author name found in either the `comments` or the
# `post` table into a one-column DataFrame named `result`.
query = """
SELECT DISTINCT Author
FROM (
    SELECT Author
    FROM comments
    UNION ALL
    SELECT Author
    FROM post
     );
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
try:
    sql_query = pd.read_sql(query, conn)
finally:
    # Always release the database file, even if the query fails
    conn.close()

result = pd.DataFrame(sql_query,
                      columns=['Author'])


Then this list can be used as an input for the profile scraping script in order to retrieve all the user profiles and insert them into a new `TABLE`.

Now we need to add a new `table` for the Profiles

In [3]:
# (Re)create the `user` table that stores one row per scraped profile.
# WARNING: the script starts with DROP TABLE IF EXISTS, so running this cell
# again deletes every profile stored so far.
sql_user = """
DROP TABLE IF EXISTS user;
CREATE TABLE user (
    User TEXT,
    TotalStacked TEXT,
    FirstItem TEXT,
    HatStreak TEXT,
    NumItems TEXT,
    PRIMARY KEY (User))
"""

conn = sqlite3.connect('../data/stacker_news.sqlite')
try:
    cur = conn.cursor()
    cur.executescript(sql_user)
    conn.commit()
finally:
    # Always release the database file, even if the script fails
    conn.close()

In [4]:
# Parameterised INSERT for the `user` table: five `?` placeholders, one per
# listed column. OR IGNORE (as in the other insert statements of this
# notebook) makes SQLite skip a row whose primary key already exists instead
# of raising an IntegrityError.
insert_user = """
INSERT OR IGNORE INTO user (
    User,
    TotalStacked,
    FirstItem,
    HatStreak,
    NumItems
    ) values (?, ?, ?, ?, ?)
"""

In [5]:
# Quick check on a handful of authors (slice 1:10 of the Author column):
# each profile is requested and the returned value is discarded.
preview_authors = result['Author'][1:10]
for author in preview_authors:
    user.get_profile(author)

In [6]:
# Fetch the profile of every distinct author and store it in the `user`
# table. Authors whose profile cannot be fetched, parsed or stored are
# skipped and reported once at the end.
conn = sqlite3.connect('../data/stacker_news.sqlite')
cur = conn.cursor()

# Authors that could not be processed
failed_authors = []

try:
    for author in tqdm(result['Author']):
        try:
            profile_data = user.get_profile(author)
            # The first five fields are bound, in order, to the placeholders
            # of `insert_user`.
            # presumably user, total stacked, first item, hat streak and
            # number of items — verify against user.get_profile
            entry = tuple(str(profile_data[k]) for k in range(5))
            cur.execute(insert_user, entry)
        except Exception:
            # Best effort: skip this author but remember it. `Exception`
            # (rather than a bare except) lets Ctrl-C stop the loop.
            failed_authors.append(author)
            continue
finally:
    # Keep whatever was scraped, then always release the database file
    try:
        conn.commit()
    finally:
        cur.close()
        conn.close()

if failed_authors:
    print(f'{len(failed_authors)} profile(s) skipped (fetch, parse or insert failed)')

100%|██████████| 5806/5806 [1:02:07<00:00,  1.56it/s]
