Author: Brian Erichsen Fagundes
MSD CS 6017 - Summer - 2024
Homework 3: Scraping and Regression

Part 1 Data Acquisiton

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import re


# Create arrays of data to be scraped
ranks = []
title_lengths = []
age_in_hours = []
points = []
comments_numbers = []

for i in range(5):
    url="http://news.ycombinator.com/?p=" + str(i+1)
# Access website contents
    with urllib.request.urlopen (url) as response:
        html=response.read()
        html=html.decode("utf-8")

# Saves html content into file
    with open ("hackernews" + str(i+1) + ".html", "w") as new_file:
        new_file.write(html)

# Parses html content into a soup
    soup = BeautifulSoup(html, 'html.parser')
    
# Scrapes rank data
    for post in soup.find_all(class_="rank"):
        rank = str(post.text)
        rank = rank.replace('.', '')
        ranks.append(int(rank))

# Loads length of title data
    for title in soup.find_all(class_="titleline"):
        title_lengths.append(len(title.text))

# Loads age data
    for age in soup.find_all(class_="age"):
        age_str =  str(age.text)
        age_str =  age_str.removesuffix(" hours ago")
        age_str = age_str.removesuffix(" hour ago")

        if " day ago" in age_str:
            age_in_hours.append(24)
        elif " days ago" in age_str:
            string = age_str.replace(" days ago", "")
            num = int(string)
            age_in_hours.append(num*24)
        elif " minute ago" in age_str or " minutes ago" in age_str:
            age_in_hours.append(0)
        else:
            age_in_hours.append(int(age_str))

    for subtext in soup.find_all(class_="subtext"):
        point = 0
        comments = 0
        for score in subtext.find_all(class_="score"):
            point = int(re.search(r'\d+', str(score.text)).group())

        for tag in subtext.find_all("a"):
            if tag.text.endswith("comments"):
                a_= str(tag.text)
                comments = int(re.search(r'\d+', a_).group())

        points.append(int(point))
        comments_numbers.append(int(comments))

data_frame = pd.DataFrame({
    "Rank" : ranks, "Title Length" : title_lengths, "Age (hours)" : age_in_hours,
    "Points" : points, "Comments" : comments_numbers
})
print(data_frame.head())


   Rank  Title Length  Age (hours)  Points  Comments
0     1            43            2      82        32
1     2            57            7     230        38
2     3            72           11     289       108
3     4            79           20     505       245
4     5            44            3      18         0


In [28]:
# save data into file so we don't have to load data each time
data_frame.to_csv("hacker_news_stories.csv", index=False)

Part 2 - Regression

In [4]:
import statsmodels.api as sm

dataframe = pd.read_csv("hacker_news_stories.csv")
x = data_frame['Points']
y = data_frame['Rank']

x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Rank   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     3.276
Date:                Tue, 04 Jun 2024   Prob (F-statistic):             0.0723
Time:                        21:48:32   Log-Likelihood:                -776.42
No. Observations:                 150   AIC:                             1557.
Df Residuals:                     148   BIC:                             1563.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         79.9964      4.309     18.566      0.0

In [7]:
x = data_frame[['Comments', 'Age (hours)', 'Points', 'Title Length']]
x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Rank   R-squared:                       0.200
Model:                            OLS   Adj. R-squared:                  0.178
Method:                 Least Squares   F-statistic:                     9.081
Date:                Tue, 04 Jun 2024   Prob (F-statistic):           1.42e-06
Time:                        22:00:22   Log-Likelihood:                -761.30
No. Observations:                 150   AIC:                             1533.
Df Residuals:                     145   BIC:                             1548.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           49.5688     13.011      3.810   

In [8]:
from sklearn.preprocessing import PolynomialFeatures

# polynomial regression model: Rank ~ Comments + Comments^2 + Title Length + Title Length^2
poly = PolynomialFeatures(degree=2, include_bias=False)
x_poly = poly.fit_transform(data_frame[['Comments', 'Title Length']])
x = pd.DataFrame(x_poly, columns=poly.get_feature_names_out(['Comments', 'Title Length']))
x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Rank   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     2.227
Date:                Tue, 04 Jun 2024   Prob (F-statistic):             0.0548
Time:                        22:08:54   Log-Likelihood:                -772.48
No. Observations:                 150   AIC:                             1557.
Df Residuals:                     144   BIC:                             1575.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    79.03

In [9]:
import numpy as np

# inverse linear model
x = 1 / data_frame['Points']
x = sm.add_constant(x.replace([np.inf, -np.inf], 0))
model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Rank   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     15.84
Date:                Tue, 04 Jun 2024   Prob (F-statistic):           0.000107
Time:                        22:15:17   Log-Likelihood:                -770.44
No. Observations:                 150   AIC:                             1545.
Df Residuals:                     148   BIC:                             1551.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         65.9446      4.148     15.898      0.0