In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

# Visualization packages
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

# Append system path
sys.path = [p for p in sys.path if not p.endswith('../..')]  # Cleans duplicated '../..'
sys.path.insert(0, '../')  # This adds `src` to the path

from helpers import io, filters, constants
from analysis import util
from web_analysis import parse_robots
from web_analysis import robots_util


%load_ext autoreload
%autoreload 2

In [None]:
# Task 1 (user content, domain, purpose, illicit)
# Task 2 (paywall, ads, modalities)

In [None]:
# Load head and random for Task 1, Task 2
# Load URLToken class to grab heads or randoms
# Classify domain and purpose, including "Other"
# Compute score (chi squared) wrt each class/domain/etc.
# [Domain/Purpose (10), Paywall (1), Ads (1), Illicit (1), User Content (1), Modalities (3)] --> CS, % Top-100, Top-2k, Random.

# Stacked bar chart (y-axis: market, x-axis: num tokens, color: commercial). 
# Second chart: WildChat breakdown
# [Commercial, Market/Purpose, Num Tokens/Websites?]
# Categorize website into: []

In [2]:
url_token_lookup = robots_util.URLTokenLookup('pretrain_data/relevant_url_token_counts.csv') # 'c4', 'rf', 'dolma'
c4_url_to_counts = url_token_lookup.get_url_to_token_map("c4")

In [38]:
def extract_row_info(row):
    """
    Extract available information from a row of the DataFrame.
    Returns a dictionary with the available information.
    """
    row_info = {}

    # Extract information from columns if present
    for col in ["Website Issue", "User Content", "Terms of Use Link 1", "Terms of Use Link 2", "Terms of Use Link 3", "Terms of Use Link 4", "Terms of Use Link 5",
                "Paywall", "Content Modalities: Text", "Content Modalities: Images", "Content Modalities: Video", "Content Modalities: Audio",
                "Advertisements", "Website Issue User Content", "Website Description", "Content Domain I", "Content Domain II",
                "Content Domain III", "Content Domain (other)", "Type of service", "Type of service (Other)",
                "Sensitive content: Nudity", "Sensitive content: Pornography", "Sensitive content: Drugs", "Sensitive content: Violence",
                "Sensitive content: Illegal Activities", "Sensitive content: Hate Speech"]:
        if col in row:
            row_info[col] = row[col]

    return row_info

url_to_info = defaultdict(dict)
for fpath in io.listdir_nohidden("annotated_websites/Task 1") + io.listdir_nohidden("annotated_websites/Task 2"):
    # print(fpath)
    df = pd.read_csv(fpath).fillna("")
    overwrite_attempts = 0
    for _, row in df.iterrows():
        domain = row["Domain"]
        row_info = extract_row_info(row)

        if domain in url_to_info:
            overwrite_attempts += 1
            url_to_info[domain].update(row_info)
        else:
            url_to_info[domain] = row_info
    # print(f"Overwrite attempts for {fpath}: {overwrite_attempts}")





In [39]:
len(url_to_info)

6663

In [41]:
ks = list(url_to_info.keys())
url_to_info[ks[0]]

{'Website Issue': False,
 'User Content': '',
 'Website Description': 'Website containing celebrity news, gossip, and entertainment.',
 'Content Domain I': 'News',
 'Content Domain II': 'Entertainment',
 'Content Domain III': '',
 'Content Domain (other)': '',
 'Type of service': 'Periodicals',
 'Type of service (Other)': '',
 'Sensitive content: Nudity': False,
 'Sensitive content: Pornography': False,
 'Sensitive content: Drugs': False,
 'Sensitive content: Violence': False,
 'Sensitive content: Illegal Activities': False,
 'Sensitive content: Hate Speech': False,
 'Terms of Use Link 1': 'https://radaronline.com/s/terms-of-service/',
 'Terms of Use Link 2': 'https://radaronline.com/s/privacy-policy/',
 'Terms of Use Link 3': '',
 'Terms of Use Link 4': '',
 'Terms of Use Link 5': '',
 'Paywall': 'No',
 'Content Modalities: Text': True,
 'Content Modalities: Images': True,
 'Content Modalities: Video': True,
 'Content Modalities: Audio': False,
 'Advertisements': True}

In [42]:
domain_cols = ["Content Domain I", "Content Domain II","Content Domain III"] #, "Content Domain (other)"]
domain_ocol = ["Content Domain (other)"]
all_domain_cols = ["Content Domain I", "Content Domain II","Content Domain III", "Content Domain (other)"]
typ_cols = ["Type of service"]
typ_ocol = ["Type of service (Other)"]
def analyze_cols(url_to_info, cols, mapper):
    url_to_vals = defaultdict(list)
    for url, infos in url_to_info.items():
        if infos["Website Issue"] or not infos['User Content']:
            continue
        vals = set()
        for col in cols:
            if col in infos:
                vals.add(util.CONTENT_DOMAIN_INVERSE_MAPPING.get(row, "Other"))
        url_to_vals[url] = list(vals)
    return url_to_vals

analyze_cols(url_to_info, all_domain_cols, util.CONTENT_DOMAIN_INVERSE_MAPPING)
# analyze_cols(url_to_info, domain_ocol)
# print()
# analyze_cols(url_to_info, typ_cols)
# analyze_cols(url_to_info, typ_ocol)

TypeError: unhashable type: 'Series'

In [None]:
# Take in row.
# Map from each column value to relevant column category.
# lower and strip
# each row only gets a set of categories, not double counting

In [None]:
# 
# util.WEBSITE_SERVICE_INVERSE_MAPPING
    
    

In [22]:
a = {
    'News/Periodicals': [
        'Periodicals', 'Journal', 'News site providing video, audio, and stories',
        'News and lifestyle site providing video, audio, and stories', 'News site providing video and stories',
        'News and lifestyle site for Kitsap providing video, audio, and stories', 'News articles and reviews for biking',
        'Australian news and financial site providing video, audio, and stories', 'News and broadcasting site providing video, audio, and stories',
        'News articles about different social trends', 'News and articles for comic books',
        'News and technology site', 'Jamaica news, lifestyle, and entertainment articles', 'Daily tips',
        'News and entertainment site for pop culture', 'Radio Station', 'News articles and reviews for biking',
        'Electronic publishing sit containing a variety of content', 'Broadcasting and radio shows while communicating with listeners',
        'Contains a wide variety of different books, audio, and articles', 'Lifestyle magazine',
    ],
    'Organization/personal website': [
        'Company website', 'Find companies for different home services, cleaning, and other needs.', 'Personal website',
        'Business / Finance', 'Find and hire freelancers', 'Business directory', 'Organization', 'Charity',
    ],
    'Encyclopedia/Database': [
        'Encyclopedia/Database', 'Portal', 'Search for books', 'Religion-based library', 'Information',
        'Mail archive', 'Compilation of different sermons', 'Search engine', 'Search tool', 'Article directory',
    ],
    'Ecommerce': [
        'Ecommerce', 'E-Commerce', 'Buy and sell different books online'
    ],
    'Academic': [
        'Academic', 'Education', 'Diabetes medical information and articles',
        'USGPO articles and information'
    ],
    'Social Media/Forums': [
        'Social Media/Forums', 'social media', 'forum', 'forums', 'content and forum', 'Personal blog', 'Blog posts portal',
        'Also contains a forum', 'Forums and community for mothers', 'Contains user uploaded videos', 'Users rate different movies, tv, and video games',
        'Reviews', 'Review portal', 'Advice',
    ],
    'Government': [
        'Government',
    ],
    'Blog': [
        'Blog', 'blogs', 'personal blog', 'Personal blog', 'Fan fiction for fantasy', 'Poetry sharing, reading, and articles',
        'Lifestyle blog', 
    ],
}
b = {
    'Government': [
        'Government', 'Articles and comments about Australian government and politics'
    ],
    'Periodicals/News': [
        'TV Station', 'Middle East news articles', 'Telegraph India news articles', 'News site for different medical and life sciences articles',
        'India news articles and videos', 'News for tabletop and rpg games and community forums', 'Contains articles about current events in Utah',
        'Contains articles about current events in Sarasota', 'A news and lifestyle website for Cape Cod and state of Massachusetts',
        'Current events in Utah and news articles', 'Technology newsletter and articles'
    ],
    'Organization/personal website': [
        'Nonprofit website', 'NGO', 'personal site', 'Personal Blog', 'A member and association organization for bartenders',
        'A website where users can upload and download books and other publications', 'Non Profit'
    ],
    'Encyclopedia/Database': [
        'Encyclopedia', 'patents website', 'Contains a repository of different fantasy novels and books',
        'Compilation of books, historic and other academic works at Tufts', 'Compilation of tumblr blogs', 'Compilation of different medical research papers',
        'Oncology research papers and archives', 'Search', 'User can access coupon codes for different services', 'jobs database',

    ],
    'Ecommerce': [
        'independent sellers platform', 'booking platform', 'Travel website where users can book flights, transportation, hotels, and other accommodations',
        'Plant and gardening store', 'A website for a software company providing coding and other services','Find local apartments to rent'
         'Travel and Navigation Services', 
    ],
    'Academic': [
        'Educational study tools', 'A wide variety of technological articles and newsletters', 'Clinical and medical podcasts, blogs, news, and videos',
        'Financial reports, filings, news, and transcripts', 'Provides testing for dna, drugs, and other things'
    ],
    'Social Media/Forums': [
        'Forums', 'old forum no longer available', 'and forum', 'Forum for mobile developers and others to ask questions about phones and accessories.',
        'A forum for relationship and dating advice with some crafted articles', 'A forum where users can ask questions about the unity game engine',
        'A forum for obisidian games where users can discuss games and community announcements', 'Gaming community forum where users can comment on gaming videos and have discussions',
        'An internet archive capture of a gaming article', 'Video game, community forum, and wikipedia', 'Users can find local caregivers for different needs',
        'Users talk about different unsolved mysteries and related news', 'The 4chan forum', 'Video game community and forums',
        'Information about pregnancy and community forums', 'Forum for users to discuss lucid dreaming experiences', 'Stack exchange forum for physics questions and answers',
        'A forum for users to ask questions about metalworking and model engineering', 'Game news forum', 'review platform', 'Question/Answering portal',
        'Find user reviews for different products', 'Fan art website', 'Find local area businesses', 'law (student) network',

    ],
    'Blog': [
        'blog style website for food', 'Religious blog style website', 'Blog directory', 'Personal blog', 'Lifestyle portal', 'Lifestyle', 'Self help site'
    ],
}
c = {
    'Government': [
        'Government website containing agenda and publications', 'Government records', 'Governmental information tracking'
    ],
    'Periodicals/News': [
        'Local news articles for Colorado', 'News articles for Ghana', 'News articles, podcasts, and radio about current events',
        'News, weather, and general info', 'News website and blog', 'Video game news', 'Video game reviews',
        'News and Search Services', 'Books,ebooks,magazine', 'entertainment tv channel',
    ],
    'Organization/personal website': [
        'Information about the services offered by the manufacturing company', 'Company locations for firestone automotive parts and services',
        'investment management', 'Employee Training', 'Financial Planning', 'Restaurant', 'Road Running',
        'Camera and Security Systems', 'Insurance Services', 'Packaging and supplies services', 'Personal Injury Law'
        'Graphics and web designing', 'Chiropractic, Nutrition, therapies services', 'Property Selling', 'CharityAutism',
        'Finance advise & services', 'Financial information', 'Non-profit', 'Dental Health', 'school website', 'Tech storage solutions',

    ],
    'Encyclopedia/Database': [
        'Compilation of different podcasts', 'Database of sermons', 'Legal database', 'Legal Research Database', 'Wikipedia skin for easier reading',
        'Wiki site for creative writing', 'Wiki site for fan translation', 'online library', 'tool for web traffic overview'
    ],
    'Ecommerce': [
        'Bookings website for sailing trips', 'Travel bookings', 'Travel guide', 'Travel Guide', 'travel website with curated blogs',
        'Hair Treatment', 'Wholesale', 'Computer repairing', 'Designing and editing', 'Book self-publishing website', 'Business Apps',
        'Hotel',
    ],
    'Academic': [
        'Medical and biotech research for immunology', 'Journal,health'
    ],
    'Social Media/Forums': [
        'Forum for different rockstar videogames', 'Forum for users to discuss body building, fitness, and trt',
        'Forums for discussing the video game warframe', 'Forum for tech news, computers, and other forms of hardware',
        'Health articles for different chronic illnesses and community forums', 'Social media / forums', 'Social Media',
        'Social media book reviews website', 'Social Media for tech', 'Blogs/forum for building things',
        'Crowdfunding platform', 'Tech product discussion'
    ],
    'Blog': [
        'Music and information about a Dutch band', 'blog style website for food', 'Blog,Podcast', 'Climate Blog',
        'Entertainment Blog', 'Blog platform', 'blog ', 'Blogs', 'Archive of technology blog posts', 'blog site of a religion group', 

    ]
}
d = {
    'Government': [
        'sustainability website with news and tips'  # Sustainability often relates to government policy on environment.
    ],
    'Periodicals/News': [
        'non-official news media', 'News and reviews portal', 'Lifestyle articles portal', 'News and insights',
        'News collection', 'non-periodical news', 'Periodical', 'Mix of news and fan content', 'News site',
        'Opinion pieces on news stories', 'Indian consumer technology news', 'Argentinian racing content', 'Regional sports news',
        'religious magazine', 'Podcasts', 'Energy price news',
    ],
    'Organization/personal website': [
        'nonprofit organization', 'Organization website', 'Internation Organization website', 'Local sport league organization website',
        'Personal Website', 'personal webpage', 'personal website', 'Religious organization website', 'Nonprofit Website',
        'Sports website for a university', 'Artist website', 'Social/Human rights', 'Astrology website'
    ],
    'Encyclopedia/Database': [
        'Internet RFC/STD/FYI/BCP document archives', 'Law resources', '& Encyclopedia', 'Tracking GIT changes for GNU OS',
        'Collection of photography guides', 'Education/Knowledge'
    ],
    'Ecommerce': [
        'product website', 'Physical store directory', 'Paywalled business analysis', 'Coupon codes', 'gambling website',
        'Streaming services',
    ],
    'Academic': [
        'academic journal website', 'education/ courses', 'Academic opportunity search', 'Educational resources + review aggregator'
    ],
    'Social Media/Forums': [
        'online community', 'reddit posts', 'Social media/forums', 'Gaming website', 'Also forum/social', 'User forums', 'Eritrean analysis forum',
        'Review site', 'Review site / photo aggregator', 'Review',
    ],
    'Blog': [
        'blog site for gear review', 'novels website', 'Adult stories', 'Blog system', '(single-poster) Blog', 'Blog?', 'Technology blog',
        'Entertaiment blog', 'Entertainment biographies', 'Christian opinion website'
    ],
}

In [25]:
main_d = defaultdict(list)
for dd in [a,b,c,d]:
    for k in dd:
        main_d[k].extend(dd[k])

for k, v in main_d.items():
    main_d[k] = sorted(list(set(v)))


In [26]:
main_d

defaultdict(list,
            {'News/Periodicals': ['Australian news and financial site providing video, audio, and stories',
              'Broadcasting and radio shows while communicating with listeners',
              'Contains a wide variety of different books, audio, and articles',
              'Daily tips',
              'Electronic publishing sit containing a variety of content',
              'Jamaica news, lifestyle, and entertainment articles',
              'Journal',
              'Lifestyle magazine',
              'News and articles for comic books',
              'News and broadcasting site providing video, audio, and stories',
              'News and entertainment site for pop culture',
              'News and lifestyle site for Kitsap providing video, audio, and stories',
              'News and lifestyle site providing video, audio, and stories',
              'News and technology site',
              'News articles about different social trends',
              'News