In [33]:
import sys
import os
import glob
import csv
import json
import pandas as pd
import requests
import codecs
import time
import gzip
from io import BytesIO
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer as ss
from nltk.stem import WordNetLemmatizer as wn
#from nltk.stem import LancasterStemmer as ls

In [8]:
def saveFile(fileName, content, encoding=None):
    """Write ``content`` to ``fileName``, replacing any existing file.

    Parameters
    ----------
    fileName : str
        Path of the file to create or overwrite. ``open`` does not create
        missing parent directories, so they must already exist.
    content : str
        Text to write.
    encoding : str, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform default encoding.
    """
    # The context manager closes the handle even when write() raises.
    with open(fileName, "w", encoding=encoding) as handle:
        handle.write(content)

In [9]:
# Short code for each sport; processDataMain uses it to build the
# processed-file name prefix.
fnConfig = dict(
    golf='g',
    baseball='bsb',
    basketball='bkb',
    football='fb',
)

In [22]:
# Maps the query name passed to ccMain to the sport folder its articles
# are saved under.
ccConfig = dict(mlb="baseball", nfl="football", nba="basketball", golf="golf")

In [12]:
def nytMain(query, folder, timeout=30):
    """Download the New York Times articles listed in a CSV and save their text.

    Reads ``urls/<query>-nyt-links.csv``, fetches every address in its ``URL``
    column and, for each page that contains a ``<section name="articleBody">``
    with at least one ``<p>``, writes the paragraph text (one paragraph per
    line) to ``<folder>/nyt/<query>/<row position>.txt``.

    Parameters
    ----------
    query : str
        Sport name used in both the CSV name and the output folder,
        e.g. "golf", "baseball", "basketball" or "football".
    folder : str
        Root output folder; ``<folder>/nyt/<query>/`` must already exist.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that URL.
    """
    links = pd.read_csv("urls/" + query + "-nyt-links.csv")
    result_urls = links.URL
    print("Total New York Times URLs : " + str(len(result_urls)))
    # The file number is the URL's position in the CSV, so it advances even
    # for URLs that are skipped.
    for counter, url in enumerate(result_urls):
        fileName = folder + "/nyt/" + query + "/" + str(counter) + ".txt"
        try:
            # Without a timeout a single stalled server blocks the whole run.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable or malformed URL must not abort the batch.
            print("Skipping {} : {}".format(url, err))
            continue
        if response.status_code != 200:
            continue
        parser = BeautifulSoup(response.content, 'html.parser')
        # The article text lives in the section tag named "articleBody".
        article = parser.find("section", {"name": "articleBody"})
        if not article:
            continue
        paras = article.find_all("p")
        if len(paras) > 0:
            content = "".join(p.text.strip() + "\n" for p in paras)
            saveFile(fileName, content)

In [14]:
# Scrape the NYT articles for three sports into new_data/.
nytMain(query="football", folder="new_data")
nytMain(query="baseball", folder="new_data")
nytMain(query="basketball", folder="new_data")

Total New York Times URLs : 634
Total New York Times URLs : 531
Total New York Times URLs : 750


In [15]:
def getCCUrls(domain, index_list, timeout=60):
    """Collect Common Crawl index records that match ``domain``.

    Parameters
    ----------
    domain : str
        URL pattern placed in the ``url=`` query parameter,
        e.g. ``"usatoday.com/story/sports/golf/2019/01/*"``.
    index_list : iterable of str
        Crawl identifiers such as ``"2019-04"``; each one is queried as
        ``CC-MAIN-<id>-index``.
    timeout : float, optional
        Seconds to wait for each index request.

    Returns
    -------
    list of dict
        One decoded JSON object per non-empty response line, across all
        indexes that answered with HTTP 200.
    """
    record_list = []
    for index in index_list:
        print("Current Index : " + index)
        cc_url = "http://index.commoncrawl.org/CC-MAIN-" + index + "-index?url=" + domain + "&output=json"
        print("Current URL : " + cc_url)
        try:
            response = requests.get(cc_url, timeout=timeout)
        except requests.RequestException as err:
            # Keep whatever the other indexes return instead of aborting.
            print("Request failed, skipping index {} : {}".format(index, err))
            continue
        if response.status_code != 200:
            continue
        # The body is one JSON document per line.
        for record in response.content.splitlines():
            if not record.strip():
                # json.loads would raise on an empty line.
                continue
            record_list.append(json.loads(record))
    print("# Records Found : " + str(len(record_list)))
    return record_list

In [16]:
def getHtmlDoc(record, timeout=60):
    """Fetch one capture from Common Crawl and return its response body.

    The record's ``offset`` and ``length`` select a byte range of the file
    named by ``filename``. That range is downloaded, gunzipped, decoded as
    UTF-8 and split on blank lines into three parts; the third part (the
    payload after the two header sections) is returned.

    Parameters
    ----------
    record : dict
        Index record with ``offset``, ``length`` and ``filename`` keys.
    timeout : float, optional
        Seconds to wait for the download.

    Returns
    -------
    str
        The payload text, or ``""`` when the download fails, the bytes are
        not valid gzip, the data is empty or not valid UTF-8, or it does not
        contain all three sections.
    """
    import zlib  # local import: only needed for the decompression error type

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    try:
        resp = requests.get(
            prefix + record['filename'],
            headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
            timeout=timeout,
        )
    except requests.RequestException:
        return ""
    # A successful Range request normally answers 206 Partial Content.
    # Error replies are not gzip, so bail out before trying to decompress.
    if resp.status_code not in (200, 206):
        return ""
    try:
        with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
            data = f.read()
    except (OSError, EOFError, zlib.error):
        # Not a gzip stream, or a truncated or corrupt one.
        return ""
    if not data:
        return ""
    try:
        warc, header, response = data.strip().decode(encoding='utf-8', errors='strict').split('\r\n\r\n', 2)
    except ValueError:
        # Covers UnicodeDecodeError (non-UTF-8 page) and a failed unpack
        # (fewer than three sections).
        return ""
    return response

In [23]:
def ccMain(query, folder, domains, index_list=None):
    """Save the article text of Common Crawl captures matching ``domains``.

    Looks up index records for every pattern in ``domains``, skips plain
    ``http`` captures and URL paths already handled, downloads each remaining
    capture and, when it contains a ``<div itemprop="articleBody">`` with
    ``<p class="p-text">`` paragraphs, writes them (one per line) to
    ``<folder>/cc/<ccConfig[query]>/<n>.txt``.

    Parameters
    ----------
    query : str
        Key of ``ccConfig`` (e.g. "nfl"); selects the output sub-folder.
    folder : str
        Root output folder; the target sub-folder must already exist.
    domains : iterable of str
        URL patterns handed to ``getCCUrls``.
    index_list : list of str, optional
        Crawl identifiers to search. Defaults to
        ``["2019-04", "2019-09", "2019-13"]``.
    """
    if index_list is None:
        index_list = ["2019-04", "2019-09", "2019-13"]
    record_list = list()
    for domain in domains:
        record_list += getCCUrls(domain, index_list)
    print("Total Common Crawl URLs : ", len(record_list))

    result_urls = list()
    seen_paths = set()  # set membership is O(1); the list scan was O(n)
    # counter numbers the output files; it advances for every https record,
    # including duplicates that are not saved.
    counter = 0
    for record in record_list:
        url = urlparse(record['url'])
        if url.scheme == "http":
            continue
        fileName = folder + "/cc/" + ccConfig[query] + "/" + str(counter) + ".txt"
        # Drop the query string. Slicing at find('?') cut off the last
        # character whenever the URL had no '?' (find returns -1).
        strippedUrl = url.geturl().split('?', 1)[0]
        urlPath = url.path
        if urlPath not in seen_paths:
            seen_paths.add(urlPath)
            result_urls.append(strippedUrl)
            html_content = getHtmlDoc(record)
            # Name the parser explicitly, as nytMain does, so the parse does
            # not depend on which optional parsers are installed.
            parser = BeautifulSoup(html_content, 'html.parser')
            article = parser.find("div", {"itemprop": "articleBody"})
            if article:
                paras = article.find_all("p", {"class": "p-text"})
                if len(paras) > 0:
                    content = "".join(p.text.strip() + "\n" for p in paras)
                    saveFile(fileName, content)
        counter += 1
    print(len(result_urls))

In [19]:
cQuery = "golf"
# One USA Today URL pattern per month, January through March 2019.
domains = [
    "usatoday.com/story/sports/" + cQuery + "/2019/" + month + "/*"
    for month in ("01", "02", "03")
]
ccMain(cQuery, "new_data", domains)

Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/golf/2019/01/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/golf/2019/01/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/golf/2019/01/*&output=json
# Records Found : 96
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/golf/2019/02/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/golf/2019/02/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/golf/2019/02/*&output=json
# Records Found : 104
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-0

In [24]:
cQuery = "nfl"
# One USA Today URL pattern per month, January through March 2019.
domains = [
    "usatoday.com/story/sports/" + cQuery + "/2019/" + month + "/*"
    for month in ("01", "02", "03")
]
ccMain(cQuery, "new_data", domains)

Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/nfl/2019/01/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/nfl/2019/01/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/nfl/2019/01/*&output=json
# Records Found : 309
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/nfl/2019/02/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/nfl/2019/02/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/nfl/2019/02/*&output=json
# Records Found : 101
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-ind

In [25]:
cQuery = "mlb"
# One USA Today URL pattern per month, January through March 2019.
domains = [
    "usatoday.com/story/sports/" + cQuery + "/2019/" + month + "/*"
    for month in ("01", "02", "03")
]
ccMain(cQuery, "new_data", domains)

Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/mlb/2019/01/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/mlb/2019/01/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/mlb/2019/01/*&output=json
# Records Found : 215
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/mlb/2019/02/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/mlb/2019/02/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/mlb/2019/02/*&output=json
# Records Found : 87
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-inde

In [26]:
cQuery = "nba"
# One USA Today URL pattern per month, January through March 2019.
domains = [
    "usatoday.com/story/sports/" + cQuery + "/2019/" + month + "/*"
    for month in ("01", "02", "03")
]
ccMain(cQuery, "new_data", domains)

Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/nba/2019/01/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/nba/2019/01/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/nba/2019/01/*&output=json
# Records Found : 150
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-index?url=usatoday.com/story/sports/nba/2019/02/*&output=json
Current Index : 2019-09
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-09-index?url=usatoday.com/story/sports/nba/2019/02/*&output=json
Current Index : 2019-13
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-13-index?url=usatoday.com/story/sports/nba/2019/02/*&output=json
# Records Found : 74
Current Index : 2019-04
Current URL : http://index.commoncrawl.org/CC-MAIN-2019-04-inde

In [37]:
# The 26 lowercase ASCII letters; single-letter tokens are dropped during cleaning.
letters = list("abcdefghijklmnopqrstuvwxyz")

In [38]:
def stemSentence(sentence):
    """Return ``sentence`` with every token replaced by its WordNet lemma.

    The sentence is split with ``word_tokenize`` and each token is passed
    through ``WordNetLemmatizer.lemmatize``. Every lemma is followed by a
    single space, so a non-empty result ends with a trailing space.
    """
    tokens = word_tokenize(sentence)
    lemmatizer = wn()
    return "".join(lemmatizer.lemmatize(token) + " " for token in tokens)

In [39]:
def processArticles(files, dest, fn):
    """Clean each article file and save the result as ``<dest><fn><i>.txt``.

    Each line is lower-cased, then URLs, underscores and digits are removed.
    The remainder is split into word tokens; English stop words, digit-only
    tokens and single letters are dropped, and what is left is passed to
    ``stemSentence``. One output line is written per input line.

    Parameters
    ----------
    files : iterable of str
        Paths of the raw article text files.
    dest : str
        Output path prefix (normally a folder ending in "/").
    fn : str
        File-name prefix; the file's position in ``files`` and ".txt" are
        appended to it.
    """
    # Build the exclusion set once; stopwords.words() used to be called
    # again for every single word.
    excluded = set(stopwords.words('english')) | set(letters)
    for counter, f in enumerate(files):
        print("Processing File # {}".format(counter), end="\r", flush=True)
        refinedFinal = ""
        # "with" closes the input even if cleaning or lemmatizing raises.
        with open(f, "r") as sample:
            for line in sample:
                lowerLine = line.lower()
                noUrls = re.sub(r"http\S+", "", lowerLine)
                noUnderscore = noUrls.replace("_", "")
                # remove digits
                noNoLine = re.sub(r'\d+', '', noUnderscore)
                # keep word characters only, which drops punctuation
                words = re.findall(r'\w+', noNoLine, flags=re.UNICODE)
                # remove stop words, digit-only tokens and single letters
                important_words = [w for w in words if w not in excluded and not w.isdigit()]
                refined = " ".join(important_words)
                # reduce the remaining words to their root form
                refinedFinal += stemSentence(refined)
                refinedFinal += "\n"
        saveFile(dest + fn + str(counter) + ".txt", refinedFinal)

In [40]:
def processTweets(csvFile, dest):
    """Clean the tweets in ``csvFile`` and save the unique results to one file.

    Reads the ``tweet_text`` column and cleans every tweet: lower-case it,
    remove URLs, underscores and digits, drop English stop words, digit-only
    tokens and single letters, then pass it to ``stemSentence``. The first
    occurrence of each distinct cleaned tweet is written, one per line, to
    ``<dest><csv base name>.txt``.

    Parameters
    ----------
    csvFile : str
        Path of the CSV file; it must have a ``tweet_text`` column.
    dest : str
        Output path prefix (normally a folder ending in "/").
    """
    fn = csvFile.split(".csv")[0].split("/")[-1]
    df = pd.read_csv(csvFile)
    # Build the exclusion set once; stopwords.words() used to be called
    # again for every single word.
    excluded = set(stopwords.words('english')) | set(letters)
    seen = set()   # O(1) duplicate check
    kept = []      # unique cleaned tweets, in first-seen order
    count = 0
    for tweet in df.tweet_text:
        if not isinstance(tweet, str):
            # Empty cells are read as NaN (a float) and have no .lower().
            continue
        print("Processing Tweet # {}".format(count), end="\r", flush=True)
        lowerLine = tweet.lower()
        noUrls = re.sub(r"http\S+", "", lowerLine)
        noUnderscore = noUrls.replace("_", "")
        # remove digits
        noNoLine = re.sub(r'\d+', '', noUnderscore)
        # keep word characters only, which drops punctuation
        words = re.findall(r'\w+', noNoLine, flags=re.UNICODE)
        # remove stop words, digit-only tokens and single letters
        important_words = [w for w in words if w not in excluded and not w.isdigit()]
        refined = " ".join(important_words)
        # reduce the remaining words to their root form
        refinedStemmed = stemSentence(refined)
        if refinedStemmed in seen:
            continue
        seen.add(refinedStemmed)
        kept.append(refinedStemmed)
        count += 1
    saveFile(dest + fn + ".txt", "".join(text + "\n" for text in kept))

In [41]:
def processDataMain(directory, dest, ctype):
    """Dispatch to the tweet or article cleaner according to ``ctype``.

    Parameters
    ----------
    directory : str
        For ``ctype == "twitter"``: path of the tweets CSV file.
        Otherwise: folder holding the raw ``*.txt`` articles; its last path
        component must be a key of ``fnConfig`` (e.g. ".../golf/").
    dest : str
        Output path prefix passed through to the cleaner.
    ctype : str
        "twitter", or a source tag (e.g. "cc") that becomes the first part
        of the output file-name prefix ``<ctype>_<sport code>_``.
    """
    if ctype == "twitter":
        processTweets(directory, dest)
    else:
        # glob returns files in arbitrary order; sorting makes the output
        # numbering reproducible between runs and machines.
        files = sorted(glob.glob(os.path.join(directory, "*.txt")))
        # Take the folder name whether or not directory ends with a slash.
        sport = os.path.basename(os.path.normpath(directory))
        fn = ctype + "_" + fnConfig[sport] + "_"
        processArticles(files, dest, fn)

In [51]:
# Clean the raw Common Crawl golf articles and write the results under data/cc/.
articleDir, aDest = "new_data/cc/golf/", "data/cc/"
processDataMain(directory=articleDir, dest=aDest, ctype="cc")

Processing File # 140