In [1]:
import "bufio"
import "compress/bzip2"
import "encoding/json"
import "errors"
import "fmt"
import "io"
import "log"
import "os"
import "path"
import "path/filepath"
import "strings"
import "time"

import "github.com/ulikunitz/xz/lzma"
import "gonum.org/v1/gonum/stat"

In [3]:
// timeit calls f repeatedly, records the wall-clock time of every call and
// prints the mean and standard deviation of those timings.
//
// The number of planned calls starts at 1 and is multiplied by scale each time
// it is reached. Sampling stops once a planned batch is complete and more than
// maxDuration/scale has elapsed since the start, or after maxIter calls.
func timeit(f func()) {
    const (
        maxIter     = 1e7
        maxDuration = time.Duration(10e9)
        scale       = 2
    )

    planned := 1
    // Capacity is reserved up front so that growing the batch is a re-slice.
    samples := make([]float64, planned, maxIter)
    began := time.Now()
    for i := 0; i < maxIter; i++ {
        if i >= planned {
            // The current batch is complete: stop if the time budget is
            // spent, otherwise enlarge the batch.
            if time.Since(began) > maxDuration/scale {
                break
            }
            planned *= scale
            if planned > maxIter {
                planned = maxIter
            }
            samples = samples[:planned]
        }
        callStart := time.Now()
        f()
        samples[i] = time.Since(callStart).Seconds()
    }

    // Timings are stored in seconds; convert back to nanoseconds for printing.
    avg := time.Duration(stat.Mean(samples, nil) * 1e9)
    sd := time.Duration(stat.StdDev(samples, nil) * 1e9)
    fmt.Printf("%s ± %s per loop (mean ± std. dev. of %d loops)\n", avg, sd, len(samples))
}

# Load data file

In [None]:
package redditngram

import (
	"bufio"
	"compress/bzip2"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"os"
	"path"
	"path/filepath"
	"time"

	"github.com/ulikunitz/xz/lzma"
)

// Layout used to parse "YYYY-MM" dates, and the file name patterns of the
// monthly comment dumps (formatted with year, month).
const (
	dateFormat = "2006-01"
	bz2Format  = "RC_%04d-%02d.bz2"
	xzFormat   = "RC_%04d-%02d.xz"
)

// dataStartDate is the earliest month accepted by validateRedditCommentsDate;
// xzStartDate is the month compared against when choosing a file name format.
var (
	dataStartDate = time.Date(2005, time.December, 1, 0, 0, 0, 0, time.UTC)
	xzStartDate   = time.Date(2017, time.December, 1, 0, 0, 0, 0, time.UTC)
)

// RedditDataPath is the data root resolved once at package initialisation;
// RedditCommentsPath is its "comments" sub-directory.
var (
	RedditDataPath     = getRedditDataPath()
	RedditCommentsPath = path.Join(RedditDataPath, "comments")
)

// Comment holds the fields of one comment record that are decoded from a JSON
// line of a dump file. JSON keys without a matching tag are ignored by
// encoding/json.
type Comment struct {
	Body        string `json:"body"`
	Subreddit   string `json:"subreddit"`
	SubredditId string `json:"subreddit_id"`
	CreatedUtc  int    `json:"created_utc"`
	Score       int    `json:"score"`
	ParentId    string `json:"parent_id"`
	// Reddit names this key "link_id"; the former "linked_id" tag never
	// matched, so the field was always left empty.
	LinkedId string `json:"link_id"`
}

// LoadRedditCommentsJson streams the comments stored in the monthly dump for
// the given year and month.
//
// The dump path comes from GetRedditCommentsLocal. The file is opened and
// wrapped in a decompressor (chosen by file extension) before the function
// returns, so a missing file, an unsupported extension or a bad compression
// header is reported through the returned error instead of terminating the
// process from the background goroutine.
//
// Every line of the decompressed stream is decoded as one Comment. Comments
// whose body is exactly "[deleted]" are dropped, and lines that are not valid
// JSON are logged and skipped. The returned channel is unbuffered and is closed
// when the input ends or a read error occurs (the error is logged). The
// producing goroutine blocks on each send until the receiver takes the value.
func LoadRedditCommentsJson(year, month int) (<-chan Comment, error) {
	// Upper bound for a single JSON line; bufio.Scanner's default of 64 KiB
	// would abort the scan on a longer record.
	const maxLineBytes = 16 * 1024 * 1024

	datapath, err := GetRedditCommentsLocal(year, month)
	if err != nil {
		return nil, err
	}

	fh, err := os.Open(datapath)
	if err != nil {
		return nil, err
	}

	var reader io.Reader
	switch filepath.Ext(datapath) {
	case ".xz":
		// NOTE(review): the lzma package documents support for the classic
		// LZMA stream format; confirm it can read these .xz dumps or whether
		// the module's xz reader is required.
		reader, err = lzma.NewReader(fh)
		if err != nil {
			fh.Close()
			return nil, fmt.Errorf("opening %s: %v", datapath, err)
		}
	case ".bz2":
		reader = bzip2.NewReader(fh)
	default:
		fh.Close()
		return nil, fmt.Errorf("only .xz and .bz2 are supported, given: %s", datapath)
	}

	comments := make(chan Comment)
	go func() {
		defer close(comments)
		defer fh.Close()

		scanner := bufio.NewScanner(reader)
		scanner.Buffer(make([]byte, 0, 64*1024), maxLineBytes)
		for scanner.Scan() {
			// A fresh value per line keeps fields of the previous comment
			// from leaking into records that omit them.
			var comment Comment
			if err := json.Unmarshal(scanner.Bytes(), &comment); err != nil {
				// On a type mismatch encoding/json still decodes the
				// remaining fields, so the comment stays usable; any other
				// error means the line could not be decoded at all.
				if _, partial := err.(*json.UnmarshalTypeError); !partial {
					log.Printf("skipping undecodable line in %s: %v", datapath, err)
					continue
				}
			}
			if comment.Body != "[deleted]" {
				comments <- comment
			}
		}
		if err := scanner.Err(); err != nil {
			log.Printf("reading %s: %v", datapath, err)
		}
	}()
	return comments, nil
}

// GetRedditCommentsLocal returns the local path of the comment dump for the
// given year and month.
//
// An error is returned when year/month do not form a parseable "YYYY-MM" date
// (for example month 13) or when the date is rejected by
// validateRedditCommentsDate. The function only builds the path; it does not
// check that the file exists.
func GetRedditCommentsLocal(year, month int) (datapath string, err error) {
	date, err := time.Parse(dateFormat, fmt.Sprintf("%04d-%02d", year, month))
	if err != nil {
		// Previously the parse error was discarded and the zero time merely
		// happened to fail the range check.
		return "", fmt.Errorf("invalid year/month %d-%d: %v", year, month, err)
	}
	if !validateRedditCommentsDate(date) {
		return "", errors.New("date out of range.")
	}
	return filepath.Join(RedditCommentsPath, getRedditCommentsFilename(date)), nil
}

// validateRedditCommentsDate reports whether date lies between dataStartDate
// and one month before the current time, both bounds included.
func validateRedditCommentsDate(date time.Time) (is_valid bool) {
	latest := time.Now().AddDate(0, -1, 0)
	return !(date.Before(dataStartDate) || date.After(latest))
}

func getRedditCommentsFilename(date time.Time) (filename string) {
	format := bz2Format
	if date.Before(xzStartDate) {
		format = bz2Format
	}
	filename = fmt.Sprintf(format, date.Year(), date.Month())
	return
}

// getRedditDataPath returns the root directory of the Reddit data.
//
// The REDDIT_DATA environment variable wins when it is set and non-empty.
// Otherwise the default is the "reddit" directory inside the user's home
// directory. Go does not expand "~" in paths, so the home directory is
// resolved explicitly; the literal "~/reddit" is kept only as a fallback when
// the home directory cannot be determined.
func getRedditDataPath() string {
	if datapath := os.Getenv("REDDIT_DATA"); datapath != "" {
		return datapath
	}
	home, err := os.UserHomeDir()
	if err != nil {
		return "~/reddit"
	}
	return filepath.Join(home, "reddit")
}

# Generate N-grams

In [14]:
a%5 == 0

true

In [15]:
// CountRedditCommentsUptoNgramStrs counts the n-gram strings of orders 1 to
// order over the comments of the given month.
//
// The result has one map per order: index i maps each (i+1)-gram string to the
// number of times it was received. The running number of processed comments is
// printed after every 1000 comments. An error from
// GenerateRedditCommentsUptoNgramStrs is returned unchanged.
func CountRedditCommentsUptoNgramStrs(year, month, order int) ([]map[string]int, error) {
    perComment, err := GenerateRedditCommentsUptoNgramStrs(year, month, order)
    if err != nil {
        return nil, err
    }

    counts := make([]map[string]int, order)
    for n := range counts {
        counts[n] = map[string]int{}
    }

    processed := 0
    for channels := range perComment {
        // Each per-order channel is drained completely before the next one.
        for n, grams := range channels {
            table := counts[n]
            for gram := range grams {
                table[gram]++
            }
        }
        processed++
        if processed%1000 == 0 {
            fmt.Println(processed)
        }
    }
    return counts, nil
}


// GenerateRedditCommentsUptoNgramStrs streams, for every comment of the given
// month, a slice of order channels; element i of the slice yields that
// comment's (i+1)-gram strings as produced by ExtractFilteredUptoNgramStrs
// with MaxRedditTokenLength as the token length limit.
//
// The outer channel is buffered with capacity order and is closed once all
// comments have been processed. An error from LoadRedditCommentsJson is
// returned unchanged.
func GenerateRedditCommentsUptoNgramStrs(year, month, order int) (<-chan []<-chan string, error) {
    comments, err := LoadRedditCommentsJson(year, month)
    if err != nil {
        return nil, err
    }

    uptoNgramStrss := make(chan []<-chan string, order)
    go func() {
        defer close(uptoNgramStrss)
        for comment := range comments {
            // The exported field is Body; comment.body does not exist.
            tokens := String2Tokens(comment.Body)
            uptoNgramStrss <- ExtractFilteredUptoNgramStrs(tokens, order, MaxRedditTokenLength)
        }
    }()
    return uptoNgramStrss, nil
}


// GenerateRedditCommentsNgramStrs streams the n-gram strings of the given
// order for every comment of the given month, one comment after another, on a
// single unbuffered channel. N-grams are produced by ExtractFilteredNgramStrs
// with MaxRedditTokenLength as the token length limit.
//
// The channel is closed once all comments have been processed. An error from
// LoadRedditCommentsJson is returned unchanged.
func GenerateRedditCommentsNgramStrs(year, month, order int) (<-chan string, error) {
    comments, err := LoadRedditCommentsJson(year, month)
    if err != nil {
        return nil, err
    }

    ngramStrs := make(chan string)
    go func() {
        defer close(ngramStrs)
        for comment := range comments {
            // The exported field is Body; comment.body does not exist.
            tokens := String2Tokens(comment.Body)
            for ngramStr := range ExtractFilteredNgramStrs(tokens, order, MaxRedditTokenLength) {
                ngramStrs <- ngramStr
            }
        }
    }()
    return ngramStrs, nil
}


// ExtractFilteredUptoNgramStrs returns order channels for tokens: the channel
// at index i is the result of ExtractFilteredNgramStrs for n-grams of size
// i+1 with the same maxTokLen.
func ExtractFilteredUptoNgramStrs(tokens []string, order int, maxTokLen int) []<-chan string {
    channels := make([]<-chan string, order)
    for idx := range channels {
        channels[idx] = ExtractFilteredNgramStrs(tokens, idx+1, maxTokLen)
    }
    return channels
}


// ExtractFilteredNgramStrs returns an unbuffered channel yielding the n-grams
// of tokens (size order, from ExtractNgrams) joined into strings with
// Tokens2String. N-grams for which HasLongToken reports true under maxTokLen
// are left out. The channel is closed after the last n-gram.
func ExtractFilteredNgramStrs(tokens []string, order int, maxTokLen int) <-chan string {
    source := ExtractNgrams(tokens, order)
    out := make(chan string)
    go func() {
        defer close(out)
        for gram := range source {
            if HasLongToken(gram, maxTokLen) {
                continue
            }
            out <- Tokens2String(gram)
        }
    }()
    return out
}


// ExtractNgrams returns an unbuffered channel yielding every contiguous run of
// order tokens, in order of their start position. The channel is closed after
// the last n-gram.
//
// When order is less than 1, or tokens holds fewer than order elements, the
// returned channel is already closed and yields nothing. A negative order
// previously caused an out-of-range slice panic inside the goroutine.
//
// Each n-gram is a sub-slice of tokens, not a copy. Its capacity is limited to
// order so that appending to an n-gram cannot overwrite the caller's tokens.
func ExtractNgrams(tokens []string, order int) <-chan []string {
    ngrams := make(chan []string)
    length := len(tokens) - order + 1
    if order < 1 || length < 1 {
        close(ngrams)
        return ngrams
    }

    go func() {
        defer close(ngrams)
        for i := 0; i < length; i++ {
            ngrams <- tokens[i : i+order : i+order]
        }
    }()
    return ngrams
}

// HasLongToken reports whether any token is longer than maxTokLen. Length is
// measured with len, i.e. in bytes.
func HasLongToken(tokens []string, maxTokLen int) bool {
    for i := range tokens {
        if maxTokLen < len(tokens[i]) {
            return true
        }
    }
    return false
}

// String2Tokens splits text into tokens with strings.Fields, i.e. around runs
// of white space; the result contains no empty tokens.
func String2Tokens(text string) []string {
    tokens := strings.Fields(text)
    return tokens
}

// Tokens2String joins tokens into one string with a single space between
// neighbouring tokens.
func Tokens2String(tokens []string) string {
    const sep = " "
    return strings.Join(tokens, sep)
}

In [7]:
// Fixture: the text "asdf " repeated 1000 times and the tokens it splits into.
testString := strings.Repeat("asdf ", 1000)
testTokens := String2Tokens(testString)

In [8]:
// One channel per n-gram order 1..5 over the fixture tokens, with a token
// length limit of 25. Nothing in this cell reads from the channels.
cs := ExtractFilteredUptoNgramStrs(testTokens, 5, 25) 

In [11]:
// Channel of 5-gram strings for 2005-12. The error result is discarded, so ch
// is nil if loading failed.
ch, _ := GenerateRedditCommentsNgramStrs(2005, 12, 5)

In [None]:
// Count the 1- to 5-gram strings of 2006-12; the numbers printed below are the
// progress lines emitted every 1000 comments.
counts, err := CountRedditCommentsUptoNgramStrs(2006, 12, 5)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000


In [14]:
len(counts[4])

45036

In [84]:
f := CreateExtractNgramStrsFunc(5, 25)

In [159]:
elapsed > time.Duration(1)

true

In [86]:
// Manual benchmark: call f on the fixture string 1000 times and print the
// average wall-clock time per call.
start := time.Now()
for x := 0; x < 1000; x++ {
    f(testString)
}
t := time.Now()
elapsed := t.Sub(start)
// Dividing the total duration by the iteration count gives the per-call mean.
print(elapsed/1000)

1.556243ms