In [16]:
import glob
import os
import pandas as pd
from matplotlib import pyplot as pyplot
import json
from sklearn import feature_extraction as fe


First, we import the texts and source site from the files:

In [17]:
dir_path = "../data/pan"
txt_files = glob.glob("problem-*.txt", root_dir = dir_path)
json_files = glob.glob("truth-problem-*.json", root_dir=dir_path)

buffer_dict = []
for (txt_path, json_path) in zip(txt_files,json_files): # read all files and concatenate the relevant content into a dictionary
    txt_path = dir_path + "/" + txt_path
    json_path = dir_path + "/" + json_path
    with open(json_path) as json_f:
        json_dict = json.load(json_f)
        with open(txt_path) as txt_f:
            txt_content = txt_f.read()
            d= {
                "problem_txt": txt_content,
                "problem_site": json_dict["site"]
            }
            buffer_dict.append(d)



raw_df = pd.DataFrame(buffer_dict)
raw_df.sample(10)





Unnamed: 0,problem_txt,problem_site
2489,The authors of the book mentioned that problem...,linguistics
2192,"I'd second TopinFrassi's answer, that this is ...",philosophy
3533,"Yes, because Steve Pearce became the first pla...",astronomy
3192,If you feel any form of discomfort when you st...,philosophy
7975,"which is sometimes used for 26"" -> 700c conver...",mathoverflow.net
5999,"(emphasis mine) Is there a real ""Western novel...",workplace
7057,"I'm writing a story, based vaguely off of the ...",linguistics
4513,The spacing between 9 speed and 10 speed is co...,buddhism
5170,"As for your example, is there a reason that bo...",workplace
4995,"To simplify, if you find this account of B-the...",philosophy


Next, we compute our ngrams with sklearn countVectorizer (ex 1b):

In [18]:
vec = fe.text.CountVectorizer(ngram_range=(1,3))
mat = vec.fit_transform(raw_df["problem_txt"])

An n gram means that we pick out n words at a time for our bag of words model; *Bag of words* refers to a vector notation were we count the occurences for each  possible word (or gram) in our input text. Bag of words notation is very sparse, since most entries are 0 count entries.

Since we obtained the matrix, lets sum up the entries so we can see which ngrams are most common in our dataset:

In [20]:
summed_grams = mat.sum(axis=0)
words_freq = [(word, summed_grams[0,i]) for word, i in vec.vocabulary_.items()] # get an indexed list where each ngram is key and count of that ngram is val
words_freq = sorted(words_freq, key= lambda x: x[1], reverse=True)
print(words_freq[:25])

[('the', 897318), ('to', 489344), ('of', 416040), ('and', 352429), ('is', 332516), ('in', 278562), ('you', 260670), ('that', 259713), ('it', 227093), ('for', 159859), ('be', 150638), ('this', 141588), ('are', 127193), ('as', 124347), ('not', 114423), ('on', 114005), ('with', 108662), ('if', 108404), ('have', 103015), ('or', 99420), ('can', 96762), ('your', 94317), ('but', 93846), ('of the', 88540), ('an', 71277)]
