In [121]:
import pandas as pd
import pickle
from threading import Thread
import os, math

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
stemmer = SnowballStemmer("english")
cachedStopWords = stopwords.words("english")

In [122]:
##methods
            
def save_it_all(obj, filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    
def load_objects(file):
    with open(file, 'rb') as input:
        return pickle.load(input)
    
def vocabDict(vocab, doc, n): # handles empty word sets
    nk = 0
    # n = the number of word postions for this document type
    probability = math.log(nk + 1) - math.log(n + len(vocab))

    newDict = {"count": 0, "probability":probability}
    vdict = dict((el,newDict) for el in vocab)
    return vdict

def saveFrame(df, name):
    df.to_csv(name+".csv", index=False, sep=",", header=True)
    save_it_all(df, name+".pkl")
    
def getMaxClass(dictionary):
    result = {"class": None, "max": None, "values": None}
    maxVal = max(dictionary.values())
    for key, value in dictionary.items():
        if maxVal == value:
            result["class"] = key
            result["max"] = value
            result["values"] = dictionary
    return result

# niave bayes caclulation 
def classDict(df):
    classificationDict = {}
    for docType in df["Type"]:
        classificationDict[docType] = None
    return classificationDict

def process(trainedFrame, row, index, classProb, vocab):
    for docType in trainedFrame["Type"]:
        wordProbs = 0
        for word in row["Document"].split():
            if word in vocab:
                #print("Word: "+word + ", Prob: " + str(trainedFrame['WordCount'][trainedFrame["Type"] == docType].tolist()[0][word]["probability"]))
                wordProbs = wordProbs + math.log(trainedFrame['WordCount'][trainedFrame["Type"] == docType].tolist()[0][word]["probability"]) 
            else: # accounts for a new word we havent trained on 
                #print(trainedFrame['WordPositions'][trainedFrame["Type"] == docType].tolist()[0])
                wordProbs = wordProb + math.log((1)/(trainedFrame['WordPositions'][trainedFrame["Type"] == docType].tolist()[0]))
        classProb[docType] = wordProbs * trainedFrame['ClassProbability'][trainedFrame["Type"] == docType].tolist()[0]
    result = getMaxClass(classProb)
    # resultFrame["Predicted"].iloc[indexVal] 
    resultFrame.at[index, 'Predicted'] = result
    
    
    

In [123]:
# # load test data 
# # load in the test data
# fileName = "../forumTest.data"

# # main:
# data = read(fileName)
# sep_data, typesInfo, docsInfo = seperator(data)
# write(sep_data)

# dataFrameInitial = pd.DataFrame({"Type": typesInfo, "Document": docsInfo})

In [124]:
# dataFrameInitial.count()
# saveFrame(dataFrameInitial, "./raw_test_data.csv")

In [125]:
# load files
# load in docs you want to use:

## trained data info data set for use:
trainedFrame = load_objects("./raw_data_info_frame.pkl")

## vocabulary training set for use:
vocabulary = load_objects("./raw_vocabulary.pkl")

# test data
testFrame = pd.read_csv("./raw_test_data.csv", sep=",")

In [126]:
testFrame.head()

Unnamed: 0,Type,Document
0,atheism,re about the bible quiz answers in article hea...
1,atheism,re amusing atheists and agnostics in article t...
2,atheism,re yet more rushdie re islamic law jaeger buph...
3,atheism,re christian morality is in article vice ico t...
4,atheism,re after years can we say that christian moral...


In [127]:
trainedFrame.head()

Unnamed: 0,Type,Document,ClassProbability,WordPositions,WordCount
0,mideastpolitics,re islam borders was israel misisipi to ganges...,0.0499424,272488,"{'hepis': {'count': 0, 'probability': 2.888503..."
1,politics,re media horrified at perot investigating bush...,0.0411759,202625,"{'hepis': {'count': 0, 'probability': 3.618769..."
2,religion,re religion and homosexuality magarret magnus ...,0.0333835,129611,"{'hepis': {'count': 0, 'probability': 4.918282..."
3,mac,re nutek faces apple s wrath article read in a...,0.0511821,97182,"{'hepis': {'count': 0, 'probability': 5.851580..."
4,forsale,motorcycle wanted followup to kedz wpi wpi edu...,0.051802,69761,"{'hepis': {'count': 0, 'probability': 6.969952..."


In [128]:
# create my result frame:
resultFrame = testFrame.copy()
resultFrame["Predicted"] = None
saveFrame(resultFrame, "./raw_result_frame")

In [129]:
# start predicting:
jobs = []
for index, row in testFrame.iterrows():
    unique_id = index
    print(unique_id)
    classProb = classDict(trainedFrame)
    
    thread = Thread(target = process, args = (trainedFrame, row, index, classProb, vocabulary, ))
    jobs.append(thread)
    break
    
# start the model threads
countS = 0
for job in jobs:
    print("Started: " + str(countS))
    countS = countS + 1
    job.start()
# wait for all threads to finish
countE = 0
for job in jobs:
    print("Ended:  " + str(countE))
    countE = countE + 1
    job.join()


0
Started: 0
Ended:  0


In [130]:
resultFrame.at[0,"Predicted"]

{'class': 'graphics',
 'max': 0.0,
 'values': {'mideastpolitics': 0.0,
  'politics': 0.0,
  'religion': 0.0,
  'mac': 0.0,
  'forsale': 0.0,
  'hockey': 0.0,
  'mswindows': 0.0,
  'xwindows': 0.0,
  'medicine': 0.0,
  'electronics': 0.0,
  'space': 0.0,
  'autos': 0.0,
  'motorcycles': 0.0,
  'atheism': 0.0,
  'guns': 0.0,
  'cryptology': 0.0,
  'pc': 0.0,
  'christianity': 0.0,
  'baseball': 0.0,
  'graphics': 0.0}}