In [1]:
import csv
from unicodedata import category
import re
import string
import random
import math

# ----------------------------------------------------------------------
#  Datasets functions
# ----------------------------------------------------------------------
def delete_strings_symbols(s):
    NoSpecialChars = s.translate ({ord(c): (" "+ c +" ") for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"})
    return NoSpecialChars 

def remove_symbols_in_string(text,newsign=''):
    signtext = string.punctuation + newsign 
    signrepl = '@'*len(signtext) 
    signtable = str.maketrans(signtext,signrepl) 
    return text.translate(signtable).replace('@','') 

def read_tsv_by_line(fileName):
    '''Read .tsv files line by line and return the lines as a list''' 
    '''The return lines' element are a list which contains a string '''
    lines = []
    with open(fileName,'r', encoding="utf8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\n')
        for row in reader:
            lines.append(row)
    return lines

def split_line_by_tab(tsvLines):
    '''Split the class in the training datasets'''
    '''Return the lines which ele[0] is CLASS, ele[1] is SMS_MESSAGES '''
    newLines = []
    EMPTY = ""
    for ele in tsvLines:
        if len(ele) == 1:
            splitEle = ele[0].split("\t")
            newLines.append(splitEle)
        else:
            splitEle = ele.split("\t")
            newLines.append(splitEle)
    return newLines

def remove_symbols_by_line(Lines):
    noSymbolLines = []
    for ele in Lines:
        ele[1] = delete_strings_symbols(ele[1])
        noSymbolLines.append(ele)
    return noSymbolLines

def update_dictionary_by_line(line,dictionary):
    '''Update the dictionary by line'''
    splitedLine =line.split(" ")
    for w in splitedLine:
        if w in dictionary.keys():
            dictionary[w] += 1
        else:
            dictionary[w] = 1
    return dictionary

def get_whole_file_dictionary(lines):
    dictionary = {}
    for ele in lines:
        dictionary = update_dictionary_by_line(ele[1],dictionary)
    return dictionary

def read_file_and_return_each_line_and_dictionary(fileName):
    '''Read a file and return the dictionary of the whole file'''
    # read file and return the context by line
    rawLines = read_tsv_by_line(fileName)
    # delete tab in each line
    splitedLines = split_line_by_tab(rawLines)
    # delete symbols in each line
    splitedLinesNoSymbol = remove_symbols_by_line(splitedLines)
    # get the dictionary
    dictionary = {}
    dictionary = get_whole_file_dictionary(splitedLinesNoSymbol)
    return splitedLinesNoSymbol, dictionary

def each_line_to_vector(line,dictKeysList):
    '''Return the FULL format bag-of-words vector for a line'''
    line = line.split(" ")
    dictLength = len(dictKeysList)
    vectorTemplate = [0] * dictLength
    lineVector = vectorTemplate
    for w in line:
        if w in dictKeysList: 
            INDEX = dictKeysList.index(w)
            if INDEX:
                lineVector[INDEX] += 1
    return lineVector

def lines_to_vectors_Training(lines,dictionary):
    linesVectors = []
    listDictionary = list(dictionary.keys())
    for ele in lines:
        theVector = each_line_to_vector(ele[1],listDictionary) 
        if ele[0] == "ham":
            theVector.append(0)
        else:
            theVector.append(1)
        linesVectors.append(theVector)
    return linesVectors

def write_result_in_csv(predictions,csvfile):
    with open(csvfile, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        count = 1 
        for val in predictions:
            if val == 0:
                writer.writerow([count,"ham"])
                count += 1
            else:
                writer.writerow([count,"spam"])
                count += 1
                
# ----------------------------------------------------------------------
# NBC functions
# ----------------------------------------------------------------------

def split_dataset(dataset, splitRatio):
    '''Split the trianing dataset by the splitRatio'''
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
	    index = random.randrange(len(copy))
	    trainSet.append(copy.pop(index))
    return [trainSet, copy]

def separate_by_class(dataset):
	separated = {}
	for i in range(len(dataset)):
		vector = dataset[i]
		if (vector[-1] not in separated):
			separated[vector[-1]] = []
		separated[vector[-1]].append(vector)
	return separated

def mean(numbers):
	return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
	return math.sqrt(variance)

def summarize(dataset):
	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
	del summaries[-1]
	return summaries

def summarize_by_class(dataset):
	separated = separate_by_class(dataset)
	summaries = {}
	for classValue, instances in separated.items():
		summaries[classValue] = summarize(instances)
	return summaries

def calculate_probability(x, mean, stdev):
    if mean == 0 or stdev == 0:
        return 1
    else:
        exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
        return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateProbability_Bak(x, mean, stdev):
	exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
	return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculate_class_probabilities(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.items():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities[classValue] *= calculate_probability(x, mean, stdev)
	return probabilities

def predict(summaries, inputVector):
	probabilities = calculate_class_probabilities(summaries, inputVector)
	bestLabel, bestProb = None, -1
	for classValue, probability in probabilities.items():
		if bestLabel is None or probability > bestProb:
			bestProb = probability
			bestLabel = classValue
	return bestLabel

def get_predictions(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict(summaries, testSet[i])
		predictions.append(result)
	return predictions

def get_accuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x][-1] == predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

def sms_dataset(splitRatio = 0.67):
    trainFileLines,trainFileDict = read_file_and_return_each_line_and_dictionary("sms_train.tsv")
    print("The train lines acount is: %d" %len(trainFileLines))
    print("The train dictionary words acount is: %d" %len(trainFileDict))
    trainVectors = lines_to_vectors_Training(trainFileLines,trainFileDict)
    trainTrainingSet, trainTestSet = split_dataset(trainVectors,splitRatio) 
    summaries = summarize_by_class(trainTrainingSet)
    print("The train class ham summaries acount is: %d" %len(summaries[0]))
    print("The train class spam summaries acount is: %d" %len(summaries[1]))
    trainPredictions = get_predictions(summaries, trainTestSet)
    trainAccuracy = get_accuracy(trainTestSet, trainPredictions)
    print("The Accuracy is: %f" %trainAccuracy)
    testFileLines,testFileDict = read_file_and_return_each_line_and_dictionary("sms_test.tsv")
    testVector = lines_to_vectors_Training(testFileLines, trainFileDict) 
    newSummaries = summarize_by_class(trainVectors)
    testPredictions = get_predictions(newSummaries,testVector)
    csvFileName = "XUE_sms_notSparse.csv"
    write_result_in_csv(testPredictions,csvFileName) 
    

#-------------------------------------------------------------------------------------------
# Vector Sparse Format
#-------------------------------------------------------------------------------------------

def each_line_to_vector_sparseFormat(line,dictKeysList):
    '''Return the FULL format bag-of-words vector for a line'''
    line = line.split(" ")
    dictLength = len(dictKeysList)
    vectorTemplate = {} # using dictionary
    lineVector = vectorTemplate
    for w in line:
        if w in dictKeysList: 
            INDEX = dictKeysList.index(w)
            if INDEX in lineVector.keys():
                lineVector[INDEX] += 1
            else:
                lineVector[INDEX] = 1
    return lineVector
                

def lines_to_vectors_Training_sparseFormat(lines,dictionary):
    linesVectors = []
    listDictionary = list(dictionary.keys())
    for ele in lines:
        theVector = each_line_to_vector_sparseFormat(ele[1],listDictionary) 
        if ele[0] == "ham":
            theVector = [theVector, 0]
        else:
            theVector = [theVector, 1]
        linesVectors.append(theVector)
    return linesVectors


def mean_by_postions(posList,Count):
    mean = sum(posList)/float(Count)
    return mean
 

def stdev_by_positions(posList,Count):
    avg =mean_by_postions(posList,Count)
    Zeros = pow(0-avg,2) * (Count-len(posList))
    NotZeros = sum([pow(x-avg,2) for x in posList])
    variance = (Zeros + NotZeros)/float(Count)
    return math.sqrt(variance)
    

def summarize_sparseFormat_by_class(dataset):
    vectorCountByClass = len(dataset)
    summaries_by_class = {}
    for vector in dataset:
        sparseVector = vector[0]
        keysSparseVector = sparseVector.keys()
        for pos in keysSparseVector:
            if pos in summaries_by_class.keys():
                summaries_by_class[pos].append(sparseVector[pos])
            else:
                summaries_by_class[pos] = [sparseVector[pos]]
    stats_by_class = {}
    for pos in summaries_by_class.keys():
        thePosMean = mean_by_postions(summaries_by_class[pos],vectorCountByClass)
        thePosStdev = stdev_by_positions(summaries_by_class[pos],vectorCountByClass)
        stats_by_class[pos] = (thePosMean,thePosStdev)
    return stats_by_class

def summarize_by_class_sparseFormat(dataset):
	separated = separate_by_class(dataset)
	summaries = {}
	for classValue, instances in separated.items():
		summaries[classValue] = summarize_sparseFormat_by_class(instances)
	return summaries

def calculate_probability_sparseFormat(x, Mean, Stdev):
    if Mean == 0 or Stdev == 0:
        return 1
    else:
        exponent = math.exp(-(math.pow(x-Mean,2)/(2*math.pow(Stdev,2))))
        return (1 / (math.sqrt(2*math.pi) * Stdev)) * exponent
    
def calculate_class_probabilities_sparseFormat(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in classSummaries.keys():
            mean, stdev = classSummaries[i]
            if i in inputVector[0].keys():
                x = inputVector[0][i]
                probabilities[classValue] *= calculate_probability_sparseFormat(x, mean, stdev)
            else:
                x = 0
                probabilities[classValue] *= calculate_probability_sparseFormat(x, mean, stdev)
                
    return probabilities

def predict_sparseFormat(summaries, inputVector):
	probabilities = calculate_class_probabilities_sparseFormat(summaries, inputVector)
	bestLabel, bestProb = None, -1
	for classValue, probability in probabilities.items():
		if bestLabel is None or probability > bestProb:
			bestProb = probability
			bestLabel = classValue
	return bestLabel

def get_predictions_sparseFormat(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict_sparseFormat(summaries, testSet[i])
		predictions.append(result)
	return predictions

def pre_accuracy(trainPredictions):
    contain = []
    for i in trainPredictions:
        if i == 0:
            contain.append(1)
        else:
            contain.append(0)
    return contain

def sms_dataset_sparseFormat(splitRatio = 0.67):
    trainLine, trainDict = read_file_and_return_each_line_and_dictionary("sms_train.tsv")
    print("The train lines acount is: %d" %len(trainLine))
    print("The train dictionary words acount is: %d" %len(trainDict))
    linesVectors = lines_to_vectors_Training_sparseFormat(trainLine,trainDict)
    trainVectors, testVectors = split_dataset(linesVectors,splitRatio)
    summaries= summarize_by_class_sparseFormat(trainVectors)
    print("The train class ham summaries acount is: %d" %len(summaries[0]))
    print("The train class spam summaries acount is: %d" %len(summaries[1]))
    trainPredictions = get_predictions_sparseFormat(summaries, testVectors)
    trainAccuracy = get_accuracy(testVectors, trainPredictions)
    print("The Accuracy is: %f" %trainAccuracy)
    testFileLines,testFileDict = read_file_and_return_each_line_and_dictionary("sms_test.tsv")
    testVectors = lines_to_vectors_Training_sparseFormat(testFileLines, trainDict) 
    newSummaries = summarize_by_class_sparseFormat(linesVectors)
    testPredictions = get_predictions_sparseFormat(newSummaries,testVectors)
    #csvFileName = "SMS_results_sparseFormat.csv"
    csvFileName = "XUE_sms.csv"
    write_result_in_csv(testPredictions,csvFileName) 


def read_net_file_return_lines_and_dictionary(fileName):
    returnLine = []
    dictionary = {}
    fileLines = read_tsv_by_line(fileName)
    for line in fileLines:
        finalLine = []
        commaSplitedLine = line[0].split(",")
        if commaSplitedLine[0] == "ham":
            classFlag = "ham"
        else:
            classFlag = "spam"
        del commaSplitedLine[0]
        ipSplited = commaSplitedLine[0].split(".")
        mailSplited = commaSplitedLine[1].split("@")
        domainSplited = mailSplited[1].split(".")
        if "@" in commaSplitedLine[2]:
            mailSplited_one = commaSplitedLine[2].split("@")
            mailSplited_one[1] = mailSplited_one[1].split(">")
            domainSplited_one = mailSplited_one[1][0].split(".")
        else:
            domainSplited_one = commaSplitedLine[2]
        theStr = ""
        for ele in ipSplited:
            if ele in dictionary.keys():
                dictionary[ele] += 1
            else:
                dictionary[ele] = 1
            theStr = theStr + ele + " "
        for ele in domainSplited:
            if ele in dictionary.keys():
                dictionary[ele] += 1
            else:
                dictionary[ele] = 1
            theStr = theStr + ele + " "
        for ele in domainSplited_one:
            if ele in dictionary.keys():
                dictionary[ele] += 1
            else:
                dictionary[ele] = 1
            theStr = theStr + ele + " "
        finalLine = [classFlag, theStr]
        returnLine.append(finalLine)
    return returnLine, dictionary

def net_dataset_sparseFormat(splitRatio = 0.67):
    trainLine, trainDict = read_net_file_return_lines_and_dictionary("net_train.csv")
    print("The train lines acount is: %d" %len(trainLine))
    print("The train dictionary words acount is: %d" %len(trainDict))
    linesVectors = lines_to_vectors_Training_sparseFormat(trainLine,trainDict)
    trainVectors, testVectors = split_dataset(linesVectors,splitRatio)
    summaries= summarize_by_class_sparseFormat(trainVectors)
    print("The train class ham summaries acount is: %d" %len(summaries[0]))
    print("The train class spam summaries acount is: %d" %len(summaries[1]))
    trainPredictions = get_predictions_sparseFormat(summaries, testVectors)
    #trainPredictions = pre_accuracy(trainPredictions)
    trainAccuracy = get_accuracy(testVectors, trainPredictions)
    print("The Accuracy is: %f" %trainAccuracy)
    testFileLines,testFileDict = read_net_file_return_lines_and_dictionary("net_test.csv")
    testVectors = lines_to_vectors_Training_sparseFormat(testFileLines, trainDict) 
    newSummaries = summarize_by_class_sparseFormat(linesVectors)
    testPredictions = get_predictions_sparseFormat(newSummaries,testVectors)
    #testPredictions = pre_accuracy(testPredictions)
    csvFileName = "XUE_net.csv"
    write_result_in_csv(testPredictions,csvFileName) 


def net_dataset(splitRatio = 0.67):
    trainLine, trainDict = read_net_file_return_lines_and_dictionary("net_train.csv")
    print("The train lines acount is: %d" %len(trainLine))
    print("The train dictionary words acount is: %d" %len(trainDict))
    linesVectors = lines_to_vectors_Training(trainLine,trainDict)
    trainVectors, testVectors = split_dataset(linesVectors,splitRatio)
    summaries= summarize_by_class(trainVectors)
    print("The train class ham summaries acount is: %d" %len(summaries[0]))
    print("The train class spam summaries acount is: %d" %len(summaries[1]))
    trainPredictions = get_predictions(summaries, testVectors)
    trainAccuracy = get_accuracy(testVectors, trainPredictions)
    print("The Accuracy is: %f" %trainAccuracy)
    testFileLines,testFileDict = read_net_file_return_lines_and_dictionary("net_test.csv")
    testVectors = lines_to_vectors_Training(testFileLines, trainDict) 
    newSummaries = summarize_by_class(linesVectors)
    testPredictions = get_predictions(newSummaries,testVectors)
    csvFileName = "XUE_net_notSparse.csv"
    write_result_in_csv(testPredictions,csvFileName) 


In [4]:
sms_dataset_sparseFormat()

The train lines acount is: 5074
The train dictionary words acount is: 10781
The train class ham summaries acount is: 6794
The train class spam summaries acount is: 2993
The Accuracy is: 86.149254


In [5]:
sms_dataset()

The train lines acount is: 5074
The train dictionary words acount is: 10781
The train class ham summaries acount is: 10781
The train class spam summaries acount is: 10781
The Accuracy is: 90.388060


In [2]:
net_dataset_sparseFormat()

The train lines acount is: 13000
The train dictionary words acount is: 12244
The train class ham summaries acount is: 7071
The train class spam summaries acount is: 3771
The Accuracy is: 70.582751


In [2]:
net_dataset()

The train lines acount is: 13000
The train dictionary words acount is: 12244
The train class ham summaries acount is: 12244
The train class spam summaries acount is: 12244
The Accuracy is: 71.235431


In [42]:
trainLine, trainDict = read_net_file_return_lines_and_dictionary("net_train.csv")

In [43]:
trainLine

[['spam', '88 254 149 243 oxford gov uk localhost localdomain '],
 ['ham', '87 74 17 32 conferencesandreports com conferencesandreports com '],
 ['spam', '222 35 64 22 ipt ch m23 mailyes net '],
 ['ham', '59 95 246 177 ciionline org xra '],
 ['ham', '86 40 222 31 playtex com sgss '],
 ['ham', '222 146 6 125 lille iufm fr Jamel '],
 ['ham', '124 57 160 56 tom com localhost localdomain '],
 ['ham', '131 112 222 61 budgetware com budgetware com '],
 ['ham', '61 95 205 35 elecda cl aghjx '],
 ['ham', '69 147 83 53 freebsd org cederstrand dk '],
 ['spam', '201 17 10 187 webinit ch afms com '],
 ['ham', '62 146 110 162 jfnet2 jfnet2 de jfnet2 jfnet2 de '],
 ['ham', '66 179 20 189 list wimaxforum org ILEXC3U01 ndc lucent com '],
 ['ham', '82 76 136 224 mail2suzie com cellularnext com '],
 ['ham', '59 106 116 159 smail science-t com oem63 blayn com '],
 ['spam', '58 9 31 216 chez-yvette com isd '],
 ['ham', '210 150 254 124 kymp fi hcluji '],
 ['spam', '74 77 7 207 sunvice east sun com airtel 

In [44]:
trainDict

{'88': 345,
 '254': 177,
 '149': 116,
 '243': 114,
 'oxford': 1,
 'gov': 45,
 'uk': 311,
 'localhost': 1550,
 'localdomain': 1529,
 '87': 328,
 '74': 183,
 '17': 272,
 '32': 668,
 'conferencesandreports': 46,
 'com': 11633,
 '222': 1023,
 '35': 155,
 '64': 250,
 '22': 194,
 'ipt': 4,
 'ch': 97,
 'm23': 80,
 'mailyes': 691,
 'net': 3023,
 '59': 457,
 '95': 127,
 '246': 147,
 '177': 143,
 'ciionline': 1,
 'org': 1743,
 'xra': 4,
 '86': 141,
 '40': 114,
 '31': 107,
 'playtex': 1,
 'sgss': 1,
 '146': 683,
 '6': 798,
 '125': 1237,
 'lille': 21,
 'iufm': 21,
 'fr': 181,
 'Jamel': 18,
 '124': 503,
 '57': 267,
 '160': 285,
 '56': 122,
 'tom': 155,
 '131': 214,
 '112': 180,
 '61': 407,
 'budgetware': 6,
 '205': 184,
 'elecda': 1,
 'cl': 31,
 'aghjx': 1,
 '69': 161,
 '147': 157,
 '83': 308,
 '53': 285,
 'freebsd': 36,
 'cederstrand': 1,
 'dk': 54,
 '201': 486,
 '10': 152,
 '187': 628,
 'webinit': 1,
 'afms': 2,
 '62': 199,
 '110': 244,
 '162': 209,
 'jfnet2': 4,
 'de': 414,
 '66': 369,
 '179': 2

In [45]:
linesVectors = lines_to_vectors_Training_sparseFormat(trainLine,trainDict)

In [30]:
linesVectors

[[{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 1206: 1}, 1],
 [{9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 1206: 1}, 0],
 [{15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 1206: 1}, 1],
 [{24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 1206: 1}, 0],
 [{14: 1, 15: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 1206: 1}, 0],
 [{15: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 1206: 1}, 0],
 [{7: 1, 8: 1, 14: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 1206: 1}, 0],
 [{14: 2, 15: 1, 48: 1, 49: 1, 50: 1, 51: 2, 1206: 1}, 0],
 [{16: 1, 25: 1, 50: 1, 52: 1, 53: 1, 54: 1, 55: 1, 1206: 1}, 0],
 [{29: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 1206: 1}, 0],
 [{11: 1, 14: 1, 20: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 1206: 1}, 1],
 [{36: 1, 68: 1, 69: 1, 70: 1, 71: 4, 72: 2, 1206: 1}, 0],
 [{14: 1,
   29: 1,
   73: 1,
   74: 1,
   75: 1,
   76: 1,
   77: 1,
   78: 1,
   79: 1,
   80: 1,
   81: 1,
   1206: 1},
  0],
 [{14: 2, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1, 1206

In [46]:
trainVectors, testVectors = split_dataset(linesVectors,splitRatio)

In [47]:
summaries= summarize_by_class_sparseFormat(trainVectors)

In [48]:
summaries

{0: {100: (0.04816482466912266, 0.21411439543784883),
  455: (0.015691090189657524, 0.12427743108995112),
  45: (0.024969300040933277, 0.15603151635614862),
  271: (0.023468413153226907, 0.15138575473701732),
  349: (0.02155819347796425, 0.20184501870155858),
  23: (0.24805566925910766, 0.5566593819603546),
  270: (0.012007095101650975, 0.11016266742371657),
  1206: (1.0019102196752627, 0.04366429589555779),
  203: (0.058807477145586026, 0.23584326639188719),
  11: (0.02169463774048301, 0.14568452365914605),
  27: (0.009687542638832036, 0.0979474050521648),
  57: (0.013235093464319826, 0.11428003222484028),
  2275: (0.0009551098376313276, 0.03089008907124399),
  686: (0.0008186655751125665, 0.02860061820640757),
  43: (0.03697639514258425, 0.1894255259195058),
  44: (0.027015963978714697, 0.16212989134954583),
  196: (0.010369763951425843, 0.10130267492528301),
  18: (0.01623686723973257, 0.12638524985919788),
  173: (0.025378632828489563, 0.1572722411115399),
  14: (0.8739255014326648

In [49]:
trainPredictions = get_predictions_sparseFormat(summaries, testVectors)

In [35]:
trainPredictions

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [38]:
def pre_accuracy(trainPredictions):
    contain = []
    for i in trainPredictions:
        if i == 0:
            contain.append(1)
        else:
            contain.append(0)
    return contain


26

In [39]:
trainAccuracy = get_accuracy(testVectors, contain)
#trainAccuracy = get_accuracy(testVectors, trainPredictions)

In [40]:
print("The Accuracy is: %f" %trainAccuracy)

The Accuracy is: 69.500000


In [None]:
#testFileLines,testFileDict = read_net_file_return_lines_and_dictionary("net_test.csv")

In [None]:
#testVectors = lines_to_vectors_Training_sparseFormat(testFileLines, trainDict) 

In [None]:
#newSummaries = summarize_by_class_sparseFormat(linesVectors)

In [None]:
#testPredictions = get_predictions_sparseFormat(newSummaries,testVectors)

In [None]:
#csvFileName = "XUE_net.csv"
#write_result_in_csv(testPredictions,csvFileName) 

In [None]:
#linesVectors = lines_to_vectors_Training_sparseFormat(trainLine,trainDict)
#trainVectors, testVectors = split_dataset(linesVectors,splitRatio)
#summaries= summarize_by_class_sparseFormat(trainVectors)
#trainPredictions = get_predictions_sparseFormat(summaries, testVectors)
#trainAccuracy = get_accuracy(testVectors, trainPredictions)
#print("The Accuracy is: %f" %trainAccuracy)
#testFileLines,testFileDict = read_net_file_return_lines_and_dictionary("net_test.csv")
#testVectors = lines_to_vectors_Training_sparseFormat(testFileLines, trainDict) 
#newSummaries = summarize_by_class_sparseFormat(linesVectors)
#testPredictions = get_predictions_sparseFormat(newSummaries,testVectors)
#csvFileName = "XUE_net.csv"
#write_result_in_csv(testPredictions,csvFileName) 