# Featurizes the code from Gerrit to make the predictions for verdicts

In [1]:
# This is a python program to create feature vectors from the programming language code
# in such a way that every distinct line is indeed distinct

# author: Miroslaw Staron

import re
import pandas as pd
import time

class FeatureMaker:
    """Builds count-based feature vectors for lines of text.

    A "feature" is a string; the value of a feature for a line is the number
    of non-overlapping occurrences of that string in the line (``str.count``).
    """

    def __init__(self):
        """Start with no features and no featurized line."""
        # names of the features, in column order
        self.featureVector = []
        # counts (as strings) of the most recently featurized line; initialised
        # here so featuresToString() is safe to call before featurize()
        self.features = []

    def addNewFeature(self, newFeature):
        """Append a single feature to the feature vector (in place)."""
        self.featureVector.append(newFeature)

    def addNewFeatures(self, newFeatures):
        """Append several features; the feature vector is rebound to a new list."""
        self.featureVector = self.featureVector + list(newFeatures)

    def getFeatureVector(self):
        """Return the current list of features."""
        return self.featureVector

    def featurize(self, line):
        """Count the occurrences of each feature in ``line``.

        The counts are returned (and remembered in ``self.features``) as
        strings, one per feature, in feature-vector order.
        """
        self.features = [str(line.count(feature)) for feature in self.featureVector]
        return self.features

    def featuresToString(self):
        """Return the most recent counts joined with '$'."""
        return '$'.join(str(e) for e in self.features)

    def findNewFeatures(self, lstTokens):
        """Return the tokens that are not features yet.

        Each new token is reported once, in order of first appearance in
        ``lstTokens``, so callers that take the first element get a
        reproducible choice.
        """
        known = set(self.featureVector)
        # dict.fromkeys removes duplicates while keeping the input order
        return [token for token in dict.fromkeys(lstTokens) if token not in known]

# Single-character delimiters used to split a line into tokens. Inside the
# character class every character is literal, so '|' is a delimiter as well.
_TOKEN_DELIMITERS = re.compile(r'[(|",.;)\[\]{} \n\t]')


def tokenizeString(myString):
    """Split a line into a list of tokens; empty strings are removed.

    The line is split at each of the characters
    ``( ) [ ] { } | " , . ;``, space, newline and tab. Tokens are returned in
    order of appearance and may repeat.
    """
    return [token for token in _TOKEN_DELIMITERS.split(myString) if token]

class DataSet:
    """Connects lines of code with their feature signatures.

    Rows are stored in a dictionary keyed by the signature, so at most one
    line is kept per signature.
    """

    def __init__(self):
        # signature (hashable, a '$'-joined string in this file) -> line
        self.dictRows = {}
        # names of the features, used as column names by toCSV()
        self.featureVector = []

    def addFeatureVector(self, lstFeatureVector):
        """Set the list of feature names (the list is stored, not copied)."""
        self.featureVector = lstFeatureVector

    def addNewLine(self, strLine, lstFeatures):
        """Store ``strLine`` under the signature ``lstFeatures``, replacing any previous line."""
        self.dictRows[lstFeatures] = strLine

    def hasLine(self, lstFeatures):
        """Return True if a line is stored under the signature ``lstFeatures``."""
        return lstFeatures in self.dictRows

    def getLine(self, lstFeatures):
        """Return the line stored under ``lstFeatures``; raises KeyError if there is none."""
        return self.dictRows[lstFeatures]

    def toCSV(self, strFilename):
        """Write the data set to ``strFilename`` as a '$'-separated UTF-8 text file.

        The first row is ``line`` followed by the feature names; every other
        row is a line followed by its signature. Newlines, carriage returns,
        tabs and '$' are removed from the line so that it stays in one cell.
        """
        # one pass that deletes the characters which would break the row layout
        dropChars = str.maketrans('', '', '\n$\r\t')
        # the context manager closes the file even if a write fails
        with open(strFilename, 'w', encoding='utf8') as fFile:
            fFile.write('line$' + '$'.join(self.featureVector) + '\n')
            for key, value in self.dictRows.items():
                fFile.write(f'{value.translate(dropChars)}${key}\n')

    def flush(self):
        """Forget all rows and the feature names."""
        self.dictRows = {}
        self.featureVector = []

def featurizeList(lstLines, strOutputFile):
    """Featurize ``lstLines``, growing the feature list by one feature per pass.

    The feature list is seeded with the first token of the first line. Each
    pass featurizes the lines in order; when two different lines end up with
    the same signature, one token of those two lines that is not a feature yet
    is added, the data set is emptied and a new pass starts. The passes stop
    when a complete pass adds no feature.

    Parameters:
        lstLines: sequence of strings to featurize; may be empty.
        strOutputFile: not used by this function; kept so that existing
            callers keep working.

    Returns:
        The DataSet of the last pass. Lines that cannot be told apart from an
        already stored line by any unused token are not stored.

    Progress (pass number, duration of the previous pass, line at which the
    last feature was found) is printed once per pass.
    """
    dtLines = DataSet()
    featurizer = FeatureMaker()

    # An empty input or a first line without tokens simply starts with no
    # features; the first collision then supplies one.
    initialFeatures = tokenizeString(lstLines[0]) if len(lstLines) > 0 else []
    if initialFeatures:
        featurizer.addNewFeature(initialFeatures[0])

    # Register the features up front so that the result has its column names
    # even if the very first pass finds no collision.
    dtLines.addFeatureVector(featurizer.featureVector)

    start_time = time.time()

    foundNewFeature = True
    i = 1
    iLastLine = 0
    while foundNewFeature:
        strTime = f'{(time.time() - start_time):.2f} sec.'
        start_time = time.time()
        foundNewFeature = False
        print(f'Pass number: {i}, time: {strTime}, last processed line: {iLastLine}')
        i += 1
        for iLine, line in enumerate(lstLines, start=1):
            # NOTE(review): featurize() returns the counts as strings, so a
            # guard of the form `count == 0` never skips a line. Lines without
            # any feature are therefore processed like all others: their
            # all-zero signatures collide and lead to new features.
            featurizer.featurize(line)
            strFeatures = featurizer.featuresToString()

            if not dtLines.hasLine(strFeatures):
                dtLines.addNewLine(line, strFeatures)
                continue

            strLine = dtLines.getLine(strFeatures)
            if strLine == line:
                # the same line again, nothing to tell apart
                continue

            newFeatures = featurizer.findNewFeatures(
                tokenizeString(line) + tokenizeString(strLine))
            if newFeatures:
                featurizer.addNewFeature(newFeatures[0])
                foundNewFeature = True
                # all signatures are stale now: start over with the new feature
                dtLines.flush()
                dtLines.addFeatureVector(featurizer.featureVector)
                iLastLine = iLine
                break

    return dtLines

def featurizeListMultipleFeatures(lstLines, strOutputFile):
    """Featurize ``lstLines``, adding all new tokens of a collision at once.

    The feature list is seeded with the distinct tokens of the first line.
    Each pass featurizes the lines in order; when two different lines end up
    with the same signature, every token of those two lines that is not a
    feature yet is added, the data set is emptied and a new pass starts. The
    passes stop when a complete pass adds no feature.

    Parameters:
        lstLines: sequence of strings to featurize; may be empty.
        strOutputFile: not used by this function; kept so that existing
            callers keep working.

    Returns:
        The DataSet of the last pass. Lines that cannot be told apart from an
        already stored line by any unused token are not stored.

    Progress (pass number, duration of the previous pass, line at which the
    last features were found) is printed once per pass.
    """
    dtLines = DataSet()
    featurizer = FeatureMaker()

    initialFeatures = tokenizeString(lstLines[0]) if len(lstLines) > 0 else []
    # a token that occurs twice in the first line must not become two columns
    featurizer.addNewFeatures(list(dict.fromkeys(initialFeatures)))

    # Register the features up front so that the result has its column names
    # even if the very first pass finds no collision.
    dtLines.addFeatureVector(featurizer.featureVector)

    start_time = time.time()

    foundNewFeature = True
    i = 1
    # defined before the first progress message, which reports it
    iLastLine = 0
    while foundNewFeature:
        strTime = f'{(time.time() - start_time)} sec'
        start_time = time.time()
        foundNewFeature = False
        print(f'Pass number: {i}, time: {strTime}, last processed line: {iLastLine}')
        i += 1
        for iLine, line in enumerate(lstLines, start=1):
            # NOTE(review): featurize() returns the counts as strings, so a
            # guard of the form `count == 0` never skips a line. Lines without
            # any feature are therefore processed like all others: their
            # all-zero signatures collide and lead to new features.
            featurizer.featurize(line)
            strFeatures = featurizer.featuresToString()

            if not dtLines.hasLine(strFeatures):
                dtLines.addNewLine(line, strFeatures)
                continue

            strLine = dtLines.getLine(strFeatures)
            if strLine == line:
                # the same line again, nothing to tell apart
                continue

            newFeatures = featurizer.findNewFeatures(
                tokenizeString(line) + tokenizeString(strLine))
            if newFeatures:
                featurizer.addNewFeatures(newFeatures)
                foundNewFeature = True
                # all signatures are stale now: start over with the new features
                dtLines.flush()
                # addNewFeatures() rebinds the list, so it has to be handed over again
                dtLines.addFeatureVector(featurizer.featureVector)
                iLastLine = iLine
                break

    return dtLines

In [2]:

# This is an iterative function to find a set of features for a set of lines.
# In each iteration it goes through all lines and checks which lines are still identical given the feature list;
# then it takes one new token from the identical lines as the next feature and repeats with those lines.
def findFeatureListIterative(lstLines):
    """Find a list of features (tokens) that tells the given lines apart.

    The list starts with the first token found in the lines. In every
    iteration all remaining lines are featurized; lines that share their
    signature with another line are collected, the first token of those lines
    that is not a feature yet is added, and the next iteration continues with
    the collected lines only. The search ends when no lines share a signature
    or when the shared lines contain no unused token.

    Parameters:
        lstLines: sequence of strings.

    Returns:
        The list of features in the order in which they were added; an empty
        list if there are no lines or the lines contain no token.

    One progress line is printed per iteration.
    """
    #print(f'Lines: {len(lstLines)}, features: {len(lstFeatures)}')

    featureList = []
    if len(lstLines) == 0:
        return featureList

    initialFeatures = tokenizeString(' '.join(lstLines))
    if not initialFeatures:
        # nothing but delimiters: there is no token to start from
        return featureList

    featurizer = FeatureMaker()
    featurizer.addNewFeature(initialFeatures[0])
    featureList.append(initialFeatures[0])
    # same content as featureList, for constant-time membership tests
    knownFeatures = {initialFeatures[0]}

    featureAdded = True

    while featureAdded:
        dictLinesUnique = {}
        lstNotUnique = []
        featureAdded = False

        # featurizing all lines in this iteration
        for line in lstLines:
            strFeatures = '$'.join(featurizer.featurize(line))
            if strFeatures in dictLinesUnique:
                # remember both the current line and the one it collides with
                lstNotUnique.append(line)
                lstNotUnique.append(dictLinesUnique[strFeatures])
            else:
                dictLinesUnique[strFeatures] = line

        # remove duplicates but keep the order, so that the token picked
        # below is the same on every run
        lstNotUnique = list(dict.fromkeys(lstNotUnique))

        print(f'Non-unique lines remaining: {len(lstNotUnique)}, features found: {len(featureList)}')

        # start the next iteration if necessary, i.e. if there are lines
        # that the current features cannot tell apart
        if len(lstNotUnique) > 0:
            for oneToken in tokenizeString(' '.join(lstNotUnique)):
                if oneToken not in knownFeatures:
                    knownFeatures.add(oneToken)
                    featureList.append(oneToken)
                    featurizer.addNewFeature(oneToken)
                    featureAdded = True
                    break
            if featureAdded:
                # lines that are already unique stay unique when features are added
                lstLines = lstNotUnique

    # returning the feature list
    return featureList

In [3]:
##
## This is a block where we test the featurizer class and the dataset class
##

strOutputFile = 'output_sf_for_verdicts_gromacs.csv'

# Malformed rows are reported with a warning and skipped. `on_bad_lines`
# (pandas >= 1.3) replaces the error_bad_lines / warn_bad_lines pair, which
# pandas 2.0 no longer accepts.
dfCode = pd.read_csv('./gerrit_reviews_gromacs.csv',
                     sep=';',
                     on_bad_lines='warn',
                     header=0,
                     index_col=False)

# keep every value of the LOC column except missing ones
mLines = [line for line in dfCode['LOC'] if str(line) != 'nan']

print(f'All Lines: {len(mLines)}')

# remove duplicates but keep the file order, so that later steps see the
# lines in the same order on every run
mLines = list(dict.fromkeys(mLines))

print(f'Unique Lines: {len(mLines)}')
#print(mLines)
#print(mLines)



All Lines: 139785
Unique Lines: 40416


In [4]:
# Search for a list of tokens that distinguishes the unique lines;
# findFeatureListIterative prints one progress line per iteration.
features = findFeatureListIterative(mLines)

Non-unique lines remaining: 40416, features found: 1
Non-unique lines remaining: 40416, features found: 2
Non-unique lines remaining: 40414, features found: 3
Non-unique lines remaining: 40414, features found: 4
Non-unique lines remaining: 40413, features found: 5
Non-unique lines remaining: 40412, features found: 6
Non-unique lines remaining: 40411, features found: 7
Non-unique lines remaining: 40411, features found: 8
Non-unique lines remaining: 40410, features found: 9
Non-unique lines remaining: 40410, features found: 10
Non-unique lines remaining: 40408, features found: 11
Non-unique lines remaining: 40408, features found: 12
Non-unique lines remaining: 40408, features found: 13
Non-unique lines remaining: 40407, features found: 14
Non-unique lines remaining: 40401, features found: 15
Non-unique lines remaining: 40388, features found: 16
Non-unique lines remaining: 40370, features found: 17
Non-unique lines remaining: 40368, features found: 18
Non-unique lines remaining: 40362, fe

Non-unique lines remaining: 19813, features found: 152
Non-unique lines remaining: 19813, features found: 153
Non-unique lines remaining: 19812, features found: 154
Non-unique lines remaining: 19812, features found: 155
Non-unique lines remaining: 19619, features found: 156
Non-unique lines remaining: 19609, features found: 157
Non-unique lines remaining: 19593, features found: 158
Non-unique lines remaining: 19589, features found: 159
Non-unique lines remaining: 19574, features found: 160
Non-unique lines remaining: 19567, features found: 161
Non-unique lines remaining: 19567, features found: 162
Non-unique lines remaining: 19567, features found: 163
Non-unique lines remaining: 19526, features found: 164
Non-unique lines remaining: 19525, features found: 165
Non-unique lines remaining: 19525, features found: 166
Non-unique lines remaining: 19525, features found: 167
Non-unique lines remaining: 19512, features found: 168
Non-unique lines remaining: 19488, features found: 169
Non-unique

Non-unique lines remaining: 12317, features found: 301
Non-unique lines remaining: 12316, features found: 302
Non-unique lines remaining: 12315, features found: 303
Non-unique lines remaining: 12314, features found: 304
Non-unique lines remaining: 12002, features found: 305
Non-unique lines remaining: 12002, features found: 306
Non-unique lines remaining: 11996, features found: 307
Non-unique lines remaining: 11959, features found: 308
Non-unique lines remaining: 11958, features found: 309
Non-unique lines remaining: 11947, features found: 310
Non-unique lines remaining: 11947, features found: 311
Non-unique lines remaining: 11947, features found: 312
Non-unique lines remaining: 11946, features found: 313
Non-unique lines remaining: 11908, features found: 314
Non-unique lines remaining: 11905, features found: 315
Non-unique lines remaining: 11905, features found: 316
Non-unique lines remaining: 11902, features found: 317
Non-unique lines remaining: 11901, features found: 318
Non-unique

Non-unique lines remaining: 8699, features found: 452
Non-unique lines remaining: 8699, features found: 453
Non-unique lines remaining: 8699, features found: 454
Non-unique lines remaining: 8699, features found: 455
Non-unique lines remaining: 8699, features found: 456
Non-unique lines remaining: 8697, features found: 457
Non-unique lines remaining: 8697, features found: 458
Non-unique lines remaining: 8693, features found: 459
Non-unique lines remaining: 8693, features found: 460
Non-unique lines remaining: 8680, features found: 461
Non-unique lines remaining: 8680, features found: 462
Non-unique lines remaining: 8517, features found: 463
Non-unique lines remaining: 8517, features found: 464
Non-unique lines remaining: 8517, features found: 465
Non-unique lines remaining: 8517, features found: 466
Non-unique lines remaining: 8513, features found: 467
Non-unique lines remaining: 8513, features found: 468
Non-unique lines remaining: 8513, features found: 469
Non-unique lines remaining: 

Non-unique lines remaining: 7326, features found: 604
Non-unique lines remaining: 7324, features found: 605
Non-unique lines remaining: 7316, features found: 606
Non-unique lines remaining: 7316, features found: 607
Non-unique lines remaining: 7315, features found: 608
Non-unique lines remaining: 7315, features found: 609
Non-unique lines remaining: 7315, features found: 610
Non-unique lines remaining: 7315, features found: 611
Non-unique lines remaining: 7315, features found: 612
Non-unique lines remaining: 7315, features found: 613
Non-unique lines remaining: 7310, features found: 614
Non-unique lines remaining: 7310, features found: 615
Non-unique lines remaining: 7310, features found: 616
Non-unique lines remaining: 7291, features found: 617
Non-unique lines remaining: 7290, features found: 618
Non-unique lines remaining: 7290, features found: 619
Non-unique lines remaining: 7290, features found: 620
Non-unique lines remaining: 7288, features found: 621
Non-unique lines remaining: 

Non-unique lines remaining: 7093, features found: 756
Non-unique lines remaining: 7093, features found: 757
Non-unique lines remaining: 7091, features found: 758
Non-unique lines remaining: 7090, features found: 759
Non-unique lines remaining: 7088, features found: 760
Non-unique lines remaining: 7086, features found: 761
Non-unique lines remaining: 7086, features found: 762
Non-unique lines remaining: 7086, features found: 763
Non-unique lines remaining: 7086, features found: 764
Non-unique lines remaining: 7035, features found: 765
Non-unique lines remaining: 7035, features found: 766
Non-unique lines remaining: 7035, features found: 767
Non-unique lines remaining: 7035, features found: 768
Non-unique lines remaining: 7035, features found: 769
Non-unique lines remaining: 7035, features found: 770
Non-unique lines remaining: 7034, features found: 771
Non-unique lines remaining: 7034, features found: 772
Non-unique lines remaining: 7034, features found: 773
Non-unique lines remaining: 

Non-unique lines remaining: 6731, features found: 908
Non-unique lines remaining: 6731, features found: 909
Non-unique lines remaining: 6731, features found: 910
Non-unique lines remaining: 6731, features found: 911
Non-unique lines remaining: 6731, features found: 912
Non-unique lines remaining: 6731, features found: 913
Non-unique lines remaining: 6731, features found: 914
Non-unique lines remaining: 6731, features found: 915
Non-unique lines remaining: 6731, features found: 916
Non-unique lines remaining: 6731, features found: 917
Non-unique lines remaining: 6731, features found: 918
Non-unique lines remaining: 6731, features found: 919
Non-unique lines remaining: 6731, features found: 920
Non-unique lines remaining: 6731, features found: 921
Non-unique lines remaining: 6731, features found: 922
Non-unique lines remaining: 6731, features found: 923
Non-unique lines remaining: 6731, features found: 924
Non-unique lines remaining: 6731, features found: 925
Non-unique lines remaining: 

Non-unique lines remaining: 6467, features found: 1059
Non-unique lines remaining: 6466, features found: 1060
Non-unique lines remaining: 6466, features found: 1061
Non-unique lines remaining: 6466, features found: 1062
Non-unique lines remaining: 6462, features found: 1063
Non-unique lines remaining: 6462, features found: 1064
Non-unique lines remaining: 6462, features found: 1065
Non-unique lines remaining: 6460, features found: 1066
Non-unique lines remaining: 6460, features found: 1067
Non-unique lines remaining: 6458, features found: 1068
Non-unique lines remaining: 6458, features found: 1069
Non-unique lines remaining: 6458, features found: 1070
Non-unique lines remaining: 6455, features found: 1071
Non-unique lines remaining: 6455, features found: 1072
Non-unique lines remaining: 6455, features found: 1073
Non-unique lines remaining: 6455, features found: 1074
Non-unique lines remaining: 6455, features found: 1075
Non-unique lines remaining: 6451, features found: 1076
Non-unique

Non-unique lines remaining: 5945, features found: 1208
Non-unique lines remaining: 5945, features found: 1209
Non-unique lines remaining: 5945, features found: 1210
Non-unique lines remaining: 5945, features found: 1211
Non-unique lines remaining: 5945, features found: 1212
Non-unique lines remaining: 5945, features found: 1213
Non-unique lines remaining: 5943, features found: 1214
Non-unique lines remaining: 5936, features found: 1215
Non-unique lines remaining: 5936, features found: 1216
Non-unique lines remaining: 5936, features found: 1217
Non-unique lines remaining: 5936, features found: 1218
Non-unique lines remaining: 5936, features found: 1219
Non-unique lines remaining: 5935, features found: 1220
Non-unique lines remaining: 5935, features found: 1221
Non-unique lines remaining: 5935, features found: 1222
Non-unique lines remaining: 5935, features found: 1223
Non-unique lines remaining: 5928, features found: 1224
Non-unique lines remaining: 5923, features found: 1225
Non-unique

Non-unique lines remaining: 5496, features found: 1357
Non-unique lines remaining: 5496, features found: 1358
Non-unique lines remaining: 5496, features found: 1359
Non-unique lines remaining: 5496, features found: 1360
Non-unique lines remaining: 5496, features found: 1361
Non-unique lines remaining: 5496, features found: 1362
Non-unique lines remaining: 5496, features found: 1363
Non-unique lines remaining: 5496, features found: 1364
Non-unique lines remaining: 5496, features found: 1365
Non-unique lines remaining: 5495, features found: 1366
Non-unique lines remaining: 5493, features found: 1367
Non-unique lines remaining: 5493, features found: 1368
Non-unique lines remaining: 5493, features found: 1369
Non-unique lines remaining: 5493, features found: 1370
Non-unique lines remaining: 5493, features found: 1371
Non-unique lines remaining: 5491, features found: 1372
Non-unique lines remaining: 5491, features found: 1373
Non-unique lines remaining: 5491, features found: 1374
Non-unique

Non-unique lines remaining: 5319, features found: 1506
Non-unique lines remaining: 5311, features found: 1507
Non-unique lines remaining: 5311, features found: 1508
Non-unique lines remaining: 5311, features found: 1509
Non-unique lines remaining: 5311, features found: 1510
Non-unique lines remaining: 5307, features found: 1511
Non-unique lines remaining: 5307, features found: 1512
Non-unique lines remaining: 5307, features found: 1513
Non-unique lines remaining: 5305, features found: 1514
Non-unique lines remaining: 5305, features found: 1515
Non-unique lines remaining: 5305, features found: 1516
Non-unique lines remaining: 5305, features found: 1517
Non-unique lines remaining: 5305, features found: 1518
Non-unique lines remaining: 5305, features found: 1519
Non-unique lines remaining: 5305, features found: 1520
Non-unique lines remaining: 5305, features found: 1521
Non-unique lines remaining: 5305, features found: 1522
Non-unique lines remaining: 5305, features found: 1523
Non-unique

Non-unique lines remaining: 5146, features found: 1655
Non-unique lines remaining: 5146, features found: 1656
Non-unique lines remaining: 5146, features found: 1657
Non-unique lines remaining: 5146, features found: 1658
Non-unique lines remaining: 5145, features found: 1659
Non-unique lines remaining: 5145, features found: 1660
Non-unique lines remaining: 5145, features found: 1661
Non-unique lines remaining: 5143, features found: 1662
Non-unique lines remaining: 5143, features found: 1663
Non-unique lines remaining: 5143, features found: 1664
Non-unique lines remaining: 5143, features found: 1665
Non-unique lines remaining: 5143, features found: 1666
Non-unique lines remaining: 5143, features found: 1667
Non-unique lines remaining: 5143, features found: 1668
Non-unique lines remaining: 5143, features found: 1669
Non-unique lines remaining: 5143, features found: 1670
Non-unique lines remaining: 5143, features found: 1671
Non-unique lines remaining: 5143, features found: 1672
Non-unique

Non-unique lines remaining: 5028, features found: 1804
Non-unique lines remaining: 5026, features found: 1805
Non-unique lines remaining: 5026, features found: 1806
Non-unique lines remaining: 5026, features found: 1807
Non-unique lines remaining: 5026, features found: 1808
Non-unique lines remaining: 5026, features found: 1809
Non-unique lines remaining: 5026, features found: 1810
Non-unique lines remaining: 5026, features found: 1811
Non-unique lines remaining: 5026, features found: 1812
Non-unique lines remaining: 5026, features found: 1813
Non-unique lines remaining: 5025, features found: 1814
Non-unique lines remaining: 5025, features found: 1815
Non-unique lines remaining: 5024, features found: 1816
Non-unique lines remaining: 5024, features found: 1817
Non-unique lines remaining: 5024, features found: 1818
Non-unique lines remaining: 5024, features found: 1819
Non-unique lines remaining: 5023, features found: 1820
Non-unique lines remaining: 5023, features found: 1821
Non-unique

Non-unique lines remaining: 4962, features found: 1953
Non-unique lines remaining: 4962, features found: 1954
Non-unique lines remaining: 4962, features found: 1955
Non-unique lines remaining: 4962, features found: 1956
Non-unique lines remaining: 4962, features found: 1957
Non-unique lines remaining: 4962, features found: 1958
Non-unique lines remaining: 4962, features found: 1959
Non-unique lines remaining: 4962, features found: 1960
Non-unique lines remaining: 4962, features found: 1961
Non-unique lines remaining: 4962, features found: 1962
Non-unique lines remaining: 4962, features found: 1963
Non-unique lines remaining: 4962, features found: 1964
Non-unique lines remaining: 4962, features found: 1965
Non-unique lines remaining: 4954, features found: 1966
Non-unique lines remaining: 4954, features found: 1967
Non-unique lines remaining: 4954, features found: 1968
Non-unique lines remaining: 4949, features found: 1969
Non-unique lines remaining: 4948, features found: 1970
Non-unique

Non-unique lines remaining: 4878, features found: 2102
Non-unique lines remaining: 4878, features found: 2103
Non-unique lines remaining: 4878, features found: 2104
Non-unique lines remaining: 4878, features found: 2105
Non-unique lines remaining: 4878, features found: 2106
Non-unique lines remaining: 4878, features found: 2107
Non-unique lines remaining: 4878, features found: 2108
Non-unique lines remaining: 4878, features found: 2109
Non-unique lines remaining: 4878, features found: 2110
Non-unique lines remaining: 4878, features found: 2111
Non-unique lines remaining: 4876, features found: 2112
Non-unique lines remaining: 4876, features found: 2113
Non-unique lines remaining: 4876, features found: 2114
Non-unique lines remaining: 4842, features found: 2115
Non-unique lines remaining: 4842, features found: 2116
Non-unique lines remaining: 4840, features found: 2117
Non-unique lines remaining: 4838, features found: 2118
Non-unique lines remaining: 4838, features found: 2119
Non-unique

Non-unique lines remaining: 4770, features found: 2251
Non-unique lines remaining: 4770, features found: 2252
Non-unique lines remaining: 4770, features found: 2253
Non-unique lines remaining: 4770, features found: 2254
Non-unique lines remaining: 4768, features found: 2255
Non-unique lines remaining: 4768, features found: 2256
Non-unique lines remaining: 4768, features found: 2257
Non-unique lines remaining: 4768, features found: 2258
Non-unique lines remaining: 4768, features found: 2259
Non-unique lines remaining: 4768, features found: 2260
Non-unique lines remaining: 4768, features found: 2261
Non-unique lines remaining: 4768, features found: 2262
Non-unique lines remaining: 4768, features found: 2263
Non-unique lines remaining: 4768, features found: 2264
Non-unique lines remaining: 4768, features found: 2265
Non-unique lines remaining: 4768, features found: 2266
Non-unique lines remaining: 4768, features found: 2267
Non-unique lines remaining: 4768, features found: 2268
Non-unique

Non-unique lines remaining: 4759, features found: 2400
Non-unique lines remaining: 4759, features found: 2401
Non-unique lines remaining: 4759, features found: 2402
Non-unique lines remaining: 4759, features found: 2403
Non-unique lines remaining: 4759, features found: 2404
Non-unique lines remaining: 4759, features found: 2405
Non-unique lines remaining: 4759, features found: 2406
Non-unique lines remaining: 4759, features found: 2407
Non-unique lines remaining: 4759, features found: 2408
Non-unique lines remaining: 4759, features found: 2409
Non-unique lines remaining: 4759, features found: 2410
Non-unique lines remaining: 4757, features found: 2411
Non-unique lines remaining: 4757, features found: 2412
Non-unique lines remaining: 4757, features found: 2413
Non-unique lines remaining: 4757, features found: 2414
Non-unique lines remaining: 4757, features found: 2415
Non-unique lines remaining: 4757, features found: 2416
Non-unique lines remaining: 4757, features found: 2417
Non-unique

Non-unique lines remaining: 4744, features found: 2549
Non-unique lines remaining: 4744, features found: 2550
Non-unique lines remaining: 4744, features found: 2551
Non-unique lines remaining: 4744, features found: 2552
Non-unique lines remaining: 4744, features found: 2553
Non-unique lines remaining: 4742, features found: 2554
Non-unique lines remaining: 4742, features found: 2555
Non-unique lines remaining: 4742, features found: 2556
Non-unique lines remaining: 4742, features found: 2557
Non-unique lines remaining: 4742, features found: 2558
Non-unique lines remaining: 4742, features found: 2559
Non-unique lines remaining: 4742, features found: 2560
Non-unique lines remaining: 4742, features found: 2561
Non-unique lines remaining: 4741, features found: 2562
Non-unique lines remaining: 4741, features found: 2563
Non-unique lines remaining: 4741, features found: 2564
Non-unique lines remaining: 4741, features found: 2565
Non-unique lines remaining: 4741, features found: 2566
Non-unique

Non-unique lines remaining: 4727, features found: 2698
Non-unique lines remaining: 4727, features found: 2699
Non-unique lines remaining: 4727, features found: 2700
Non-unique lines remaining: 4727, features found: 2701
Non-unique lines remaining: 4727, features found: 2702
Non-unique lines remaining: 4727, features found: 2703
Non-unique lines remaining: 4727, features found: 2704
Non-unique lines remaining: 4727, features found: 2705
Non-unique lines remaining: 4727, features found: 2706
Non-unique lines remaining: 4727, features found: 2707
Non-unique lines remaining: 4727, features found: 2708
Non-unique lines remaining: 4727, features found: 2709
Non-unique lines remaining: 4727, features found: 2710
Non-unique lines remaining: 4727, features found: 2711
Non-unique lines remaining: 4727, features found: 2712
Non-unique lines remaining: 4727, features found: 2713
Non-unique lines remaining: 4727, features found: 2714
Non-unique lines remaining: 4727, features found: 2715
Non-unique

Non-unique lines remaining: 4690, features found: 2847
Non-unique lines remaining: 4690, features found: 2848
Non-unique lines remaining: 4690, features found: 2849
Non-unique lines remaining: 4688, features found: 2850
Non-unique lines remaining: 4688, features found: 2851
Non-unique lines remaining: 4688, features found: 2852
Non-unique lines remaining: 4688, features found: 2853
Non-unique lines remaining: 4688, features found: 2854
Non-unique lines remaining: 4688, features found: 2855
Non-unique lines remaining: 4688, features found: 2856
Non-unique lines remaining: 4688, features found: 2857
Non-unique lines remaining: 4688, features found: 2858
Non-unique lines remaining: 4688, features found: 2859
Non-unique lines remaining: 4688, features found: 2860
Non-unique lines remaining: 4688, features found: 2861
Non-unique lines remaining: 4688, features found: 2862
Non-unique lines remaining: 4688, features found: 2863
Non-unique lines remaining: 4688, features found: 2864
Non-unique

Non-unique lines remaining: 4671, features found: 2996
Non-unique lines remaining: 4671, features found: 2997
Non-unique lines remaining: 4671, features found: 2998
Non-unique lines remaining: 4671, features found: 2999
Non-unique lines remaining: 4671, features found: 3000
Non-unique lines remaining: 4671, features found: 3001
Non-unique lines remaining: 4671, features found: 3002
Non-unique lines remaining: 4671, features found: 3003
Non-unique lines remaining: 4671, features found: 3004
Non-unique lines remaining: 4671, features found: 3005
Non-unique lines remaining: 4671, features found: 3006
Non-unique lines remaining: 4671, features found: 3007
Non-unique lines remaining: 4671, features found: 3008
Non-unique lines remaining: 4671, features found: 3009
Non-unique lines remaining: 4671, features found: 3010
Non-unique lines remaining: 4671, features found: 3011
Non-unique lines remaining: 4671, features found: 3012
Non-unique lines remaining: 4671, features found: 3013
Non-unique

Non-unique lines remaining: 4653, features found: 3145
Non-unique lines remaining: 4653, features found: 3146
Non-unique lines remaining: 4653, features found: 3147
Non-unique lines remaining: 4653, features found: 3148
Non-unique lines remaining: 4653, features found: 3149
Non-unique lines remaining: 4653, features found: 3150
Non-unique lines remaining: 4653, features found: 3151
Non-unique lines remaining: 4653, features found: 3152
Non-unique lines remaining: 4653, features found: 3153
Non-unique lines remaining: 4653, features found: 3154
Non-unique lines remaining: 4653, features found: 3155
Non-unique lines remaining: 4653, features found: 3156
Non-unique lines remaining: 4653, features found: 3157
Non-unique lines remaining: 4653, features found: 3158
Non-unique lines remaining: 4653, features found: 3159
Non-unique lines remaining: 4653, features found: 3160
Non-unique lines remaining: 4653, features found: 3161
Non-unique lines remaining: 4653, features found: 3162
Non-unique

Non-unique lines remaining: 4640, features found: 3294
Non-unique lines remaining: 4640, features found: 3295
Non-unique lines remaining: 4640, features found: 3296
Non-unique lines remaining: 4640, features found: 3297
Non-unique lines remaining: 4638, features found: 3298
Non-unique lines remaining: 4638, features found: 3299
Non-unique lines remaining: 4638, features found: 3300
Non-unique lines remaining: 4638, features found: 3301
Non-unique lines remaining: 4638, features found: 3302
Non-unique lines remaining: 4638, features found: 3303
Non-unique lines remaining: 4638, features found: 3304
Non-unique lines remaining: 4637, features found: 3305
Non-unique lines remaining: 4636, features found: 3306
Non-unique lines remaining: 4636, features found: 3307
Non-unique lines remaining: 4636, features found: 3308
Non-unique lines remaining: 4636, features found: 3309
Non-unique lines remaining: 4636, features found: 3310
Non-unique lines remaining: 4636, features found: 3311
Non-unique

Non-unique lines remaining: 4608, features found: 3443
Non-unique lines remaining: 4608, features found: 3444
Non-unique lines remaining: 4607, features found: 3445
Non-unique lines remaining: 4607, features found: 3446
Non-unique lines remaining: 4607, features found: 3447
Non-unique lines remaining: 4605, features found: 3448
Non-unique lines remaining: 4603, features found: 3449


In [8]:
def featurizeListPredefined(lstLines, lstFeatures):
    """Featurize ``lstLines`` starting from a given list of features.

    Each pass featurizes the lines in order; when two different lines end up
    with the same signature, one token of those two lines that is not a
    feature yet is added, the data set is emptied and a new pass starts. The
    passes stop when a complete pass adds no feature.

    Parameters:
        lstLines: sequence of strings to featurize.
        lstFeatures: list of the initial features; it is not modified.

    Returns:
        The DataSet of the last pass. Lines that cannot be told apart from an
        already stored line by any unused token are not stored.

    The pass number and every newly found feature position are printed.
    """
    dtLines = DataSet()

    dtLines.addFeatureVector(lstFeatures)

    featurizer = FeatureMaker()

    # addNewFeatures() builds a new list, so features added later do not end
    # up in the caller's lstFeatures
    featurizer.addNewFeatures(lstFeatures)

    total = len(lstLines)
    foundNewFeature = True
    i = 1
    while foundNewFeature:
        foundNewFeature = False
        print(f'Pass number: {i}')
        i += 1
        for iLine, line in enumerate(lstLines, start=1):
            # NOTE(review): featurize() returns the counts as strings, so a
            # guard of the form `count == 0` never skips a line. Lines without
            # any feature are therefore processed like all others: their
            # all-zero signatures collide and lead to new features.
            featurizer.featurize(line)
            strFeatures = featurizer.featuresToString()

            if not dtLines.hasLine(strFeatures):
                dtLines.addNewLine(line, strFeatures)
                continue

            strLine = dtLines.getLine(strFeatures)
            if strLine == line:
                # the same line again, nothing to tell apart
                continue

            newFeatures = featurizer.findNewFeatures(
                tokenizeString(line) + tokenizeString(strLine))
            if newFeatures:
                featurizer.addNewFeature(newFeatures[0])
                foundNewFeature = True
                # all signatures are stale now: start over with the new feature
                dtLines.flush()
                dtLines.addFeatureVector(featurizer.featureVector)
                print(f'Found new feature at line {iLine} of {total}')
                break

    return dtLines

In [9]:
# Number of features returned by findFeatureListIterative
print(len(features))

3449


In [10]:
# Featurize the unique lines, starting from the feature list computed earlier,
# and write the resulting rows to a '$'-separated text file.
strOutputFile = './featurized_lines.csv'
dtLines = featurizeListPredefined(mLines, features)
dtLines.toCSV(strOutputFile)

Pass number: 1
