public
Description: Python backup classes to backup files to key/value store, such as Amazon S3
Homepage: http://www.1729.com/software/keevalbak/index.html
Clone URL: git://github.com/pdorrell/keevalbak.git
keevalbak / keevalbak / BackupOperations.py
100644 926 lines (808 sloc) 43.501 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
# Copyright (c) 2008 Philip Dorrell, http://www.1729.com/
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
 
import yaml
import hashlib
import os
import time
import datetime
import shutil
import CompareDirectories
import re
from sets import Set
 
def readFileBytes(filename):
    """Read named file and return contents as a byte string"""
    f = file(filename, "rb")
    bytes = f.read()
    f.close()
    return bytes
 
def writeFileBytes(filename, bytes):
    """Write byte string as new contents of named file"""
    f = file(filename, "wb")
    f.write(bytes)
    f.close()
    
BackupsVersion = 2
 
class PathSummary(object):
    """Information about a file or directory specified as a relative path within some base directory
Note: all paths are '/' separated, whether or not we are in Microsoft Windows"""
    def __init__(self, relativePath):
        self.relativePath = relativePath
 
    def fullPath(self, basePath):
        """Return the full path given the path of the base directory"""
        return basePath + self.relativePath
 
    @staticmethod
    def fromYamlData(data):
        """Convert YAML data into FileSummary or DirSummary (inverse of toYamlData methods)"""
        pathType = data["type"]
        if pathType == "file":
            return FileSummary.fromYamlData(data)
        elif pathType == "dir":
            return DirSummary.fromYamlData(data)
        else:
            raise "Unknown path type: %s" % pathType
 
class FileSummary(PathSummary):
    """Information about a file specified as a relative path within some (unspecified) base directory,
including a SHA1 hash of the file's contents."""
    def __init__(self, relativePath, hash):
        super(FileSummary, self).__init__(relativePath)
        self.isDir = False
        self.isFile = True
        self.hash = hash
        
    def __unicode__(self):
        return u"FILE: %r : %s" % (self.relativePath, self.hash)
        
    def __repr__(self):
        return self.__unicode__()
    
    def toYamlData(self):
        """Convert to YAML"""
        return {"type": "file",
                "path": self.relativePath,
                "hash": self.hash }
    
    @staticmethod
    def fromYamlData(data):
        """Create from YAML (inverse of toYamlData)"""
        return FileSummary(data["path"], data["hash"])
 
class DirSummary(PathSummary):
    """Information about a file specified as a relative path within some (unspecified) base directory"""
    def __init__(self, relativePath):
        super(DirSummary, self).__init__(relativePath)
        self.isDir = True
        self.isFile = False
        
    def __unicode__(self):
        return u"DIR: %r" % (self.relativePath)
        
    def toYamlData(self):
        """Convert to YAML"""
        return {"type": "dir",
                "path": self.relativePath
                }
    
    def __repr__(self):
        return self.__unicode__()
    
    @staticmethod
    def fromYamlData(data):
        """Create from YAML (inverse of toYamlData)"""
        return DirSummary(data["path"])
    
def sha1Digest(content):
    return hashlib.sha1(content).hexdigest()
 
class DirectoryInfo:
    """Information about all the directories and files within a base directory
All directories are listed before any subdirectories or files contained within them.
"""
    def __init__(self, path):
        """Construct from path base directory"""
        self.path = unicode(path)
        self.pathSummaries = []
        self.summarizeSubDir(u"")
        
    def createDirSummary(self, relativePath):
        """Create a path summary for a sub-directory"""
        return DirSummary (relativePath)
    
    def createFileSummary(self, relativePath):
        """Create a path summary for a file in the base directory"""
        fileName = self.path + relativePath
        content = readFileBytes(fileName)
        fileHash = sha1Digest(content)
        return FileSummary (relativePath, fileHash)
    
    def addSummary(self, pathSummary):
        """Add a path summary"""
        print u"%r" % pathSummary
        self.pathSummaries.append (pathSummary)
        
    def getPathSummariesYamlData(self):
        """Return array of path summaries as YAML data"""
        return [summary.toYamlData() for summary in self.pathSummaries]
    
    def summarizeSubDir(self, relativePath):
        """Recursively summarize a sub-directory specified by it's relative path,
adding the path summaries for all contained files and sub-directories to the list of path summaries."""
        for childName in os.listdir(self.path + relativePath):
            childRelativePath = relativePath + "/" + childName;
            childPath = self.path + childRelativePath
            if os.path.isfile(childPath):
                self.addSummary(self.createFileSummary(childRelativePath))
            elif os.path.isdir(childPath):
                self.addSummary(self.createDirSummary(childRelativePath))
                self.summarizeSubDir (childRelativePath)
            else:
                print "UNKNOWN OBJECT %r in %r" % (childName, self.path + relativePath)
                
class HashVerificationRecords(object):
    """Records of verified hashes of backed up files (i.e. verified by actually reading
the file content out of the backup map and recalculating the hash).
Note that this class is not yet used, and nothing is yet writing the verification records
into the backup map."""
    
    def __init__(self, backupMap):
        self.backupMap = backupMap
        self.datetimeFileHashesMap = {}
        self.datetimeUpdated = Set()
        
    def getFileHashesMap(self, datetime):
        if datetime in self.datetimeFileHashesMap:
            fileHashesMap = self.datetimeFileHashesMap[datetime]
        else:
            fileHashesRecordFilename = datetime + "/verifiedFileHashes.yaml"
            if fileHashesRecordFilename in self.backupMap:
                fileHashesMap = yaml.safe_load(self.backupMap[fileHashesRecordFilename])
            else:
                fileHashesMap = {}
            self.datetimeFileHashesMap[datetime] = fileHashesMap
        return fileHashesMap
        
    def markVerified(self, datetime, filePath, contentHash):
        fileHashesMap = self.getFileHashesMap(datetime)
        fileHashesMap[filePath] = contentHash
        self.datetimeUpdated.add (datetime)
        
    def getWrittenFileHash(self, datetime, filePath):
        """Get the hash of a backed up file, either from an existing hash verification record,
or, read the file contents from the backup map and calculate the hash."""
        fileHashesMap = self.getFileHashesMap(datetime)
        if filePath in fileHashesMap:
            return fileHashesMap[filePath]
        else:
            content = self.backupMap[datetime + "/files" + filePath]
            contentHash = sha1Digest(content)
            self.markVerified(datetime, filePath, contentHash)
            return contentHash
        
    def updateRecords(self):
        """Update any newly verified hashes back into the backup map."""
        print "Verified hashes were updated for %r" % self.datetimeUpdated
        for datetime in self.datetimeUpdated:
            fileHashesRecordFilename = datetime + "/verifiedFileHashes.yaml"
            print "Updating verification records for %s = %s" % (datetime,
                                                                 self.datetimeFileHashesMap[datetime])
            self.backupMap[fileHashesRecordFilename] = yaml.safe_dump (self.datetimeFileHashesMap[datetime])
            
class BackupRecord:
    """A record of a backup made: it's date/time, and whether it was full or incremental."""
    def __init__(self, type, datetime, completed):
        """construct from 'full' or 'incremental' and the date time"""
        self.type = type
        self.datetime = datetime
        self.completed = completed
 
    @staticmethod
    def fromYamlData(data):
        """Construct backup record from YAML data (inverse of toYamlData)"""
        # completed defaults to True because previous version of keevalback only recorded when complete
        return BackupRecord(data["type"], data["datetime"], data.get("completed", True))
        
    def toYamlData(self):
        """Convert to data to be stored in YAML"""
        return {"type": self.type, "datetime": self.datetime, "completed": self.completed}
    
    def isFull(self):
        return self.type == "full"
        
    def __str__(self):
        return "[Backup: %s %s %s]" % (self.type, self.datetime, self.completed and "complete" or "INCOMPLETE")
    
    def __repr__(self):
        return self.__str__()
    
class InvalidBackupsVersion(Exception):
    def __init__(self, backupRecord, version):
        Exception.__init__(self, "Invalid backup for backup record %s version %d (this version = %d)" %
                           (backupRecord, version, BackupsVersion))
        self.version = version
    
def getBackupsVersion(backupMap, backupRecord):
    versionKey = backupRecord.datetime + "/version"
    if versionKey in backupMap:
        return int(backupMap[versionKey])
    else:
        return 1
        
def checkVersion(backupMap, backupRecord):
    version = getBackupsVersion (backupMap, backupRecord)
    if version != BackupsVersion:
        raise InvalidBackupsVersion (backupRecord, version)
    
class WrittenRecords:
    """Records of where file contents with a given SHA1 hash value was written to in backup map
(within the context of a particular set of backups, i.e. a full and following incrementals)"""
    def __init__(self):
        self.written = {}
        
    def recordHashWritten(self, hash, key):
        """Record that a contents with a particular hash were written to a particular key"""
        print " record hash %s written to %r" % (hash, key)
        self.written[hash] = key
        
    def isWritten(self, hash):
        """Has a file contents with this hash value been written to the backup map?"""
        return hash in self.written
    
    def locationWritten(self, hash):
        """Where a file contents with this hash value was written to"""
        return self.written[hash]
    
    def recordBackup(self, backupMap, backupRecord):
        """For every file contents in a backup record recorded as written, record it's
hash value and backup map key in the written records.""" # todo: slow
        checkVersion(backupMap, backupRecord)
        writtenPathListKey = backupRecord.datetime + "/writtenPathList"
        writtenFileSummariesYamlData = yaml.safe_load (backupMap[writtenPathListKey])
        for fileData in writtenFileSummariesYamlData:
            #print "Recording backup data %s/%r" % (backupRecord.datetime, pathData)
            self.recordHashWritten (fileData["hash"], backupRecord.datetime + fileData["path"])
    
    def recordPreviousBackups(self, backupMap, backupRecords):
        """Record the hashes of all files written from the last full backup onwards (or from the first
backup if for some reason there is no full backup."""
        fullFound = False
        i = len(backupRecords)-1
        while not fullFound and i >= 0:
            backupRecord = backupRecords[i]
            print "Recording backup %r ..." % backupRecord
            self.recordBackup(backupMap, backupRecord)
            if backupRecord.type == "full":
                fullFound = True
            i -= 1
            
class BaseFileHash(object):
    """Description of a file: it's (basic) name and hash"""
    def __init__(self, name, hash, description):
        self.name = name
        self.hash = hash
        self.description = description
        
    def isDir(self):
        return False
            
    def printIndented(self, indent):
        print "%sFile %r: %s" % (indent, self.name, self.hash)
        
    def compareToOtherFileHash (self, otherFileHash, indent, log, logDiff):
        if self.hash != otherFileHash.hash:
            logDiff ("File %r has hash %s in %r but hash %s in %r" %
                     (self.name, self.hash, self.description,
                      otherFileHash.hash, otherFileHash.description))
        
pathRegex = re.compile("[/]([^/]*)([/].*)?")
        
def analysePath(path):
    """Analyse a path starting with '/' and with '/' separators into 1st part and remainder
e.g. '/x/y' into 'x' and '/y' and '/x' into 'x' and None."""
    pathMatch = pathRegex.match(path)
    rootPath = pathMatch.group(1)
    remainderPath = pathMatch.group(2)
    return (rootPath, remainderPath)
 
class BaseDirHash(object):
    """Description of a directory as a map of immediate sub-directories
and immediately contained files"""
    def __init__(self, name, description):
        self.name = name
        self.children = []
        self.childrenMap = {}
        self.description = description
        
    def isDir(self):
        return True
            
    def addChild(self, childHash):
        """Add a child, i.e. a directory or file"""
        self.children.append (childHash)
        self.childrenMap[childHash.name] = childHash
        
    def hasChildNamed(self, childName):
        return childName in self.childrenMap
        
    def printIndented(self, indent = ""):
        print "%sDir %r" % (indent, self.name)
        childIndent = " " + indent
        for child in self.children:
            child.printIndented(indent = childIndent)
            
    def addFileSummary(self, path, hash):
        """Add a file given it's full path name relative to this directory
(necessarily constructing the intermediate sub-directories if they
are not already there)"""
        rootPath, remainderPath = analysePath(path)
        if remainderPath is None:
            self.addChild (BaseFileHash(rootPath, hash, self.description))
        else:
            childDirHash = self.getOrCreateChildDirHash(rootPath)
            childDirHash.addFileSummary (remainderPath, hash)
            
    def getOrCreateChildDirHash(self, name):
        """Return DirHash for an immediate sub-directory, creating it if necessary"""
        if name in self.childrenMap:
            return self.childrenMap[name]
        else:
            childDirHash = BaseDirHash(name, self.description)
            self.addChild(childDirHash)
            return childDirHash
            
    def addDirSummary(self, path):
        """Add a sub-directory given it's full path name relative to this directory
(necessarily constructing the intermediate sub-directories if they
are not already there)"""
        rootPath, remainderPath = analysePath(path)
        if remainderPath is None:
            self.addChild (BaseDirHash(rootPath, self.description))
        else:
            childDirHash = self.getOrCreateChildDirHash(rootPath)
            childDirHash.addDirSummary (remainderPath)
            
    def compareToOtherDirHash(self, otherDirHash, indent, log, logDiff):
        log (indent, "comparing directory %r" % self.name)
        for child1 in self.children:
            name1 = child1.name
            child2 = otherDirHash.childrenMap.get(name1, None)
            if child1.isDir():
                if child2 != None:
                    if not child2.isDir():
                        logDiff ("%r is a directory in %r but a file in %r" %
                                 (name1, self.description, otherDirHash.description))
                    else:
                        child1.compareToOtherDirHash (child2, indent+1, log, logDiff)
                else:
                    logDiff("%r is a directory in %r but does not exist in %r" %
                            (name1, self.description, otherDirHash.description))
            else:
                if child2 != None:
                    if child2.isDir():
                        logDiff("%r is a file in %r but a directory in %r" %
                                (name1, self.description, otherDirHash.description))
                    else:
                        child1.compareToOtherFileHash (child2, indent+1, log, logDiff)
                else:
                    logDiff("%r is a file in %r but does not exist in %r" %
                            (name1, self.description, otherDirHash.description))
            for child2 in otherDirHash.children:
                if not self.hasChildNamed (child2.name):
                    if child2.isDir():
                        logDiff("%r does not exist in %r but is a directory in %r" %
                                (child2.name, self.description, otherDirHash.description))
                    else:
                        logDiff("%r does not exist in %r but is a file in %r" %
                                (child2.name, self.description, otherDirHash.description))
 
class FileHash(BaseFileHash):
    """Information about a file with a relative path name based on actual
contents of actual file in actual file-system base directory"""
    def __init__(self, dir, name, description):
        filename = dir + "/" + name
        content = readFileBytes (filename)
        super(FileHash, self).__init__(name, sha1Digest(content), description)
        
class DirHash(BaseDirHash):
    """Information about files within a directory with a relative path name
based on actual contents of actual directory in actual file-system base directory"""
    def __init__(self, dir, name, description):
        super(DirHash, self).__init__(name, description)
        fullPath = unicode (name and (dir + "/" + name) or dir)
        for childName in os.listdir(fullPath):
            childPath = fullPath + "/" + childName
            if os.path.isfile(childPath):
                self.addChild (FileHash(fullPath, childName, self.description))
            else:
                self.addChild (DirHash(fullPath, childName, self.description))
                
class ContentKey(object):
    def __init__(self, datetime, filePath):
        """Parameters for key used to look up file contents from a particular backup within a backup map.
Note that filePath is expected to start with a '/'"""
        self.datetime = datetime
        self.filePath = filePath
        
    def fileKey(self):
        """The actual key.
Note: "/files" infix is used to allow for other meta-data to be associated with the datetime."""
        return self.datetime + "/files" + self.filePath
    
    def __str__(self):
        return "[%s:%r]" % (self.datetime, self.filePath)
    
    def __repr__(self):
        return self.__str__()
    
class BackupRecordUpdater:
    """Object responsible for recording current state of backup in progress"""
    def __init__(self, backups, backupRecords, currentBackupRecord, backupKeyBase,
                 directoryInfo, recordTrigger = 1000000):
        self.backups = backups
        self.backupRecords = backupRecords
        self.currentBackupRecord = currentBackupRecord
        self.backupKeyBase = backupKeyBase
        self.directoryInfo = directoryInfo
        self.bytesWritten = 0
        self.unrecordedBytes = 0
        self.recordTrigger = recordTrigger
        self.writtenFileSummaries = []
        
    def recordVersion(self):
        self.backups.backupMap[self.backupKeyBase + "/version"] = str(BackupsVersion)
        
    def recordPathSummaries(self):
        self.backups.recordPathSummaries (self.backupKeyBase, self.directoryInfo)
        
    def recordWrittenFileSummaries(self):
        self.backups.recordWrittenFileSummaries (self.backupKeyBase, self.writtenFileSummaries)
        
    def saveBackupRecords(self):
        self.backups.saveBackupRecords(self.backupRecords)
        
    def checkpoint(self):
        self.recordWrittenFileSummaries()
        
    def initialRecord(self):
        self.recordVersion()
        self.recordPathSummaries()
        self.recordWrittenFileSummaries()
        self.saveBackupRecords()
        
    def recordCompleted(self):
        self.currentBackupRecord.completed = True
        self.recordWrittenFileSummaries()
        self.saveBackupRecords()
        
from ThreadedTaskRunner import ThreadedTaskRunner, TaskRunner
 
#taskRunner = TaskRunner(checkpointFreq = 30)
 
taskRunner = ThreadedTaskRunner (checkpointFreq = 500, numThreads = 30)
 
class DeleteBackupMapValueTask:
    def __init__(self, backupMap, key):
        self.backupMap = backupMap
        self.key = key
        
    def getThreadLocals(self):
        return {"backupMap": self.backupMap.clone()}
        
    def doUnsynchronized(self):
        print " delete %r ..." % self.key
        del self.backupMap[self.key]
        
    def doSynchronized(self):
        pass
        
def deleteMapValues(backupMap, dryRun):
    """Delete all keys from a map, or if dryRun is True, do a dry run"""
    print "%sDeleting keys from map %s" % (dryRun and "DRYRUN: " or "", backupMap)
    deleteTasks = []
    for key in backupMap:
        if dryRun:
            print " delete %r ..." % key
        else:
            deleteTasks.append (DeleteBackupMapValueTask(backupMap, key))
    if not dryRun:
        taskRunner.runTasks (deleteTasks)
    print "finished."
    
class IncrementalBackups:
    """A set of dated full or incremental backups within a given backup map.
This object does _not_ (currently) record _where_ the file contents came from.
"""
    def __init__(self, backupMap, recordTrigger = 10000000):
        self.backupMap = backupMap
        self.recordTrigger = recordTrigger
        
    def getDateTimeString(self):
        """Get a date time string to use for a new dated backup"""
        return time.strftime("%Y-%b-%d.%H-%M-%S")
    
    def getBackupRecords(self):
        """Retrieve the BackupRecord objects describing any existing backups"""
        if "backupRecords" in self.backupMap:
            backupsListYamlData = yaml.safe_load(self.backupMap["backupRecords"])
        else:
            backupsListYamlData = []
        return [BackupRecord.fromYamlData(record) for record in backupsListYamlData]
    
    def saveBackupRecords(self, backupRecords):
        backupRecordsYamlData = [record.toYamlData() for record in backupRecords]
        self.backupMap["backupRecords"] = yaml.safe_dump(backupRecordsYamlData)
        print "new backup records = %r" % backupRecords
    
    def getBackupGroups(self):
        """Get backup groups, i.e. backup records grouped into lists of incremental backups with a preceding
full backup."""
        backupGroups = []
        records = self.getBackupRecords()
        currentBackupGroup = []
        for i, record in enumerate(records):
            if record.isFull() or i == 0:
                currentBackupGroup = [record]
                backupGroups.append (currentBackupGroup)
            else:
                currentBackupGroup.append(record)
        return backupGroups
    
    def listBackups(self):
        """Print out list of all backups"""
        recordGroups = self.getBackupGroups()
        for recordGroup in recordGroups:
            for i, record in enumerate(recordGroup):
                if i == 0:
                    indent = "*"
                else:
                    indent = " "
                print "%s%s: %s %s" % (indent, record.type, record.datetime,
                                       record.completed and "complete" or "INCOMPLETE")
                
    def pruneBackup(self, backupRecord, dryRun):
        """Prune the backup indicated by the backup record (with dry-run option)"""
        print " prune backup %r" % backupRecord
        backupSubMap = self.backupMap.subMap(backupRecord.datetime)
        deleteMapValues(backupSubMap, dryRun)
                
    def pruneBackupGroup(self, recordGroup, dryRun):
        """Prune all backups in a backup group (with dry-run option)"""
        print "Backup group to prune: %r" % recordGroup
        for record in recordGroup:
            self.pruneBackup(record, dryRun)
                
    def pruneBackups(self, keep = 1, dryRun = True):
        """Prune previous backup groups, keeping only specified number of most
recent backup groups (but at least one)"""
        print "Pruning backups, keep %d%s" % (keep, dryRun and ", DRY RUN" or "")
        if keep < 1:
            raise Exception ("Number of full backups to keep must be at least 1")
        recordGroups = self.getBackupGroups()
        if keep >= len(recordGroups):
            print "Only %d full backups, and %d specified to keep, so none will be pruned" % (len(recordGroups), keep)
        else:
            numToPrune = len(recordGroups) - keep
            groupsToPrune = recordGroups[:numToPrune]
            for recordGroup in groupsToPrune:
                self.pruneBackupGroup(recordGroup, dryRun = dryRun)
            if not dryRun:
                remainingGroups = recordGroups[numToPrune:]
                remainingRecords = []
                for group in remainingGroups:
                    remainingRecords += group
                self.saveBackupRecords(remainingRecords)
                
    def recordPathSummaries(self, backupKeyBase, directoryInfo):
        pathListKey = backupKeyBase + "/pathList"
        print "Record path summaries to %s ..." % pathListKey
        self.backupMap[pathListKey] = yaml.safe_dump(directoryInfo.getPathSummariesYamlData())
 
    def recordWrittenFileSummaries(self, backupKeyBase, writtenFileSummaries):
        writtenPathListKey = backupKeyBase + "/writtenPathList"
        print "Record written file summaries to %s ..." % writtenPathListKey
        writtenFileSummariesYamlData = [summary.toYamlData() for summary in writtenFileSummaries]
        self.backupMap[writtenPathListKey] = yaml.safe_dump(writtenFileSummariesYamlData)
        
    class BackupFileTask:
        def __init__(self, backupMap, backupFilesKeyBase, pathSummary, fileName, writtenRecords,
                     writtenFileSummaries):
            self.backupMap = backupMap
            self.backupFilesKeyBase = backupFilesKeyBase
            self.pathSummary = pathSummary
            self.fileName = fileName
            self.writtenRecords = writtenRecords
            self.writtenFileSummaries = writtenFileSummaries
            
        def getThreadLocals(self):
            return {"backupMap": self.backupMap.clone()}
        
        def doUnsynchronized(self):
            content = readFileBytes(self.fileName)
            self.fileContentKey = self.backupFilesKeyBase + self.pathSummary.relativePath
            print "Writing %r ..." % self.fileContentKey
            self.backupMap[self.fileContentKey] = content
            
        def doSynchronized(self):
            self.writtenFileSummaries.append (self.pathSummary)
            self.writtenRecords.recordHashWritten (self.pathSummary.hash, self.fileContentKey)
            
    def doBackup(self, directoryInfo, full = True):
        """Create a new backup of a source directory (full or incremental).
Note: 'incremental' is based on comparing the hashes of file contents already marked as
written to previous backups in the same backup group. It is not based on any comparison
of files done on the source computer. If a given file contents has already been written,
then the relevant file written as a pointer to the previous file with the same contents
(which may or may not be the same file in the same place on the source computer).
"""
        dateTimeString = self.getDateTimeString()
        backupKeyBase = dateTimeString
        backupFilesKeyBase = backupKeyBase + "/files"
        print "retrieving existing backup records ..."
        backupRecords = self.getBackupRecords()
        print "backup records = %r" % backupRecords
        currentBackupRecord = BackupRecord(full and "full" or "incremental", dateTimeString, completed = False)
        backupRecords.append(currentBackupRecord)
        backupRecordUpdater = BackupRecordUpdater (self, backupRecords, currentBackupRecord,
                                                   backupKeyBase, directoryInfo, recordTrigger = self.recordTrigger)
        backupRecordUpdater.initialRecord()
        writtenRecords = WrittenRecords()
        if not full:
            if len(backupRecords) == 0:
                full = True
                print "No previous records, so backup will be FULL anyway"
            else:
                writtenRecords.recordPreviousBackups (self.backupMap, backupRecords)
        backupFileTasks = []
        for pathSummary in directoryInfo.pathSummaries:
            if not pathSummary.isDir:
                fileName = pathSummary.fullPath(directoryInfo.path)
                if not writtenRecords.isWritten(pathSummary.hash):
                    backupFileTask = IncrementalBackups.BackupFileTask(self.backupMap, backupFilesKeyBase,
                                                                       pathSummary, fileName, writtenRecords,
                                                                       backupRecordUpdater.writtenFileSummaries)
                    backupFileTasks.append (backupFileTask)
                else:
                    print "Content of %r already written to %r" % (pathSummary,
                                                                   writtenRecords.locationWritten (pathSummary.hash))
        taskRunner.runTasks (backupFileTasks, checkpointTask = backupRecordUpdater)
        backupRecordUpdater.recordCompleted()
        
    def doFullBackup(self, directoryInfo):
        """Do a full backup of a source directory"""
        self.doBackup (directoryInfo, full = True)
        
    def doIncrementalBackup(self, directoryInfo):
        """Do an incremental backup of a source directory"""
        self.doBackup (directoryInfo, full = False)
        
    def getBackupRecordForDateTime(self, backupRecords, dateTimeString):
        for index, backupRecord in enumerate(backupRecords):
            if backupRecord.datetime == dateTimeString:
                return index
        raise "No backup record found for date-time %r" % dateTimeString
        
    def getRestoreRecords(self, backupRecords, dateTimeString):
        """Return records for the most recent backup group"""
        if dateTimeString is None:
            restorePos = len(backupRecords)-1
        else:
            restorePos = self.getBackupRecordForDateTime (backupRecords, dateTimeString)
        pos = restorePos
        while pos >= 0 and backupRecords[pos].type != "full":
            pos -= 1
        return backupRecords[pos:(restorePos+1)]
    
    def getPathSummaryDataList(self, backupRecord):
        """Get YAML data representing information about files and directories backed up
in a specified dated backup"""
        dateTimeString = backupRecord.datetime
        backupKeyBase = dateTimeString
        print "getPathSummaryDataList for %r ..." % backupRecord
        pathSummariesData = yaml.safe_load(self.backupMap[backupKeyBase + "/pathList"])
        return pathSummariesData
    
    def getWrittenFileSummaryDataList(self, backupRecord):
        """Get YAML data representing information about files and directories backed up
in a specified dated backup"""
        dateTimeString = backupRecord.datetime
        backupKeyBase = dateTimeString
        print "getWrittenFileSummaryDataList for %r ..." % backupRecord
        writtenPathListKey = backupKeyBase + "/writtenPathList"
        writtenFileSummariesData = yaml.safe_load(self.backupMap[backupKeyBase + "/writtenPathList"])
        return writtenFileSummariesData
        
    def getHashContentKeyMap(self, restoreRecords, writtenFileSummaryLists):
        """Construct a map from hash keys to the backup keys to which those file contents
were written (within the given backup group which is being restored from)"""
        hashContentKeyMap = {}
        for restoreRecord, writtenFileSummaryList in zip(restoreRecords, writtenFileSummaryLists):
            for writtenFileSummary in writtenFileSummaryList:
                hashContentKeyMap[writtenFileSummary.hash] = ContentKey(restoreRecord.datetime,
                                                                        writtenFileSummary.relativePath)
        return hashContentKeyMap
    
    class RestoreFileTask:
        def __init__(self, backupMap, contentKey, fullPath, updateVerificationRecords, verificationRecords, overwrite):
            self.backupMap = backupMap
            self.contentKey = contentKey
            self.fullPath = fullPath
            self.updateVerificationRecords = updateVerificationRecords
            self.verificationRecords = verificationRecords
            self.overwrite = overwrite
            
        def getThreadLocals(self):
            return {"backupMap": self.backupMap.clone()}
        
        def doUnsynchronized(self):
            content = self.backupMap[self.contentKey.fileKey()]
            if os.path.exists(self.fullPath) and self.overwrite:
                os.remove (self.fullPath)
            writeFileBytes(self.fullPath, content)
            if self.updateVerificationRecords:
                self.contentHash = sha1Digest(content)
            print "Restored FILE %r" % self.fullPath
                    
        def doSynchronized(self):
            if self.updateVerificationRecords:
                self.verificationRecords.markVerified (self.contentKey.datetime,
                                                       self.contentKey.filePath, self.contentHash)
                print "Mark verified FILE %r" % self.fullPath
    
    def restoreDirectory(self, restoreDir, pathSummaryList, hashContentKeyMap, overwrite,
                         updateVerificationRecords = False):
        """Restore a directory using path summaries and hash content key map, with optional overwrite"""
        print "Restoring directory %r ..." % restoreDir
        if updateVerificationRecords:
            verificationRecords = HashVerificationRecords(self.backupMap)
        restoreFileTasks = []
        for pathSummary in pathSummaryList:
            fullPath = pathSummary.fullPath (restoreDir)
            if pathSummary.isDir:
                if not os.path.isdir(fullPath):
                    os.makedirs(fullPath)
                print "Restored DIR %r" % fullPath
            elif pathSummary.isFile:
                if not pathSummary.hash in hashContentKeyMap:
                    print "WARNING: No written content found for %r (hash %s)" % (pathSummary.relativePath,
                                                                                  pathSummary.hash)
                contentKey = hashContentKeyMap[pathSummary.hash]
                restoreFileTasks.append (IncrementalBackups.RestoreFileTask (self.backupMap, contentKey,
                                                                             fullPath, updateVerificationRecords,
                                                                             verificationRecords, overwrite))
            else:
                print "WARNING: Unknown path type %r" % pathSummary
        taskRunner.runTasks (restoreFileTasks)
        if updateVerificationRecords:
            verificationRecords.updateRecords()
            
    def getRestoreDetails(self, dateTimeString):
        backupRecords = self.getBackupRecords()
        print "backupRecords = %r" % backupRecords
        if len(backupRecords) == 0:
            raise "No backup records found"
        print "Get restore records for %s" % (dateTimeString or "(most recent backup)")
        restoreRecords = self.getRestoreRecords(backupRecords, dateTimeString)
        print "restoreRecords = %r" % restoreRecords
        for restoreRecord in restoreRecords:
            print "checkVersion for %r ..." % restoreRecord
            checkVersion(self.backupMap, restoreRecord)
        writtenFileSummaryDataLists = [self.getWrittenFileSummaryDataList(record) for record in restoreRecords]
        print "parsing writtenFileSummaryDataLists from YAML data ..."
        writtenFileSummaryLists = [[PathSummary.fromYamlData(pathSummaryData) for pathSummaryData in pathSummaryDataList]
                                   for pathSummaryDataList in writtenFileSummaryDataLists]
        print "calculating hashContentKeyMap ..."
        hashContentKeyMap = self.getHashContentKeyMap(restoreRecords, writtenFileSummaryLists)
        print "hashContentKeyMap = %r" % hashContentKeyMap
        backupToRestore = restoreRecords[-1]
        print "Target backup for restore: %r" % backupToRestore
        pathSummaryListToRestore = [PathSummary.fromYamlData (pathSummaryData) for pathSummaryData
                                    in self.getPathSummaryDataList(backupToRestore)]
        return pathSummaryListToRestore, hashContentKeyMap, backupToRestore
    
    def getRestoredDirHash(self, dateTimeString = None):
        pathSummaryList, hashContentKeyMap, backupToRestore = self.getRestoreDetails(dateTimeString)
        verificationRecords = HashVerificationRecords(self.backupMap)
        restoredDirHash = BaseDirHash(None, "backed up files")
        for pathSummary in pathSummaryList:
            if pathSummary.isDir:
                restoredDirHash.addDirSummary(pathSummary.relativePath)
                print " DIR %r" % pathSummary.relativePath
            elif pathSummary.isFile:
                contentKey = hashContentKeyMap[pathSummary.hash]
                # We could compare pathSummary.hash and fileHash,
                # but the verified fileHash is what matters (to compare to local file)
                fileHash = verificationRecords.getWrittenFileHash(contentKey.datetime, contentKey.filePath)
                restoredDirHash.addFileSummary(pathSummary.relativePath, fileHash)
                print " FILE %r" % pathSummary.relativePath
            else:
                print "WARNING: Unknown path type %r" % pathSummary
        verificationRecords.updateRecords()
        return restoredDirHash
        
    def incrementalVerify(self, sourceDir):
        """Incrementally verify a directory using path summaries and hash content key map, with optional overwrite"""
        print "Incrementally verifying against directory %r ..." % sourceDir
        restoredDirHash = self.getRestoredDirHash()
        print "RESTORE DIR HASH:"
        restoredDirHash.printIndented()
        print ""
        print "LOCAL DIR HASH for %r" % sourceDir
        localDirHash = DirHash(sourceDir, None, sourceDir)
        localDirHash.printIndented()
        errorDiff = CompareDirectories.ErrorDiff()
        localDirHash.compareToOtherDirHash (restoredDirHash, 0, CompareDirectories.printLog, errorDiff)
        errorDiff.logAndCheck (localDirHash.description, restoredDirHash.description)
            
    def restore(self, restoreDir, dateTimeString = None,
                overwrite = False, updateVerificationRecords = False, allowIncomplete = False):
        """Restore the specified (or otherwise the most recent) backup to a
destination directory (with optional overwrite)"""
        print u"Restoring to %s ..." % restoreDir
        if not os.path.exists(restoreDir):
            os.makedirs(restoreDir)
        if not os.path.isdir(restoreDir):
            raise "Restore target %r is not a directory" % restoreDir
        if not overwrite and len(os.listdir(restoreDir)) > 0:
            raise "Restore target %r is not empty" % restoreDir
        pathSummaryListToRestore, hashContentKeyMap, backupToRestore = self.getRestoreDetails(dateTimeString)
        if not allowIncomplete and not backupToRestore.completed:
            raise "Backup dated %s is not complete and allowIncomplete is set to false" % backupToRestore.datetime
        self.restoreDirectory (restoreDir, pathSummaryListToRestore, hashContentKeyMap,
                               overwrite, updateVerificationRecords)
        print "Restored data to %r" % restoreDir
        
def listBackups(backupMap):
    """List all backups in a backup map"""
    IncrementalBackups(backupMap).listBackups()
        
def pruneBackups(backupMap, keep = 1, dryRun = True):
    """Prune backups in a backup map, keeping specified number of backup groups (minimum 1)"""
    IncrementalBackups(backupMap).pruneBackups(keep = keep, dryRun = dryRun)
 
def doBackup(sourceDirectory, backupMap, testRestoreDir = None, full = False, verify = False,
             doTheBackup = True, verifyIncrementally = False, recordTrigger = 10000000):
    """Do a backup from source directory to backup map, with options 'full' (or incremental)
and 'verify' (in which case a test restore is done to the test restore directory).
Also, if 'doTheBackup' is set to false, only do the test restore and verify.
"""
    startTime = datetime.datetime.now()
    print ""
    print "Started %s" % startTime
    print ""
    if verify and testRestoreDir == None:
        raise "Must supply testRestoreDir argument if verify option is chosen"
    print "Backing up %r ..." % sourceDirectory
    backups = IncrementalBackups(backupMap, recordTrigger)
    srcDirInfo = DirectoryInfo(sourceDirectory)
    if doTheBackup:
        backups.doBackup (srcDirInfo, full = full)
        backupFinishedTime = datetime.datetime.now()
        backupTimeTaken = backupFinishedTime - startTime
        backupFinishedMessage = "Backup finished %s (started %s, took %s)" % (backupFinishedTime,
                                                                              startTime, backupTimeTaken)
        print ""
        print backupFinishedMessage
    restoreStartTime = datetime.datetime.now()
    if verify:
        print ""
        print "Verifying ..."
        if verifyIncrementally:
            print " incrementally ..."
            backups.incrementalVerify (sourceDirectory)
        else:
            print " fully ..."
            print u" removing existing files from %s ..." % testRestoreDir
            shutil.rmtree(testRestoreDir)
            backups.restore(testRestoreDir, overwrite = False, updateVerificationRecords = True)
            CompareDirectories.verifyIdentical(testRestoreDir, srcDirInfo.path)
        verifyFinishedTime = datetime.datetime.now()
        print ""
        if doTheBackup:
            print backupFinishedMessage
        restoreTimeTaken = verifyFinishedTime - restoreStartTime
        print "Verify finished %s (started %s, took %s)" % (verifyFinishedTime, restoreStartTime, restoreTimeTaken)