# Copyright (c) 2008 Philip Dorrell, http://www.1729.com/
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""Full and incremental backups of a directory tree into a key/value "backup map".

A backup map is any mapping-like object supporting ``key in m``, ``m[key]``,
``m[key] = value``, ``del m[key]``, iteration over keys, ``clone()`` and
``subMap(prefix)`` (these are the operations this module calls on it).

Layout of keys written by this module for a backup dated <datetime>:

    backupRecords                  YAML list of all backup records
    <datetime>/version             format version (see BackupsVersion)
    <datetime>/pathList            YAML list of every file/dir in the source
    <datetime>/writtenPathList     YAML list of files whose content was written
    <datetime>/files<path>         content of a written file
    <datetime>/verifiedFileHashes.yaml   hashes verified by re-reading content

NOTE: this module targets Python 2.  Every ``print`` is written as a
single-argument call so that the statements behave identically under
Python 2 and also parse under Python 3.
"""

import yaml
import hashlib
import os
import time
import datetime
import shutil
import CompareDirectories
import re

try:
    from sets import Set
except ImportError:
    # The 'sets' module does not exist on newer interpreters; the builtin
    # set type offers the operations used here (construction and add()).
    Set = set

try:
    unicode
except NameError:
    # No separate unicode type: str is already a text type.
    unicode = str


class BackupError(Exception):
    """Raised for invalid arguments or inconsistent backup data.

    Replaces the string exceptions (``raise "message"``) used previously,
    which raise TypeError on Python 2.6 and later instead of the message."""


def readFileBytes(filename):
    """Read the named file and return its contents as a byte string.

    The file is closed even if reading fails."""
    f = open(filename, "rb")
    try:
        return f.read()
    finally:
        f.close()


def writeFileBytes(filename, bytes):
    """Write a byte string as the new contents of the named file.

    The file is closed even if writing fails."""
    f = open(filename, "wb")
    try:
        f.write(bytes)
    finally:
        f.close()


# Version of the backup format written by this module (stored under
# "<datetime>/version"; backups without that key are treated as version 1).
BackupsVersion = 2


class PathSummary(object):
    """Information about a file or directory specified as a relative path
    within some base directory.

    Note: all paths are '/' separated, whether or not we are in Microsoft
    Windows."""

    def __init__(self, relativePath):
        self.relativePath = relativePath

    def fullPath(self, basePath):
        """Return the full path given the path of the base directory
        (plain concatenation: relative paths start with '/')."""
        return basePath + self.relativePath

    @staticmethod
    def fromYamlData(data):
        """Convert YAML data into a FileSummary or DirSummary (inverse of the
        toYamlData methods).

        Raises BackupError if data["type"] is neither "file" nor "dir"."""
        pathType = data["type"]
        if pathType == "file":
            return FileSummary.fromYamlData(data)
        elif pathType == "dir":
            return DirSummary.fromYamlData(data)
        else:
            raise BackupError("Unknown path type: %s" % pathType)


class FileSummary(PathSummary):
    """Information about a file specified as a relative path within some
    (unspecified) base directory, including a SHA1 hash of the file's
    contents."""

    def __init__(self, relativePath, hash):
        super(FileSummary, self).__init__(relativePath)
        self.isDir = False
        self.isFile = True
        self.hash = hash

    def __unicode__(self):
        return u"FILE: %r : %s" % (self.relativePath, self.hash)

    def __repr__(self):
        return self.__unicode__()

    def toYamlData(self):
        """Convert to a plain dict suitable for YAML serialization."""
        return {"type": "file", "path": self.relativePath, "hash": self.hash}

    @staticmethod
    def fromYamlData(data):
        """Create from YAML data (inverse of toYamlData)."""
        return FileSummary(data["path"], data["hash"])


class DirSummary(PathSummary):
    """Information about a directory specified as a relative path within some
    (unspecified) base directory."""

    def __init__(self, relativePath):
        super(DirSummary, self).__init__(relativePath)
        self.isDir = True
        self.isFile = False

    def __unicode__(self):
        return u"DIR: %r" % (self.relativePath)

    def toYamlData(self):
        """Convert to a plain dict suitable for YAML serialization."""
        return {"type": "dir", "path": self.relativePath}

    def __repr__(self):
        return self.__unicode__()

    @staticmethod
    def fromYamlData(data):
        """Create from YAML data (inverse of toYamlData)."""
        return DirSummary(data["path"])


def sha1Digest(content):
    """Return the hexadecimal SHA1 digest of a byte string."""
    return hashlib.sha1(content).hexdigest()


class DirectoryInfo:
    """Information about all the directories and files within a base directory.

    Every directory is listed before any sub-directories or files contained
    within it."""

    def __init__(self, path):
        """Construct from the path of the base directory.

        This walks the whole tree immediately, reading and hashing every
        file."""
        self.path = unicode(path)
        self.pathSummaries = []
        self.summarizeSubDir(u"")

    def createDirSummary(self, relativePath):
        """Create a path summary for a sub-directory."""
        return DirSummary(relativePath)

    def createFileSummary(self, relativePath):
        """Create a path summary for a file in the base directory, hashing
        the file's current contents."""
        fileName = self.path + relativePath
        content = readFileBytes(fileName)
        fileHash = sha1Digest(content)
        return FileSummary(relativePath, fileHash)

    def addSummary(self, pathSummary):
        """Add a path summary (and print it)."""
        print(u"%r" % pathSummary)
        self.pathSummaries.append(pathSummary)

    def getPathSummariesYamlData(self):
        """Return the list of path summaries as YAML data."""
        return [summary.toYamlData() for summary in self.pathSummaries]

    def summarizeSubDir(self, relativePath):
        """Recursively summarize a sub-directory specified by its relative
        path, adding the path summaries for all contained files and
        sub-directories to the list of path summaries.

        Entries that are neither regular files nor directories are reported
        and skipped."""
        for childName in os.listdir(self.path + relativePath):
            childRelativePath = relativePath + "/" + childName
            childPath = self.path + childRelativePath
            if os.path.isfile(childPath):
                self.addSummary(self.createFileSummary(childRelativePath))
            elif os.path.isdir(childPath):
                # Directory summary is added before descending, which gives
                # the "parents first" ordering restore relies on.
                self.addSummary(self.createDirSummary(childRelativePath))
                self.summarizeSubDir(childRelativePath)
            else:
                print("UNKNOWN OBJECT %r in %r" % (childName, self.path + relativePath))


class HashVerificationRecords(object):
    """Records of verified hashes of backed up files (i.e. verified by
    actually reading the file content out of the backup map and recalculating
    the hash).

    Records are cached per backup date-time and only written back to the
    backup map by updateRecords()."""

    def __init__(self, backupMap):
        self.backupMap = backupMap
        # date-time string -> {file path -> verified content hash}
        self.datetimeFileHashesMap = {}
        # date-time strings whose records changed since loading
        self.datetimeUpdated = Set()

    def getFileHashesMap(self, datetime):
        """Return the {file path -> hash} map for a backup date-time, loading
        it from "<datetime>/verifiedFileHashes.yaml" on first use (an empty
        map if that key does not exist)."""
        if datetime in self.datetimeFileHashesMap:
            return self.datetimeFileHashesMap[datetime]
        fileHashesRecordFilename = datetime + "/verifiedFileHashes.yaml"
        if fileHashesRecordFilename in self.backupMap:
            # safe_load returns None for an empty document; treat as no records
            fileHashesMap = yaml.safe_load(self.backupMap[fileHashesRecordFilename]) or {}
        else:
            fileHashesMap = {}
        self.datetimeFileHashesMap[datetime] = fileHashesMap
        return fileHashesMap

    def markVerified(self, datetime, filePath, contentHash):
        """Record that the content stored for filePath in the given backup
        has been read back and hashes to contentHash."""
        fileHashesMap = self.getFileHashesMap(datetime)
        fileHashesMap[filePath] = contentHash
        self.datetimeUpdated.add(datetime)

    def getWrittenFileHash(self, datetime, filePath):
        """Get the hash of a backed up file, either from an existing hash
        verification record or, failing that, by reading the file contents
        from the backup map and calculating the hash (which is then
        recorded as verified)."""
        fileHashesMap = self.getFileHashesMap(datetime)
        if filePath in fileHashesMap:
            return fileHashesMap[filePath]
        content = self.backupMap[datetime + "/files" + filePath]
        contentHash = sha1Digest(content)
        self.markVerified(datetime, filePath, contentHash)
        return contentHash

    def updateRecords(self):
        """Write any newly verified hashes back into the backup map."""
        print("Verified hashes were updated for %r" % self.datetimeUpdated)
        for datetime in self.datetimeUpdated:
            fileHashesRecordFilename = datetime + "/verifiedFileHashes.yaml"
            print("Updating verification records for %s = %s"
                  % (datetime, self.datetimeFileHashesMap[datetime]))
            self.backupMap[fileHashesRecordFilename] = yaml.safe_dump(
                self.datetimeFileHashesMap[datetime])


class BackupRecord:
    """A record of a backup made: its date/time, whether it was full or
    incremental, and whether it ran to completion."""

    def __init__(self, type, datetime, completed):
        """Construct from 'full' or 'incremental', the date-time string and
        the completion flag."""
        self.type = type
        self.datetime = datetime
        self.completed = completed

    @staticmethod
    def fromYamlData(data):
        """Construct a backup record from YAML data (inverse of toYamlData)."""
        # completed defaults to True because previous version of keevalback
        # only recorded when complete
        return BackupRecord(data["type"], data["datetime"], data.get("completed", True))

    def toYamlData(self):
        """Convert to data to be stored in YAML."""
        return {"type": self.type, "datetime": self.datetime, "completed": self.completed}

    def isFull(self):
        """Is this a full (as opposed to incremental) backup?"""
        return self.type == "full"

    def __str__(self):
        return "[Backup: %s %s %s]" % (self.type, self.datetime,
                                       "complete" if self.completed else "INCOMPLETE")

    def __repr__(self):
        return self.__str__()


class InvalidBackupsVersion(Exception):
    """Raised when a backup was written with a format version other than
    BackupsVersion.  The offending version is available as .version."""

    def __init__(self, backupRecord, version):
        Exception.__init__(
            self,
            "Invalid backup for backup record %s version %d (this version = %d)"
            % (backupRecord, version, BackupsVersion))
        self.version = version


def getBackupsVersion(backupMap, backupRecord):
    """Return the format version stored for a backup (1 if no version key
    was recorded)."""
    versionKey = backupRecord.datetime + "/version"
    if versionKey in backupMap:
        return int(backupMap[versionKey])
    return 1


def checkVersion(backupMap, backupRecord):
    """Raise InvalidBackupsVersion unless the backup's stored version equals
    BackupsVersion."""
    version = getBackupsVersion(backupMap, backupRecord)
    if version != BackupsVersion:
        raise InvalidBackupsVersion(backupRecord, version)


class WrittenRecords:
    """Records of where file contents with a given SHA1 hash value were
    written to in the backup map (within the context of a particular set of
    backups, i.e. a full and following incrementals)."""

    def __init__(self):
        # content hash -> backup map key the content was written to
        self.written = {}

    def recordHashWritten(self, hash, key):
        """Record that contents with a particular hash were written to a
        particular key."""
        print(" record hash %s written to %r" % (hash, key))
        self.written[hash] = key

    def isWritten(self, hash):
        """Has a file contents with this hash value been written to the
        backup map?"""
        return hash in self.written

    def locationWritten(self, hash):
        """Where a file contents with this hash value was written to."""
        return self.written[hash]

    def recordBackup(self, backupMap, backupRecord):
        """For every file contents in a backup record recorded as written,
        record its hash value and backup map key in the written records.

        Raises InvalidBackupsVersion if the backup has a different format
        version."""
        # todo: slow
        checkVersion(backupMap, backupRecord)
        writtenPathListKey = backupRecord.datetime + "/writtenPathList"
        writtenFileSummariesYamlData = yaml.safe_load(backupMap[writtenPathListKey]) or []
        # "/files" infix matches the key the content was actually written
        # under (see BackupFileTask and ContentKey.fileKey).
        filesKeyBase = backupRecord.datetime + "/files"
        for fileData in writtenFileSummariesYamlData:
            self.recordHashWritten(fileData["hash"], filesKeyBase + fileData["path"])

    def recordPreviousBackups(self, backupMap, backupRecords):
        """Record the hashes of all files written from the last full backup
        onwards (or from the first backup if for some reason there is no
        full backup)."""
        # Walk backwards from the most recent record, stopping after the
        # first full backup encountered.
        for backupRecord in reversed(backupRecords):
            print("Recording backup %r ..." % backupRecord)
            self.recordBackup(backupMap, backupRecord)
            if backupRecord.type == "full":
                break


class BaseFileHash(object):
    """Description of a file: its (basic) name and hash, plus a description
    of where the information came from (used in difference messages)."""

    def __init__(self, name, hash, description):
        self.name = name
        self.hash = hash
        self.description = description

    def isDir(self):
        return False

    def printIndented(self, indent):
        print("%sFile %r: %s" % (indent, self.name, self.hash))

    def compareToOtherFileHash(self, otherFileHash, indent, log, logDiff):
        """Report (via logDiff) if the other file has a different hash.
        'indent' and 'log' are accepted for symmetry with
        BaseDirHash.compareToOtherDirHash and are not used."""
        if self.hash != otherFileHash.hash:
            logDiff("File %r has hash %s in %r but hash %s in %r"
                    % (self.name, self.hash, self.description,
                       otherFileHash.hash, otherFileHash.description))


pathRegex = re.compile("[/]([^/]*)([/].*)?")


def analysePath(path):
    """Analyse a path starting with '/' and with '/' separators into 1st part
    and remainder, e.g. '/x/y' into 'x' and '/y', and '/x' into 'x' and None.

    Raises ValueError if the path does not start with '/'."""
    pathMatch = pathRegex.match(path)
    if pathMatch is None:
        raise ValueError("Path %r does not start with '/'" % (path,))
    return (pathMatch.group(1), pathMatch.group(2))


class BaseDirHash(object):
    """Description of a directory as a map of immediate sub-directories and
    immediately contained files."""

    def __init__(self, name, description):
        self.name = name
        self.children = []      # children in insertion order
        self.childrenMap = {}   # child name -> child
        self.description = description

    def isDir(self):
        return True

    def addChild(self, childHash):
        """Add a child, i.e. a directory or file."""
        self.children.append(childHash)
        self.childrenMap[childHash.name] = childHash

    def hasChildNamed(self, childName):
        return childName in self.childrenMap

    def printIndented(self, indent=""):
        """Print this directory and, recursively, its children."""
        print("%sDir %r" % (indent, self.name))
        childIndent = " " + indent
        for child in self.children:
            child.printIndented(indent=childIndent)

    def addFileSummary(self, path, hash):
        """Add a file given its full path name relative to this directory
        (constructing the intermediate sub-directories if they are not
        already there)."""
        rootPath, remainderPath = analysePath(path)
        if remainderPath is None:
            self.addChild(BaseFileHash(rootPath, hash, self.description))
        else:
            self.getOrCreateChildDirHash(rootPath).addFileSummary(remainderPath, hash)

    def getOrCreateChildDirHash(self, name):
        """Return the DirHash for an immediate sub-directory, creating it if
        necessary."""
        if name in self.childrenMap:
            return self.childrenMap[name]
        childDirHash = BaseDirHash(name, self.description)
        self.addChild(childDirHash)
        return childDirHash

    def addDirSummary(self, path):
        """Add a sub-directory given its full path name relative to this
        directory (constructing the intermediate sub-directories if they are
        not already there)."""
        rootPath, remainderPath = analysePath(path)
        if remainderPath is None:
            self.addChild(BaseDirHash(rootPath, self.description))
        else:
            self.getOrCreateChildDirHash(rootPath).addDirSummary(remainderPath)

    def compareToOtherDirHash(self, otherDirHash, indent, log, logDiff):
        """Recursively compare this directory with another one.

        log(indent, message) is called for progress; logDiff(message) is
        called once for every difference found.  'indent' is the nesting
        depth passed through to log (incremented by one per level)."""
        log(indent, "comparing directory %r" % self.name)
        descriptions = (self.description, otherDirHash.description)
        # First pass: everything present on this side.
        for child1 in self.children:
            name1 = child1.name
            child2 = otherDirHash.childrenMap.get(name1, None)
            if child1.isDir():
                if child2 is None:
                    logDiff("%r is a directory in %r but does not exist in %r"
                            % ((name1,) + descriptions))
                elif not child2.isDir():
                    logDiff("%r is a directory in %r but a file in %r"
                            % ((name1,) + descriptions))
                else:
                    child1.compareToOtherDirHash(child2, indent + 1, log, logDiff)
            else:
                if child2 is None:
                    logDiff("%r is a file in %r but does not exist in %r"
                            % ((name1,) + descriptions))
                elif child2.isDir():
                    logDiff("%r is a file in %r but a directory in %r"
                            % ((name1,) + descriptions))
                else:
                    child1.compareToOtherFileHash(child2, indent + 1, log, logDiff)
        # Second pass: things that only exist on the other side.
        for child2 in otherDirHash.children:
            if not self.hasChildNamed(child2.name):
                if child2.isDir():
                    logDiff("%r does not exist in %r but is a directory in %r"
                            % ((child2.name,) + descriptions))
                else:
                    logDiff("%r does not exist in %r but is a file in %r"
                            % ((child2.name,) + descriptions))


class FileHash(BaseFileHash):
    """Information about a file, based on the actual contents of an actual
    file in a file-system directory."""

    def __init__(self, dir, name, description):
        """Read and hash the file 'name' inside directory 'dir'."""
        filename = dir + "/" + name
        content = readFileBytes(filename)
        super(FileHash, self).__init__(name, sha1Digest(content), description)


class DirHash(BaseDirHash):
    """Information about the files within a directory, based on the actual
    contents of an actual file-system directory (read recursively on
    construction)."""

    def __init__(self, dir, name, description):
        """Scan directory 'name' inside 'dir'; if name is None (or empty),
        'dir' itself is scanned."""
        super(DirHash, self).__init__(name, description)
        fullPath = unicode((dir + "/" + name) if name else dir)
        for childName in os.listdir(fullPath):
            childPath = fullPath + "/" + childName
            if os.path.isfile(childPath):
                self.addChild(FileHash(fullPath, childName, self.description))
            elif os.path.isdir(childPath):
                self.addChild(DirHash(fullPath, childName, self.description))
            else:
                # Same treatment as DirectoryInfo.summarizeSubDir: such
                # entries are not backed up, so they are not compared either.
                print("UNKNOWN OBJECT %r in %r" % (childName, fullPath))


class ContentKey(object):
    """Parameters for the key used to look up file contents from a
    particular backup within a backup map."""

    def __init__(self, datetime, filePath):
        """Note that filePath is expected to start with a '/'."""
        self.datetime = datetime
        self.filePath = filePath

    def fileKey(self):
        """The actual key.  Note: "/files" infix is used to allow for other
        meta-data to be associated with the datetime."""
        return self.datetime + "/files" + self.filePath

    def __str__(self):
        return "[%s:%r]" % (self.datetime, self.filePath)

    def __repr__(self):
        return self.__str__()


class BackupRecordUpdater:
    """Object responsible for recording the current state of a backup in
    progress.

    It is passed to the task runner as 'checkpointTask'; presumably the
    runner calls checkpoint() periodically (see ThreadedTaskRunner)."""

    def __init__(self, backups, backupRecords, currentBackupRecord, backupKeyBase,
                 directoryInfo, recordTrigger=1000000):
        self.backups = backups
        self.backupRecords = backupRecords
        self.currentBackupRecord = currentBackupRecord
        self.backupKeyBase = backupKeyBase
        self.directoryInfo = directoryInfo
        # bytesWritten, unrecordedBytes and recordTrigger are stored but not
        # read anywhere in this module.
        self.bytesWritten = 0
        self.unrecordedBytes = 0
        self.recordTrigger = recordTrigger
        # Appended to by BackupFileTask.doSynchronized as files are written.
        self.writtenFileSummaries = []

    def recordVersion(self):
        """Store the format version for this backup."""
        self.backups.backupMap[self.backupKeyBase + "/version"] = str(BackupsVersion)

    def recordPathSummaries(self):
        self.backups.recordPathSummaries(self.backupKeyBase, self.directoryInfo)

    def recordWrittenFileSummaries(self):
        self.backups.recordWrittenFileSummaries(self.backupKeyBase, self.writtenFileSummaries)

    def saveBackupRecords(self):
        self.backups.saveBackupRecords(self.backupRecords)

    def checkpoint(self):
        """Persist the list of files written so far."""
        self.recordWrittenFileSummaries()

    def initialRecord(self):
        """Write everything needed to describe the backup before any file
        content is written (the record is still marked incomplete)."""
        self.recordVersion()
        self.recordPathSummaries()
        self.recordWrittenFileSummaries()
        self.saveBackupRecords()

    def recordCompleted(self):
        """Mark the current backup as completed and persist the final state."""
        self.currentBackupRecord.completed = True
        self.recordWrittenFileSummaries()
        self.saveBackupRecords()


from ThreadedTaskRunner import ThreadedTaskRunner, TaskRunner

# Shared runner for all backup, restore and delete tasks.
# Non-threaded alternative:
#taskRunner = TaskRunner(checkpointFreq = 30)
taskRunner = ThreadedTaskRunner(checkpointFreq=500, numThreads=30)


class DeleteBackupMapValueTask:
    """Task (for the task runner) deleting one key from a backup map."""

    def __init__(self, backupMap, key):
        self.backupMap = backupMap
        self.key = key

    def getThreadLocals(self):
        # Presumably consumed by the task runner to give each worker thread
        # its own backup map handle -- confirm against ThreadedTaskRunner.
        return {"backupMap": self.backupMap.clone()}

    def doUnsynchronized(self):
        print(" delete %r ..." % self.key)
        del self.backupMap[self.key]

    def doSynchronized(self):
        pass


def deleteMapValues(backupMap, dryRun):
    """Delete all keys from a map, or if dryRun is True, only print what
    would be deleted."""
    print("%sDeleting keys from map %s" % ("DRYRUN: " if dryRun else "", backupMap))
    if dryRun:
        for key in backupMap:
            print(" delete %r ..." % key)
    else:
        # All tasks are created before any runs, so the map is not modified
        # while it is being iterated.
        deleteTasks = [DeleteBackupMapValueTask(backupMap, key) for key in backupMap]
        taskRunner.runTasks(deleteTasks)
    print("finished.")


class IncrementalBackups:
    """A set of dated full or incremental backups within a given backup map.

    This object does _not_ (currently) record _where_ the file contents came
    from."""

    def __init__(self, backupMap, recordTrigger=10000000):
        self.backupMap = backupMap
        self.recordTrigger = recordTrigger

    def getDateTimeString(self):
        """Get a date time string (local time) to use for a new dated
        backup."""
        return time.strftime("%Y-%b-%d.%H-%M-%S")

    def getBackupRecords(self):
        """Retrieve the BackupRecord objects describing any existing backups
        (an empty list if none have been recorded)."""
        if "backupRecords" in self.backupMap:
            backupsListYamlData = yaml.safe_load(self.backupMap["backupRecords"]) or []
        else:
            backupsListYamlData = []
        return [BackupRecord.fromYamlData(record) for record in backupsListYamlData]

    def saveBackupRecords(self, backupRecords):
        """Store the given list of BackupRecord objects as the complete list
        of backups."""
        backupRecordsYamlData = [record.toYamlData() for record in backupRecords]
        self.backupMap["backupRecords"] = yaml.safe_dump(backupRecordsYamlData)
        print("new backup records = %r" % backupRecords)

    def getBackupGroups(self):
        """Get backup groups, i.e. backup records grouped into lists of
        incremental backups with a preceding full backup.

        The very first record always starts a group, even if it is not a
        full backup."""
        backupGroups = []
        currentBackupGroup = []
        for i, record in enumerate(self.getBackupRecords()):
            if record.isFull() or i == 0:
                currentBackupGroup = [record]
                backupGroups.append(currentBackupGroup)
            else:
                currentBackupGroup.append(record)
        return backupGroups

    def listBackups(self):
        """Print out a list of all backups; the first backup of each group is
        marked with '*'."""
        for recordGroup in self.getBackupGroups():
            for i, record in enumerate(recordGroup):
                indent = "*" if i == 0 else " "
                print("%s%s: %s %s" % (indent, record.type, record.datetime,
                                       "complete" if record.completed else "INCOMPLETE"))

    def pruneBackup(self, backupRecord, dryRun):
        """Prune the backup indicated by the backup record (with dry-run
        option), deleting every key in its sub-map."""
        print(" prune backup %r" % backupRecord)
        backupSubMap = self.backupMap.subMap(backupRecord.datetime)
        deleteMapValues(backupSubMap, dryRun)

    def pruneBackupGroup(self, recordGroup, dryRun):
        """Prune all backups in a backup group (with dry-run option)."""
        print("Backup group to prune: %r" % recordGroup)
        for record in recordGroup:
            self.pruneBackup(record, dryRun)

    def pruneBackups(self, keep=1, dryRun=True):
        """Prune previous backup groups, keeping only the specified number of
        most recent backup groups (but at least one).

        Raises Exception if keep < 1."""
        print("Pruning backups, keep %d%s" % (keep, ", DRY RUN" if dryRun else ""))
        if keep < 1:
            raise Exception("Number of full backups to keep must be at least 1")
        recordGroups = self.getBackupGroups()
        if keep >= len(recordGroups):
            print("Only %d full backups, and %d specified to keep, so none will be pruned"
                  % (len(recordGroups), keep))
            return
        numToPrune = len(recordGroups) - keep
        for recordGroup in recordGroups[:numToPrune]:
            self.pruneBackupGroup(recordGroup, dryRun=dryRun)
        if not dryRun:
            # Only forget the pruned backups once their data has been deleted.
            remainingRecords = []
            for group in recordGroups[numToPrune:]:
                remainingRecords += group
            self.saveBackupRecords(remainingRecords)

    def recordPathSummaries(self, backupKeyBase, directoryInfo):
        """Store the summaries of every path in the source directory under
        "<backupKeyBase>/pathList"."""
        pathListKey = backupKeyBase + "/pathList"
        print("Record path summaries to %s ..." % pathListKey)
        self.backupMap[pathListKey] = yaml.safe_dump(directoryInfo.getPathSummariesYamlData())

    def recordWrittenFileSummaries(self, backupKeyBase, writtenFileSummaries):
        """Store the summaries of files whose content has been written under
        "<backupKeyBase>/writtenPathList"."""
        writtenPathListKey = backupKeyBase + "/writtenPathList"
        print("Record written file summaries to %s ..." % writtenPathListKey)
        writtenFileSummariesYamlData = [summary.toYamlData() for summary in writtenFileSummaries]
        self.backupMap[writtenPathListKey] = yaml.safe_dump(writtenFileSummariesYamlData)

    class BackupFileTask:
        """Task (for the task runner) writing one file's content into the
        backup map and then recording that it was written."""

        def __init__(self, backupMap, backupFilesKeyBase, pathSummary, fileName,
                     writtenRecords, writtenFileSummaries):
            self.backupMap = backupMap
            self.backupFilesKeyBase = backupFilesKeyBase
            self.pathSummary = pathSummary
            self.fileName = fileName
            self.writtenRecords = writtenRecords
            self.writtenFileSummaries = writtenFileSummaries

        def getThreadLocals(self):
            # Presumably consumed by the task runner to give each worker
            # thread its own backup map handle -- confirm against
            # ThreadedTaskRunner.
            return {"backupMap": self.backupMap.clone()}

        def doUnsynchronized(self):
            """Read the source file and write its content to the backup map."""
            content = readFileBytes(self.fileName)
            self.fileContentKey = self.backupFilesKeyBase + self.pathSummary.relativePath
            print("Writing %r ..." % self.fileContentKey)
            self.backupMap[self.fileContentKey] = content

        def doSynchronized(self):
            """Update the shared bookkeeping structures."""
            self.writtenFileSummaries.append(self.pathSummary)
            self.writtenRecords.recordHashWritten(self.pathSummary.hash, self.fileContentKey)

    def doBackup(self, directoryInfo, full=True):
        """Create a new backup of a source directory (full or incremental).

        Note: 'incremental' is based on comparing the hashes of file contents
        already marked as written to previous backups in the same backup
        group.  It is not based on any comparison of files done on the source
        computer.  If a given file contents has already been written, its
        content is not written again; restore finds it through the hash
        (which may or may not belong to the same file in the same place on
        the source computer).

        If an incremental backup is requested but there are no previous
        backup records, a full backup is made (and recorded as such)."""
        dateTimeString = self.getDateTimeString()
        backupKeyBase = dateTimeString
        backupFilesKeyBase = backupKeyBase + "/files"
        print("retrieving existing backup records ...")
        backupRecords = self.getBackupRecords()
        print("backup records = %r" % backupRecords)
        # This check must happen before the new record is appended, otherwise
        # the list can never be empty.
        if not full and not backupRecords:
            full = True
            print("No previous records, so backup will be FULL anyway")
        previousBackupRecords = list(backupRecords)
        currentBackupRecord = BackupRecord("full" if full else "incremental",
                                           dateTimeString, completed=False)
        backupRecords.append(currentBackupRecord)
        backupRecordUpdater = BackupRecordUpdater(self, backupRecords, currentBackupRecord,
                                                  backupKeyBase, directoryInfo,
                                                  recordTrigger=self.recordTrigger)
        backupRecordUpdater.initialRecord()
        writtenRecords = WrittenRecords()
        if not full:
            # The current backup has written nothing yet, so only the
            # previous records need to be consulted.
            writtenRecords.recordPreviousBackups(self.backupMap, previousBackupRecords)
        backupFileTasks = []
        for pathSummary in directoryInfo.pathSummaries:
            if pathSummary.isDir:
                continue
            fileName = pathSummary.fullPath(directoryInfo.path)
            if writtenRecords.isWritten(pathSummary.hash):
                print("Content of %r already written to %r"
                      % (pathSummary, writtenRecords.locationWritten(pathSummary.hash)))
            else:
                backupFileTasks.append(IncrementalBackups.BackupFileTask(
                    self.backupMap, backupFilesKeyBase, pathSummary, fileName,
                    writtenRecords, backupRecordUpdater.writtenFileSummaries))
        taskRunner.runTasks(backupFileTasks, checkpointTask=backupRecordUpdater)
        backupRecordUpdater.recordCompleted()

    def doFullBackup(self, directoryInfo):
        """Do a full backup of a source directory."""
        self.doBackup(directoryInfo, full=True)

    def doIncrementalBackup(self, directoryInfo):
        """Do an incremental backup of a source directory."""
        self.doBackup(directoryInfo, full=False)

    def getBackupRecordForDateTime(self, backupRecords, dateTimeString):
        """Return the index (not the record) within backupRecords of the
        backup with the given date-time string.

        Raises BackupError if there is no such backup."""
        for index, backupRecord in enumerate(backupRecords):
            if backupRecord.datetime == dateTimeString:
                return index
        raise BackupError("No backup record found for date-time %r" % dateTimeString)

    def getRestoreRecords(self, backupRecords, dateTimeString):
        """Return the records needed to restore the backup with the given
        date-time (or the most recent backup if dateTimeString is None):
        the closest preceding full backup up to and including the target.

        If there is no full backup at or before the target, the records
        from the first backup onwards are returned."""
        if dateTimeString is None:
            restorePos = len(backupRecords) - 1
        else:
            restorePos = self.getBackupRecordForDateTime(backupRecords, dateTimeString)
        pos = restorePos
        # Stop at index 0 at the latest: letting pos reach -1 would make the
        # slice below start from the *end* of the list.
        while pos > 0 and backupRecords[pos].type != "full":
            pos -= 1
        return backupRecords[pos:(restorePos + 1)]

    def getPathSummaryDataList(self, backupRecord):
        """Get YAML data representing information about all files and
        directories present in the source for a specified dated backup."""
        backupKeyBase = backupRecord.datetime
        print("getPathSummaryDataList for %r ..." % backupRecord)
        return yaml.safe_load(self.backupMap[backupKeyBase + "/pathList"])

    def getWrittenFileSummaryDataList(self, backupRecord):
        """Get YAML data representing information about the files whose
        content was written in a specified dated backup."""
        backupKeyBase = backupRecord.datetime
        print("getWrittenFileSummaryDataList for %r ..." % backupRecord)
        writtenPathListKey = backupKeyBase + "/writtenPathList"
        return yaml.safe_load(self.backupMap[writtenPathListKey])

    def getHashContentKeyMap(self, restoreRecords, writtenFileSummaryLists):
        """Construct a map from hash values to the ContentKeys to which those
        file contents were written (within the given backup group which is
        being restored from).

        If the same hash was written more than once, the latest one wins."""
        hashContentKeyMap = {}
        for restoreRecord, writtenFileSummaryList in zip(restoreRecords, writtenFileSummaryLists):
            for writtenFileSummary in writtenFileSummaryList:
                hashContentKeyMap[writtenFileSummary.hash] = ContentKey(
                    restoreRecord.datetime, writtenFileSummary.relativePath)
        return hashContentKeyMap

    class RestoreFileTask:
        """Task (for the task runner) restoring one file from the backup map,
        optionally recording the hash of the content read back."""

        def __init__(self, backupMap, contentKey, fullPath, updateVerificationRecords,
                     verificationRecords, overwrite):
            self.backupMap = backupMap
            self.contentKey = contentKey
            self.fullPath = fullPath
            self.updateVerificationRecords = updateVerificationRecords
            # May be None when updateVerificationRecords is False.
            self.verificationRecords = verificationRecords
            self.overwrite = overwrite

        def getThreadLocals(self):
            # Presumably consumed by the task runner to give each worker
            # thread its own backup map handle -- confirm against
            # ThreadedTaskRunner.
            return {"backupMap": self.backupMap.clone()}

        def doUnsynchronized(self):
            """Read the content from the backup map and write the file."""
            content = self.backupMap[self.contentKey.fileKey()]
            if os.path.exists(self.fullPath) and self.overwrite:
                os.remove(self.fullPath)
            writeFileBytes(self.fullPath, content)
            if self.updateVerificationRecords:
                self.contentHash = sha1Digest(content)
            print("Restored FILE %r" % self.fullPath)

        def doSynchronized(self):
            """Record the verified hash in the shared verification records."""
            if self.updateVerificationRecords:
                self.verificationRecords.markVerified(
                    self.contentKey.datetime, self.contentKey.filePath, self.contentHash)
                print("Mark verified FILE %r" % self.fullPath)

    def restoreDirectory(self, restoreDir, pathSummaryList, hashContentKeyMap, overwrite,
                         updateVerificationRecords=False):
        """Restore a directory using path summaries and hash content key map,
        with optional overwrite.

        Directories are created immediately; files are restored by the task
        runner.  A file whose content hash is not in hashContentKeyMap is
        reported with a warning and skipped.  If updateVerificationRecords is
        true, the hash of every restored file's content is recorded in the
        backup map."""
        print("Restoring directory %r ..." % restoreDir)
        # Must be bound even when verification is off, because it is passed
        # to every RestoreFileTask.
        verificationRecords = None
        if updateVerificationRecords:
            verificationRecords = HashVerificationRecords(self.backupMap)
        restoreFileTasks = []
        for pathSummary in pathSummaryList:
            fullPath = pathSummary.fullPath(restoreDir)
            if pathSummary.isDir:
                if not os.path.isdir(fullPath):
                    os.makedirs(fullPath)
                print("Restored DIR %r" % fullPath)
            elif pathSummary.isFile:
                if pathSummary.hash not in hashContentKeyMap:
                    print("WARNING: No written content found for %r (hash %s)"
                          % (pathSummary.relativePath, pathSummary.hash))
                    continue
                contentKey = hashContentKeyMap[pathSummary.hash]
                restoreFileTasks.append(IncrementalBackups.RestoreFileTask(
                    self.backupMap, contentKey, fullPath, updateVerificationRecords,
                    verificationRecords, overwrite))
            else:
                print("WARNING: Unknown path type %r" % pathSummary)
        taskRunner.runTasks(restoreFileTasks)
        if updateVerificationRecords:
            verificationRecords.updateRecords()

    def getRestoreDetails(self, dateTimeString):
        """Collect what is needed to restore the backup with the given
        date-time (or the most recent one if None).

        Returns a tuple (pathSummaryListToRestore, hashContentKeyMap,
        backupToRestore).

        Raises BackupError if there are no backups or the date-time is
        unknown, and InvalidBackupsVersion if any backup involved has a
        different format version."""
        backupRecords = self.getBackupRecords()
        print("backupRecords = %r" % backupRecords)
        if len(backupRecords) == 0:
            raise BackupError("No backup records found")
        print("Get restore records for %s" % (dateTimeString or "(most recent backup)"))
        restoreRecords = self.getRestoreRecords(backupRecords, dateTimeString)
        print("restoreRecords = %r" % restoreRecords)
        for restoreRecord in restoreRecords:
            print("checkVersion for %r ..." % restoreRecord)
            checkVersion(self.backupMap, restoreRecord)
        writtenFileSummaryDataLists = [self.getWrittenFileSummaryDataList(record)
                                       for record in restoreRecords]
        print("parsing writtenFileSummaryDataLists from YAML data ...")
        writtenFileSummaryLists = [
            [PathSummary.fromYamlData(pathSummaryData) for pathSummaryData in pathSummaryDataList]
            for pathSummaryDataList in writtenFileSummaryDataLists]
        print("calculating hashContentKeyMap ...")
        hashContentKeyMap = self.getHashContentKeyMap(restoreRecords, writtenFileSummaryLists)
        print("hashContentKeyMap = %r" % hashContentKeyMap)
        backupToRestore = restoreRecords[-1]
        print("Target backup for restore: %r" % backupToRestore)
        pathSummaryListToRestore = [
            PathSummary.fromYamlData(pathSummaryData)
            for pathSummaryData in self.getPathSummaryDataList(backupToRestore)]
        return pathSummaryListToRestore, hashContentKeyMap, backupToRestore

    def getRestoredDirHash(self, dateTimeString=None):
        """Build a BaseDirHash tree describing what a restore of the given
        (or most recent) backup would produce, using verified hashes of the
        stored content (reading and hashing content that has not been
        verified before)."""
        pathSummaryList, hashContentKeyMap, backupToRestore = self.getRestoreDetails(dateTimeString)
        verificationRecords = HashVerificationRecords(self.backupMap)
        restoredDirHash = BaseDirHash(None, "backed up files")
        for pathSummary in pathSummaryList:
            if pathSummary.isDir:
                restoredDirHash.addDirSummary(pathSummary.relativePath)
                print(" DIR %r" % pathSummary.relativePath)
            elif pathSummary.isFile:
                contentKey = hashContentKeyMap[pathSummary.hash]
                # We could compare pathSummary.hash and fileHash,
                # but the verified fileHash is what matters (to compare to local file)
                fileHash = verificationRecords.getWrittenFileHash(
                    contentKey.datetime, contentKey.filePath)
                restoredDirHash.addFileSummary(pathSummary.relativePath, fileHash)
                print(" FILE %r" % pathSummary.relativePath)
            else:
                print("WARNING: Unknown path type %r" % pathSummary)
        verificationRecords.updateRecords()
        return restoredDirHash

    def incrementalVerify(self, sourceDir):
        """Verify the most recent backup against a source directory without
        doing a full restore: hashes of the local files are compared with
        verified hashes of the backed up content."""
        print("Incrementally verifying against directory %r ..." % sourceDir)
        restoredDirHash = self.getRestoredDirHash()
        print("RESTORE DIR HASH:")
        restoredDirHash.printIndented()
        print("")
        print("LOCAL DIR HASH for %r" % sourceDir)
        localDirHash = DirHash(sourceDir, None, sourceDir)
        localDirHash.printIndented()
        errorDiff = CompareDirectories.ErrorDiff()
        localDirHash.compareToOtherDirHash(restoredDirHash, 0,
                                           CompareDirectories.printLog, errorDiff)
        errorDiff.logAndCheck(localDirHash.description, restoredDirHash.description)

    def restore(self, restoreDir, dateTimeString=None, overwrite=False,
                updateVerificationRecords=False, allowIncomplete=False):
        """Restore the specified (or otherwise the most recent) backup to a
        destination directory (with optional overwrite).

        The destination is created if it does not exist.

        Raises BackupError if the destination is not a directory, if it is
        not empty and overwrite is false, or if the backup is incomplete and
        allowIncomplete is false."""
        print(u"Restoring to %s ..." % restoreDir)
        if not os.path.exists(restoreDir):
            os.makedirs(restoreDir)
        if not os.path.isdir(restoreDir):
            raise BackupError("Restore target %r is not a directory" % restoreDir)
        if not overwrite and len(os.listdir(restoreDir)) > 0:
            raise BackupError("Restore target %r is not empty" % restoreDir)
        pathSummaryListToRestore, hashContentKeyMap, backupToRestore = \
            self.getRestoreDetails(dateTimeString)
        if not allowIncomplete and not backupToRestore.completed:
            raise BackupError(
                "Backup dated %s is not complete and allowIncomplete is set to false"
                % backupToRestore.datetime)
        self.restoreDirectory(restoreDir, pathSummaryListToRestore, hashContentKeyMap,
                              overwrite, updateVerificationRecords)
        print("Restored data to %r" % restoreDir)


def listBackups(backupMap):
    """List all backups in a backup map."""
    IncrementalBackups(backupMap).listBackups()


def pruneBackups(backupMap, keep=1, dryRun=True):
    """Prune backups in a backup map, keeping the specified number of backup
    groups (minimum 1)."""
    IncrementalBackups(backupMap).pruneBackups(keep=keep, dryRun=dryRun)


def doBackup(sourceDirectory, backupMap, testRestoreDir=None, full=False, verify=False,
             doTheBackup=True, verifyIncrementally=False, recordTrigger=10000000):
    """Do a backup from source directory to backup map, with options 'full'
    (or incremental) and 'verify'.

    With 'verify', the backup is checked either incrementally (comparing
    hashes, see IncrementalBackups.incrementalVerify) or fully, in which case
    testRestoreDir is emptied, a test restore is done into it and the result
    is compared with the source directory.  If 'doTheBackup' is false, only
    the verification is done.

    Raises BackupError if verify is requested without testRestoreDir."""
    startTime = datetime.datetime.now()
    print("")
    print("Started %s" % startTime)
    print("")
    if verify and testRestoreDir is None:
        raise BackupError("Must supply testRestoreDir argument if verify option is chosen")
    print("Backing up %r ..." % sourceDirectory)
    backups = IncrementalBackups(backupMap, recordTrigger)
    # Scanned even when doTheBackup is false: full verification needs its path.
    srcDirInfo = DirectoryInfo(sourceDirectory)
    if doTheBackup:
        backups.doBackup(srcDirInfo, full=full)
        backupFinishedTime = datetime.datetime.now()
        backupTimeTaken = backupFinishedTime - startTime
        backupFinishedMessage = "Backup finished %s (started %s, took %s)" % (
            backupFinishedTime, startTime, backupTimeTaken)
        print("")
        print(backupFinishedMessage)
    restoreStartTime = datetime.datetime.now()
    if verify:
        print("")
        print("Verifying ...")
        if verifyIncrementally:
            print(" incrementally ...")
            backups.incrementalVerify(sourceDirectory)
        else:
            print(" fully ...")
            print(u" removing existing files from %s ..." % testRestoreDir)
            # rmtree fails on a missing directory; restore() creates it anyway.
            if os.path.isdir(testRestoreDir):
                shutil.rmtree(testRestoreDir)
            backups.restore(testRestoreDir, overwrite=False, updateVerificationRecords=True)
            CompareDirectories.verifyIdentical(testRestoreDir, srcDirInfo.path)
        verifyFinishedTime = datetime.datetime.now()
        print("")
        if doTheBackup:
            print(backupFinishedMessage)
        restoreTimeTaken = verifyFinishedTime - restoreStartTime
        print("Verify finished %s (started %s, took %s)" % (
            verifyFinishedTime, restoreStartTime, restoreTimeTaken))