@@ -516,7 +516,7 @@ private function lookupPathFileTypes(
516516 $ parent = $ repository_uri .$ parent .'@ ' .$ lookup ['rawCommit ' ];
517517 $ parent = escapeshellarg ($ parent );
518518 $ parents [$ parent ] = true ;
519- $ path_mapping [$ parent ][] = $ path ;
519+ $ path_mapping [$ parent ][] = dirname ( $ path) ;
520520 }
521521
522522 $ result_map = array ();
@@ -592,21 +592,104 @@ private function lookupRecursiveFileList(
592592 $ rev = $ info ['rawCommit ' ];
593593 $ path = $ this ->encodeSVNPath ($ path );
594594
595- // TODO: This is a scalability nightmare.
595+ $ hashkey = md5 ($ repository ->getDetail ('remote-uri ' ).$ path .'@ ' .$ rev );
596+
597+ // This method is quite horrible. The underlying challenge is that some
598+ // commits in the Facebook repository are enormous, taking multiple hours
599+ // to 'ls -R' out of the repository and producing XML files >1GB in size.
600+
601+ // If we try to load them with SimpleXML, the object exhausts available memory
602+ // on a 64G machine. Instead, cache the XML output and then parse it line by
603+ // line to limit space requirements.
604+
605+ $ cache_loc = sys_get_temp_dir ().'/diffusion. ' .$ hashkey .'.svnls ' ;
606+ if (!Filesystem::pathExists ($ cache_loc )) {
607+ $ tmp = new TempFile ();
608+ execx (
609+ 'svn --non-interactive --xml ls -R %s%s@%d > %s ' ,
610+ $ repository ->getDetail ('remote-uri ' ),
611+ $ path ,
612+ $ rev ,
613+ $ tmp );
614+ execx (
615+ 'mv %s %s ' ,
616+ $ tmp ,
617+ $ cache_loc );
618+ }
619+
620+ $ map = $ this ->parseRecursiveListFileData ($ cache_loc );
621+ Filesystem::remove ($ cache_loc );
596622
597- list ($ raw_xml ) = execx (
598- 'svn --non-interactive --xml ls -R %s%s@%d ' ,
599- $ repository ->getDetail ('remote-uri ' ),
600- $ path ,
601- $ rev );
623+ return $ map ;
624+ }
602625
626+ private function parseRecursiveListFileData ($ file_path ) {
603627 $ map = array ();
604628
605- $ xml = new SimpleXMLElement ($ raw_xml );
606- foreach ($ xml ->list [0 ] as $ entry ) {
607- $ key = (string )$ entry ->name ;
608- $ file_type = $ this ->getFileTypeFromSVNKind ($ entry ['kind ' ]);
609- $ map [$ key ] = $ file_type ;
629+ $ mode = 'xml ' ;
630+ $ done = false ;
631+ $ entry = null ;
632+ foreach (new LinesOfALargeFile ($ file_path ) as $ lno => $ line ) {
633+ switch ($ mode ) {
634+ case 'entry ' :
635+ if ($ line == '</entry> ' ) {
636+ $ entry = implode ('' , $ entry );
637+ $ pattern = '@^\s+kind="(file|dir)"> ' .
638+ '<name>(.*?)</name> ' .
639+ '(<size>(.*?)</size>)?@ ' ;
640+ $ matches = null ;
641+ if (!preg_match ($ pattern , $ entry , $ matches )) {
642+ throw new Exception ("Unable to parse entry! " );
643+ }
644+ $ map [html_entity_decode ($ matches [2 ])] =
645+ $ this ->getFileTypeFromSVNKind ($ matches [1 ]);
646+ $ mode = 'entry-or-end ' ;
647+ } else {
648+ $ entry [] = $ line ;
649+ }
650+ break ;
651+ case 'entry-or-end ' :
652+ if ($ line == '</list> ' ) {
653+ $ done = true ;
654+ break 2 ;
655+ } else if ($ line == '<entry ' ) {
656+ $ mode = 'entry ' ;
657+ $ entry = array ();
658+ } else {
659+ throw new Exception ("Expected </list> or <entry, got {$ line }. " );
660+ }
661+ break ;
662+ case 'xml ' :
663+ $ expect = '<?xml version="1.0"?> ' ;
664+ if ($ line !== $ expect ) {
665+ throw new Exception ("Expected ' {$ expect }', got {$ line }. " );
666+ }
667+ $ mode = 'list ' ;
668+ break ;
669+ case 'list ' :
670+ $ expect = '<lists> ' ;
671+ if ($ line !== $ expect ) {
672+ throw new Exception ("Expected ' {$ expect }', got {$ line }. " );
673+ }
674+ $ mode = 'list1 ' ;
675+ break ;
676+ case 'list1 ' :
677+ $ expect = '<list ' ;
678+ if ($ line !== $ expect ) {
679+ throw new Exception ("Expected ' {$ expect }', got {$ line }. " );
680+ }
681+ $ mode = 'list2 ' ;
682+ break ;
683+ case 'list2 ' :
684+ if (!preg_match ('/^\s+path="/ ' , $ line )) {
685+ throw new Exception ("Expected ' path=...', got {$ line }. " );
686+ }
687+ $ mode = 'entry-or-end ' ;
688+ break ;
689+ }
690+ }
691+ if (!$ done ) {
692+ throw new Exception ("Unexpected end of file. " );
610693 }
611694
612695 return $ map ;
@@ -635,10 +718,3 @@ private function expandAllParentPaths($path, $include_self = false) {
635718 }
636719
637720}
638-
639-
640-
641-
642-
643-
644-
0 commit comments