In [None]:
import org.apache.spark._

# 가장 유명한 마블 히어로 찾기

- Marvel-graph.txt: 각 줄은 히어로 ID로 시작하고, 그 뒤에 해당 히어로와 함께 등장한(co-occurrence) 히어로 ID들이 공백으로 구분되어 이어짐
- Marvel-names.txt: 히어로 ID와 히어로 이름의 mapping table

In [27]:
// Relative paths of the two input files used throughout the notebook.
val SUPERHERO_GRAPH: String = "./data/Marvel-graph.txt"
val SUPERHERO_NAMES: String = "./data/Marvel-names.txt"

SUPERHERO_GRAPH: String = ./data/Marvel-graph.txt
SUPERHERO_NAMES: String = ./data/Marvel-names.txt


- To do list
    - 히어로 ID별 co-occurrence를 계산 (히어로 ID, co-occurrence)
    - Key, Value를 바꿈 (co-occurrence, 히어로 ID)
    - 가장 높은 co-occurrence를 가진 히어로 ID를 탐색

In [43]:
// Load both input files as RDDs of raw text lines.
val names: org.apache.spark.rdd.RDD[String] = sc.textFile(SUPERHERO_NAMES)
val graph: org.apache.spark.rdd.RDD[String] = sc.textFile(SUPERHERO_GRAPH)

names: org.apache.spark.rdd.RDD[String] = ./data/Marvel-names.txt MapPartitionsRDD[44] at textFile at <console>:28
graph: org.apache.spark.rdd.RDD[String] = ./data/Marvel-graph.txt MapPartitionsRDD[46] at textFile at <console>:29


- names parsing

In [3]:
names.take(10).foreach(println)

1 "24-HOUR MAN/EMMANUEL"
2 "3-D MAN/CHARLES CHAN"
3 "4-D MAN/MERCURIO"
4 "8-BALL/"
5 "A"
6 "A'YIN"
7 "ABBOTT, JACK"
8 "ABCISSA"
9 "ABEL"
10 "ABOMINATION/EMIL BLO"


- 1 "24-HOUR MAN/EMMANUEL"를 ("id(공백)", "히어로이름")으로 parsing 한다고 가정

In [57]:
names.map(line => line.split("\"")).take(10)

res46: Array[Array[String]] = Array(Array("1 ", 24-HOUR MAN/EMMANUEL), Array("2 ", 3-D MAN/CHARLES CHAN), Array("3 ", 4-D MAN/MERCURIO), Array("4 ", 8-BALL/), Array("5 ", A), Array("6 ", A'YIN), Array("7 ", ABBOTT, JACK), Array("8 ", ABCISSA), Array("9 ", ABEL), Array("10 ", ABOMINATION/EMIL BLO))


- 공백 제거전

In [37]:
// First field of each of the first ten lines, before trimming.
// A single take(10) after the transformations is enough; a second take(10)
// on an array that already holds at most ten elements does nothing.
names.map(line => line.split("\"")).map(x => x(0)).take(10)

res27: Array[String] = Array("1 ", "2 ", "3 ", "4 ", "5 ", "6 ", "7 ", "8 ", "9 ", "10 ")


- 공백 제거후 정수형 변환

In [38]:
names.map(line => line.split("\"")).map(x=> x(0).trim().toInt).take(10)

res28: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)


- name 선택

In [42]:
names.map(line => line.split("\"")).map(x=> x(1)).take(10)

res32: Array[String] = Array(24-HOUR MAN/EMMANUEL, 3-D MAN/CHARLES CHAN, 4-D MAN/MERCURIO, 8-BALL/, A, A'YIN, ABBOTT, JACK, ABCISSA, ABEL, ABOMINATION/EMIL BLO)


- 위의 내용들을 아래 함수로 정의
- Option
    - Data가 없을때 None으로 예외처리하기 위해 사용

In [23]:
/** Parses one line of the names file into an `(id, name)` pair.
  *
  * The line is split on double quotes: the text before the first quote is
  * trimmed and read as the integer ID, and the text between the first pair of
  * quotes is the name. For example `1 "24-HOUR MAN/EMMANUEL"` becomes
  * `Some((1, "24-HOUR MAN/EMMANUEL"))`.
  *
  * @param line one raw line of the names file
  * @return `Some((id, name))` when the line contains a quoted name preceded by
  *         an integer ID; `None` when the quoted name is missing or the ID is
  *         not an integer, so that `flatMap` simply drops the line
  */
def parseName(line: String): Option[(Int, String)] = {
    line.split("\"") match {
        // At least two fields: the ID part and the quoted name.
        case Array(rawId, name, _*) =>
            // Try turns a NumberFormatException from a malformed ID into None
            // instead of failing the whole Spark job.
            scala.util.Try(rawId.trim.toInt).toOption.map(id => (id, name))
        // No quoted name on this line; flatMap discards None results.
        case _ => None
    }
}

parseName: (line: String)Option[(Int, String)]


In [24]:
val parsed_names = names.flatMap(parseName)

parsed_names: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[23] at flatMap at <console>:29


In [26]:
parsed_names.take(10)

res18: Array[(Int, String)] = Array((1,24-HOUR MAN/EMMANUEL), (2,3-D MAN/CHARLES CHAN), (3,4-D MAN/MERCURIO), (4,8-BALL/), (5,A), (6,A'YIN), (7,ABBOTT, JACK), (8,ABCISSA), (9,ABEL), (10,ABOMINATION/EMIL BLO))


- graph parsing

In [44]:
graph.take(10).foreach(println)

5988 748 1722 3752 4655 5743 1872 3413 5527 6368 6085 4319 4728 1636 2397 3364 4001 1614 1819 1585 732 2660 3952 2507 3891 2070 2239 2602 612 1352 5447 4548 1596 5488 1605 5517 11 479 2554 2043 17 865 4292 6312 473 534 1479 6375 4456 
5989 4080 4264 4446 3779 2430 2297 6169 3530 3272 4282 6432 2548 4140 185 105 3878 2429 1334 4595 2767 3956 3877 4776 4946 3407 128 269 5775 5121 481 5516 4758 4053 1044 1602 3889 1535 6038 533 3986 
5982 217 595 1194 3308 2940 1815 794 1503 5197 859 5096 6039 2664 651 2244 528 284 1449 1097 1172 1092 108 3405 5204 387 4607 4545 3705 4930 1805 4712 4404 247 4754 4427 1845 536 5795 5978 533 3984 6056 
5983 1165 3836 4361 1282 716 4289 4646 6300 5084 2397 4454 1913 5861 5485 
5980 2731 3712 1587 6084 2472 2546 6313 875 859 323 2664 1469 522 2506 2919 2423 3624 5736 5046 1787 5776 3245 3840 2399 
5981 3569 5353 4087 2653 2058 2218 5354 5306 3135 4088 4869 2958 2959 5732 4076 4155 291 
5986 2658 3712 2650 1265 133 4024 6313 3120 6066 3546 403 545 4860 4337 22

- "\\s+": 1개 이상의 연속된 공백 문자를 구분자로 하여 문자열을 분리 (공백을 제거하는 것이 아니라 split의 기준으로 사용)
- 함수 연산과정을 보여주기 위해 첫번째 문장을 샘플링

In [68]:
// Tokenised first line of the graph file, used as a sample below.
// first() fetches only the one element that is needed, whereas take(10)(0)
// collected ten split lines and discarded nine of them.
val line = graph.map(x => x.split("\\s+")).first()

line: Array[String] = Array(5988, 748, 1722, 3752, 4655, 5743, 1872, 3413, 5527, 6368, 6085, 4319, 4728, 1636, 2397, 3364, 4001, 1614, 1819, 1585, 732, 2660, 3952, 2507, 3891, 2070, 2239, 2602, 612, 1352, 5447, 4548, 1596, 5488, 1605, 5517, 11, 479, 2554, 2043, 17, 865, 4292, 6312, 473, 534, 1479, 6375, 4456)


In [71]:
line(0).toInt

res56: Int = 5988


In [69]:
line.length

res54: Int = 49


- 위 내용을 함수로 정리

In [61]:
/** Parses one line of the graph file into `(heroId, connectionCount)`.
  *
  * The first whitespace-separated token is the hero ID; every remaining token
  * on the line counts as one connection.
  *
  * @param line one raw line of the graph file
  * @return the hero ID and the number of tokens that follow it on the line
  * @throws NumberFormatException if the first token is not an integer
  *                               (this includes a blank line)
  */
def parseGraph(line: String): (Int, Int) = {
    // Trim first: with leading whitespace, split would emit an empty first
    // token and "".toInt would throw.
    val tokens = line.trim.split("\\s+")
    // (hero ID, number of connections)
    (tokens(0).toInt, tokens.length - 1)
}

parseGraph: (line: String)(Int, Int)


In [62]:
val parsed_graph = graph.map(parseGraph)

parsed_graph: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[62] at map at <console>:31


In [63]:
parsed_graph.take(10)

res50: Array[(Int, Int)] = Array((5988,48), (5989,40), (5982,42), (5983,14), (5980,24), (5981,17), (5986,142), (5987,81), (5984,41), (5985,19))


- reduceByKey
    - 히어로 ID별 co-occurrence를 계산 (히어로 ID, co-occurrence)

In [72]:
// Add up the connection counts of all entries that share the same hero ID;
// counts that belong to different IDs are never combined.
val reducedGraph = parsed_graph.reduceByKey(_ + _)

reducedGraph: org.apache.spark.rdd.RDD[(Int, Int)] = ShuffledRDD[66] at reduceByKey at <console>:34


In [77]:
reducedGraph.take(10).foreach(println)

(4904,68)
(1084,263)
(384,21)
(6400,15)
(3702,17)
(6308,89)
(5618,19)
(5354,88)
(1894,8)
(4926,11)


- Key, Value를 바꿈 (co-occurrence, 히어로 ID)

In [75]:
// Put the count first so that tuple ordering compares by count before hero ID.
val flipped = reducedGraph.map { case (heroId, count) => (count, heroId) }

flipped: org.apache.spark.rdd.RDD[(Int, Int)] = MapPartitionsRDD[67] at map at <console>:35


In [78]:
flipped.take(10).foreach(println)

(68,4904)
(263,1084)
(21,384)
(15,6400)
(17,3702)
(89,6308)
(19,5618)
(88,5354)
(8,1894)
(11,4926)


In [79]:
val mostPopularSuperhero = flipped.max()

mostPopularSuperhero: (Int, Int) = (1933,859)


In [94]:
// mostPopularSuperhero is (connection count, hero ID), so _2 is the ID to resolve.
// lookup returns every name stored under that ID and may return none, so use
// a labelled placeholder rather than .head, which throws on an empty result.
val heroName = parsed_names.lookup(mostPopularSuperhero._2).headOption.getOrElse(
    s"UNKNOWN (id=${mostPopularSuperhero._2})")

heroName: String = CAPTAIN AMERICA
