Skip to content

Commit 0a0830b

Browse files
Initial Commit
1 parent ea1f788 commit 0a0830b

File tree

6 files changed

+209
-0
lines changed

6 files changed

+209
-0
lines changed

01-HelloSpark-Maven/data/sample.csv

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"Timestamp","Age","Gender","Country","state","self_employed","family_history","treatment","work_interfere","no_employees","remote_work","tech_company","benefits","care_options","wellness_program","seek_help","anonymity","leave","mental_health_consequence","phys_health_consequence","coworkers","supervisor","mental_health_interview","phys_health_interview","mental_vs_physical","obs_consequence","comments"
2+
2014-08-27 11:29:31,37,"Female","United States","IL",NA,"No","Yes","Often","6-25","No","Yes","Yes","Not sure","No","Yes","Yes","Somewhat easy","No","No","Some of them","Yes","No","Maybe","Yes","No",NA
3+
2014-08-27 11:29:37,44,"M","United States","IN",NA,"No","No","Rarely","More than 1000","No","No","Don't know","No","Don't know","Don't know","Don't know","Don't know","Maybe","No","No","No","No","No","Don't know","No",NA
4+
2014-08-27 11:29:44,32,"Male","Canada",NA,NA,"No","No","Rarely","6-25","No","Yes","No","No","No","No","Don't know","Somewhat difficult","No","No","Yes","Yes","Yes","Yes","No","No",NA
5+
2014-08-27 11:29:46,31,"Male","United Kingdom",NA,NA,"Yes","Yes","Often","26-100","No","Yes","No","Yes","No","No","No","Somewhat difficult","Yes","Yes","Some of them","No","Maybe","Maybe","No","Yes",NA
6+
2014-08-27 11:30:22,31,"Male","United States","TX",NA,"No","No","Never","100-500","Yes","Yes","Yes","No","Don't know","Don't know","Don't know","Don't know","No","No","Some of them","Yes","Yes","Yes","Don't know","No",NA
7+
2014-08-27 11:31:22,33,"Male","United States","TN",NA,"Yes","No","Sometimes","6-25","No","Yes","Yes","Not sure","No","Don't know","Don't know","Don't know","No","No","Yes","Yes","No","Maybe","Don't know","No",NA
8+
2014-08-27 11:31:50,35,"Female","United States","MI",NA,"Yes","Yes","Sometimes","1-5","Yes","Yes","No","No","No","No","No","Somewhat difficult","Maybe","Maybe","Some of them","No","No","No","Don't know","No",NA
9+
2014-08-27 11:32:05,39,"M","Canada",NA,NA,"No","No","Never","1-5","Yes","Yes","No","Yes","No","No","Yes","Don't know","No","No","No","No","No","No","No","No",NA
10+
2014-08-27 11:32:39,42,"Female","United States","IL",NA,"Yes","Yes","Sometimes","100-500","No","Yes","Yes","Yes","No","No","No","Very difficult","Maybe","No","Yes","Yes","No","Maybe","No","No",NA

01-HelloSpark-Maven/log4j.properties

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# Set everything to be logged to the console
log4j.rootCategory=WARN, console

# define console appender
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# application log: route this package's logs to both console and rolling file
log4j.logger.guru.learningjournal.spark.examples=INFO, console, file
log4j.additivity.guru.learningjournal.spark.examples=false

# define rolling file appender
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=${spark.yarn.app.container.log.dir}/${logfile.name}.log
# define the following as Java system properties when launching:
# -Dlog4j.configuration=file:log4j.properties
# -Dlogfile.name=hello-spark
# -Dspark.yarn.app.container.log.dir=app-logs
log4j.appender.file.ImmediateFlush=true
log4j.appender.file.Append=false
log4j.appender.file.MaxFileSize=500MB
log4j.appender.file.MaxBackupIndex=2
log4j.appender.file.layout=org.apache.log4j.PatternLayout
# Fix: log4j property names are case-sensitive; "conversionPattern" (lowercase c)
# was silently ignored, leaving the file appender with the default layout pattern.
log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Recommendations from Spark template
log4j.logger.org.apache.spark.repl.Main=WARN
log4j.logger.org.spark_project.jetty=WARN
log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.parquet=ERROR
log4j.logger.parquet=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR

01-HelloSpark-Maven/pom.xml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>guru.learningjournal.spark.examples</groupId>
    <artifactId>HelloSpark</artifactId>
    <version>1.0</version>

    <properties>
        <scala.version>2.12.10</scala.version>
        <scala.binary.version>2.12</scala.binary.version>
        <spark.version>3.0.0</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.scalatest</groupId>
            <artifactId>scalatest_${scala.binary.version}</artifactId>
            <version>3.0.8</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <!-- Fix: Maven's default compiler ignores .scala files; without
                 scala-maven-plugin, "mvn package" builds an empty jar and the
                 test sources never compile. -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.4.0</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

01-HelloSpark-Maven/spark.conf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
spark.app.name = Hello Spark
2+
spark.master = local[3]
3+
spark.sql.shuffle.partitions = 2
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
package guru.learningjournal.spark.examples
2+
3+
import java.util.Properties
4+
5+
import scala.language.implicitConversions
6+
import org.apache.log4j.Logger
7+
import org.apache.spark.SparkConf
8+
import org.apache.spark.sql.{DataFrame, SparkSession}
9+
10+
import scala.io.Source
11+
12+
object HelloSpark extends Serializable {
  // Transient + lazy so the logger is re-created on each executor rather than
  // being serialized with closures that capture this object.
  @transient lazy val logger: Logger = Logger.getLogger(getClass.getName)

  /**
   * Entry point. Expects the survey CSV file path as args(0); loads it,
   * repartitions to 2, counts respondents with Age < 40 per country, and
   * logs the results before stopping the session.
   */
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      logger.error("Usage: HelloSpark filename")
      System.exit(1)
    }

    logger.info("Starting Hello Spark")
    val spark = SparkSession.builder()
      .config(getSparkAppConf)
      .getOrCreate()
    //logger.info("spark.conf=" + spark.conf.getAll.toString())

    val surveyRawDF = loadSurveyDF(spark, args(0))
    val partitionedSurveyDF = surveyRawDF.repartition(2)
    val countDF = countByCountry(partitionedSurveyDF)
    // NOTE(review): this foreach executes on the executors, so these log lines
    // land in executor logs, not necessarily the driver console.
    countDF.foreach(row => {
      logger.info("Country: " + row.getString(0) + " Count: " + row.getLong(1))
    })

    logger.info(countDF.collect().mkString("->"))

    logger.info("Finished Hello Spark")
    //scala.io.StdIn.readLine()
    spark.stop()
  }

  /**
   * Counts survey respondents per country, considering only rows with Age < 40.
   *
   * @param surveyDF raw survey DataFrame (must contain Age, Gender, Country, state columns)
   * @return DataFrame with columns (Country, count)
   */
  def countByCountry(surveyDF: DataFrame): DataFrame = {
    surveyDF.where("Age < 40")
      .select("Age", "Gender", "Country", "state")
      .groupBy("Country")
      .count()
  }

  /**
   * Reads a CSV file with a header row, letting Spark infer column types.
   *
   * @param spark    active SparkSession
   * @param dataFile path to the CSV file
   * @return the loaded DataFrame
   */
  def loadSurveyDF(spark: SparkSession, dataFile: String): DataFrame = {
    spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(dataFile)
  }

  /**
   * Builds a SparkConf from the key/value pairs in ./spark.conf.
   *
   * Fix: the original leaked the file handle — Source.fromFile was never
   * closed. The source is now closed in a finally block (scala.util.Using is
   * unavailable on this project's Scala 2.12.10).
   */
  def getSparkAppConf: SparkConf = {
    val sparkAppConf = new SparkConf
    //Set all Spark Configs
    val props = new Properties
    val source = Source.fromFile("spark.conf")
    try {
      props.load(source.bufferedReader())
    } finally {
      source.close()
    }
    props.forEach((k, v) => sparkAppConf.set(k.toString, v.toString))
    //This is a fix for Scala 2.11
    //import scala.collection.JavaConverters._
    //props.asScala.foreach(kv => sparkAppConf.set(kv._1, kv._2))
    sparkAppConf
  }

}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package guru.learningjournal.spark.examples
2+
3+
import org.apache.spark.sql.SparkSession
4+
import org.scalatest.{BeforeAndAfterAll, FunSuite}
5+
import guru.learningjournal.spark.examples.HelloSpark.{countByCountry, loadSurveyDF}
6+
7+
import scala.collection.mutable
8+
9+
class HelloSparkTest extends FunSuite with BeforeAndAfterAll {

  // Shared local SparkSession for all tests; created once in beforeAll.
  @transient var spark: SparkSession = _

  override def beforeAll(): Unit = {
    spark = SparkSession.builder()
      .appName("HelloSparkTest")
      .master("local[3]")
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    spark.stop()
  }

  test("Data File Loading") {
    val sampleDF = loadSurveyDF(spark, "data/sample.csv")
    val rCount = sampleDF.count()
    assert(rCount == 9, " record count should be 9")
  }

  test("Count by Country") {
    val sampleDF = loadSurveyDF(spark, "data/sample.csv")
    val countDF = countByCountry(sampleDF)
    val countryMap = new mutable.HashMap[String, Long]
    countDF.collect().foreach(r => countryMap.put(r.getString(0), r.getLong(1)))

    // Fix: the failure message said "should be 6" while asserting == 4, and
    // misspelled "United". Age < 40 keeps 4 US rows (ages 37/31/33/35) in the
    // sample data, so 4 is the correct expectation; messages now agree.
    assert(countryMap("United States") == 4, ":- Count for United States should be 4")
    assert(countryMap("Canada") == 2, ":- Count for Canada should be 2")
    assert(countryMap("United Kingdom") == 1, ":- Count for United Kingdom should be 1")
  }

}

0 commit comments

Comments
 (0)