Description
Hello. My Elasticsearch server is behind NGINX. I am using elasticsearch-hadoop 2.1.1 and trying to use Python in an IPython / Jupyter notebook.
When I use a URL of the form https://USER:PASS@ESHOST/index/type, everything works. I am also able to use Postman or Sense to get the data, so I know my auth is correct.
I am also able to use elasticsearch-py and see my data, but I want to use elasticsearch-hadoop since I eventually want Spark talking to ES.
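For reference, this is roughly the elasticsearch-py call that works for me (host, credentials, and index/type are placeholders matching the config below):

from elasticsearch import Elasticsearch

# Same host/credentials as in es_read_conf below; verify_certs=False
# because the certificate behind NGINX is self-signed.
es = Elasticsearch(
    ["https://ESHOST:443"],
    http_auth=("USER", "PASS"),
    use_ssl=True,
    verify_certs=False)
print(es.search(index="index", doc_type="type", size=1))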
I have tried setting different es.* configuration options, but I seem to get a connection error when I try to execute:
es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=es_read_conf)
I have tried multiple combinations of the following (proxy settings in or out, SSL on or off), but all attempts seem to end in errors or timeouts:
es_read_conf = {
    "es.nodes": "ESHOST",
    "es.port": "443",
    "es.net.http.auth.user": "USER",
    "es.net.http.auth.pass": "PASS",
    "es.net.ssl": "true",
    "es.net.ssl.cert.allow.self.signed": "true",
    "es.net.proxy.http.host": "ES_HOST",
    "es.net.proxy.http.port": "443",
    "es.resource": "index/type"
}
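Plain HTTPS access with the same host and credentials works from the same machine; here is the equivalent of my Postman check, sketched with the requests library (placeholders as above):

import requests

# verify=False because the NGINX certificate is self-signed;
# a 200 here confirms host, port, and auth are all correct.
r = requests.get("https://ESHOST:443/", auth=("USER", "PASS"), verify=False)
print(r.status_code, r.json())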
This is the error I see in my Jupyter notebook output cell when I run it (and, again, the same endpoint works fine from the URL and Postman/Sense):
org.elasticsearch.hadoop.rest.EsHadoopNoNodesLeftException: Connection error (check network and/or proxy settings)- all nodes failed; tried [[<ESHOST>:443]]
I think I don't have the right combination of es.* settings for my environment.
Is there a way to use the URL format in lieu of es.net.http.auth.user / es.net.http.auth.pass?
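That is, could I point es.nodes at something like "https://USER:PASS@ESHOST:443" and drop the separate auth keys entirely? (Purely illustrative; I don't know whether elasticsearch-hadoop parses credentials out of the node URL.)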
Any help would be greatly appreciated. Thanks!
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-62-183322eced5d> in <module>()
3 keyClass="org.apache.hadoop.io.NullWritable",
4 valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
----> 5 conf=es_read_conf)
/home/osboxes/spark-1.5.1/python/pyspark/context.pyc in newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter, valueConverter, conf, batchSize)
619 jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
620 valueClass, keyConverter, valueConverter,
--> 621 jconf, batchSize)
622 return RDD(jrdd, self)
623
/home/osboxes/spark-1.5.1/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/home/osboxes/spark-1.5.1/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/home/osboxes/spark-1.5.1/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD.
: org.elasticsearch.hadoop.rest.EsHadoopNoNodesLeftException: Connection error (check network and/or proxy settings)- all nodes failed; tried [[ESHOST:443]]
at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:142)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:317)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:301)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:305)
at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:119)
at org.elasticsearch.hadoop.rest.RestClient.discoverNodes(RestClient.java:101)
at org.elasticsearch.hadoop.rest.InitializationUtils.discoverNodesIfNeeded(InitializationUtils.java:58)
at org.elasticsearch.hadoop.rest.RestService.findPartitions(RestService.java:229)
at org.elasticsearch.hadoop.mr.EsInputFormat.getSplits(EsInputFormat.java:457)
at org.elasticsearch.hadoop.mr.EsInputFormat.getSplits(EsInputFormat.java:438)
at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:115)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1277)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.take(RDD.scala:1272)
at org.apache.spark.api.python.SerDeUtil$.pairRDDToPython(SerDeUtil.scala:202)
at org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:530)
at org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)
at sun.reflect.GeneratedMethodAccessor25.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)