In [7]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *


sqlContext = SQLContext(sc)
schema = StructType([StructField("state", StringType(), True),
                     StructField("account_length", DoubleType(), True),
                     StructField("area_code", StringType(), True),
                     StructField("phone_number", StringType(), True),
                     StructField("international_plan", StringType(), True),
                     StructField("voice_mail_plan", StringType(), True),
                     StructField("number_vmail_messages", DoubleType(), True),
                     StructField("total_day_minutes", DoubleType(), True),
                     StructField("total_day_calls", DoubleType(), True),
                     StructField("total_day_charge", DoubleType(), True),
                     StructField("total_eve_minutes", DoubleType(), True),
                     StructField("total_eve_calls", DoubleType(), True),
                     StructField("total_eve_charge", DoubleType(), True),
                     StructField("total_night_minutes", DoubleType(), True),
                     StructField("total_night_calls", DoubleType(), True),
                     StructField("total_night_charge", DoubleType(), True),
                     StructField("total_intl_minutes", DoubleType(), True),
                     StructField("total_intl_calls", DoubleType(), True),
                     StructField("total_intl_charge", DoubleType(), True),
                     StructField("number_customer_service_calls", DoubleType(), True),
                     StructField("churned", StringType(), True)])

df = sqlContext.read.format('com.databricks.spark.csv').load('churn.all', schema = schema)
df.take(1)

[Row(state='KS', account_length=128.0, area_code=' 415', phone_number=' 382-4657', international_plan=' no', voice_mail_plan=' yes', number_vmail_messages=25.0, total_day_minutes=265.1, total_day_calls=110.0, total_day_charge=45.07, total_eve_minutes=197.4, total_eve_calls=99.0, total_eve_charge=16.78, total_night_minutes=244.7, total_night_calls=91.0, total_night_charge=11.01, total_intl_minutes=10.0, total_intl_calls=3.0, total_intl_charge=2.7, number_customer_service_calls=1.0, churned=' False.')]

In [9]:
# Assemble feature vectors

from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols = [
        'number_customer_service_calls', \
        'total_night_minutes', \
        'total_day_minutes', \
        'total_eve_minutes', \
        'account_length'],
    outputCol = 'features')

# Transform labels
from pyspark.ml.feature import StringIndexer

label_indexer = StringIndexer(inputCol = 'churned', outputCol = 'label')

In [17]:
# Fit the model

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(labelCol = 'label', featuresCol = 'features')

pipeline = Pipeline(stages=[assembler, label_indexer, classifier])

(train, test) = df.randomSplit([0.8, 0.2])
model = pipeline.fit(train)

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(train)
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.8644895984518621