In [9]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType,StructField,DoubleType, StringType, IntegerType
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

conf = SparkConf().setAppName("Project Q2").set("spark.jars", "/data/lab/mysql-connector-j-8.4.0.jar")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [10]:
##构建dataframe的schema
schema = StructType([
  StructField('customerID', StringType()),
  StructField('gender', StringType()),
  StructField('seniorCitizen', DoubleType()),
  StructField('partner', StringType()),
  StructField('dependents', StringType()),
  StructField('tenure', DoubleType()),
  StructField('phoneService', StringType()),
  StructField('multipleLines', StringType()),
  StructField('internetService', StringType()), 
  StructField('onlineSecurity', StringType()),
  StructField('onlineBackup', StringType()),
  StructField('deviceProtection', StringType()),
  StructField('techSupport', StringType()),
  StructField('streamingTV', StringType()),
  StructField('streamingMovies', StringType()),
  StructField('contract', StringType()),
  StructField('paperlessBilling', StringType()),
  StructField('paymentMethod', StringType()),
  StructField('monthlyCharges', DoubleType()),
  StructField('totalCharges', DoubleType()),
  StructField('churnString', StringType())
  ])

In [11]:
##把原始文件导入
bronze_path = "Telco-Customer-Churn.csv"
bronze_df = spark.read.format('csv').schema(schema).option('header','true').load(bronze_path)

In [12]:
#把原始文件处理成新文件，并写出
silver_df = bronze_df.withColumn('churn',when(col('churnString') == 'Yes',1).when(col('churnString') == 'No',0).otherwise('Unknown'))\
                     .drop('churnString').filter(col('contract') == 'Month-to-month')\
                     .filter(col('internetService') != 'No')

# 将 Spark DataFrame 转换为 Pandas DataFrame,以方便输出成含有表头的csv表格
silver_pd = silver_df.toPandas()

# 使用 Pandas 写入 CSV 文件
silver_pd.to_csv("silver_data.csv", index=False)

25/04/12 09:47:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn
 Schema: customerID, gender, seniorCitizen, partner, dependents, tenure, phoneService, multipleLines, internetService, onlineSecurity, onlineBackup, deviceProtection, techSupport, streamingTV, streamingMovies, contract, paperlessBilling, paymentMethod, monthlyCharges, totalCharges, churnString
Expected: churnString but found: Churn
CSV file: file:///data/lab/Project/Q2_tries/Telco-Customer-Churn.csv


In [15]:
silver_data = spark.read.csv("silver_data.csv")
silver_data.show(5)

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+----------------+--------------+------------+-----+
|       _c0|   _c1|          _c2|    _c3|       _c4|   _c5|         _c6|             _c7|            _c8|           _c9|        _c10|            _c11|       _c12|       _c13|           _c14|          _c15|            _c16|            _c17|          _c18|        _c19| _c20|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+----------------+--------------+------------+-----+
|customerID|gender|seniorCitizen|partner|dependents|tenure|phoneService|   multipleLines|internetService|onlineSecurity|onlineBackup|deviceProtection|techSupport|streamingTV|stre