# <img src ='https://airsblobstorage.blob.core.windows.net/airstream/Asset 275.png' width="50px"> Basic Aggregations and Transformations

We'll take a look at basic transformation operations (selecting, filtering, renaming and sorting) and aggregation operations that you can perform using Spark.

In [0]:
# First attempt at loading the file: the CSV reader is used as-is, with no
# "header" or "inferSchema" option set on it.
df1 = (
    spark.read
    .format("csv")
    .load("dbfs:/FileStore/PLSampleDataSets/credit_train.csv")
)


In [0]:
## TODO Recording: In the cell below expand the credit_data and show that we now have sensible headers

In [0]:
# Load the same CSV again, this time telling the reader to use the first line
# as column names ("header"), to infer column types ("inferSchema"), and to
# split fields on commas ("sep").
credit_data = (
    spark.read
    .format("csv")
    .options(inferSchema="True", header="True", sep=",")
    .load("dbfs:/FileStore/PLSampleDataSets/credit_train.csv")
)

In [0]:
# Print the column names and data types of the loaded DataFrame.
credit_data.printSchema()

In [0]:
# Display the first 15 rows.
credit_data.show(15)

In [0]:
# Number of rows in the DataFrame as loaded.
credit_data.count()

In [0]:
# Project the three columns of interest into their own DataFrame.
credit_data_subset = credit_data.select(
    ['Loan Status', 'Current Loan Amount', 'Annual Income']
)

In [0]:
# Display the three-column subset (show() with its default row limit).
credit_data_subset.show()

In [0]:
# Same three-column projection, capped at ten rows before it is displayed.
(
    credit_data
    .select('Loan Status', 'Current Loan Amount', 'Annual Income')
    .limit(10)
    .show()
)

In [0]:
# Show the rows where 'Annual Income' is missing.
income_is_missing = credit_data['Annual Income'].isNull()
credit_data.where(income_is_missing).show()

In [0]:
# Drop rows containing nulls (default drop settings) and rebind credit_data
# to the cleaned DataFrame; every later cell works on this cleaned version.
credit_data = credit_data.na.drop()

# Row count after the drop.
credit_data.count()

In [0]:
# Customers with at least one recorded bankruptcy.
has_bankruptcy = credit_data['Bankruptcies'] > 0
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Home Ownership', 'Bankruptcies')
    .filter(has_bankruptcy)
    .show()
)

In [0]:
# Customers whose annual income exceeds 100000 AND who have at least one tax lien.
high_income = credit_data['Annual Income'] > 100000
has_tax_lien = credit_data['Tax Liens'] > 0
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Home Ownership', 'Tax Liens')
    .where(high_income & has_tax_lien)
    .show()
)

In [0]:
# Customers whose 'Home Ownership' is one of the two listed values.
(
    credit_data
    .select('Customer ID', 'Home Ownership')
    .where(credit_data['Home Ownership'].isin('Home Mortgage', 'Rent'))
    .show()
)

In [0]:
# Display the income and debt columns under new names:
# 'Annual Income' -> 'Income', 'Monthly Debt' -> 'Monthly Debt Payment'.
credit_data.select(
    'Customer ID',
    credit_data['Annual Income'].alias('Income'),
    credit_data['Monthly Debt'].alias('Monthly Debt Payment'),
).show()

In [0]:
# Add a derived 'Savings' column: annual income minus twelve months of debt payments.
yearly_debt = 12 * credit_data['Monthly Debt']
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Monthly Debt')
    .withColumn('Savings', credit_data['Annual Income'] - yearly_debt)
    .show()
)

In [0]:
# Sort by 'Monthly Debt', largest first.
(
    credit_data
    .select('Customer ID', 'Annual Income', 'Monthly Debt')
    .orderBy('Monthly Debt', ascending=False)
    .show()
)

In [0]:
# Sort by 'Years of Credit History', smallest first.
(
    credit_data
    .select('Customer ID', 'Years of Credit History', 'Years in current job')
    .orderBy('Years of Credit History', ascending=True)
    .show()
)

In [0]:
# Number of rows for each distinct 'Loan Status' value.
credit_data.groupBy('Loan Status').count().show()

In [0]:
# Total 'Current Loan Amount' for each loan purpose.
(
    credit_data
    .groupBy('Purpose')
    .agg({'Current Loan Amount': 'sum'})
    .show()
)

In [0]:
# Smallest 'Current Loan Amount' for each loan purpose.
(
    credit_data
    .groupBy('Purpose')
    .agg({'Current Loan Amount': 'min'})
    .show()
)

In [0]:
# Average 'Current Loan Amount' for each loan purpose.
(
    credit_data
    .groupBy('Purpose')
    .agg({'Current Loan Amount': 'avg'})
    .show()
)

In [0]:
# TODO Recording: After running the cell below to save the file

# Go to the left navigation menu - choose the "Data" option
# Select DBFS tab
# Filestore -> FileStore -> shared_uploads -> cloud.user@loonycorn.com
# Show that the CSV file is present there
# Go to the file starting with "part-00000"
# Copy the name of the file (click on the drop down and choose Copy File)
# Use the name of the copied file in the next cell

In [0]:
# Count the loans for each purpose and save the result in CSV format under
# the shared_uploads folder.
#
# mode('overwrite') makes this cell safe to re-run: without an explicit save
# mode the write fails once the target path already exists (for example on a
# second "Run All"). Note that an overwrite replaces the previously written
# files, so the "part-..." file name may differ after a re-run -- copy the
# file name again before using it in the following cell.
credit_data.groupBy('Purpose') \
           .count() \
           .write \
           .mode('overwrite') \
           .csv('dbfs:/FileStore/shared_uploads/cloud.user@loonycorn.com/count_by_loan_purpose.csv')

In [0]:
# Placeholder: replace this string with the path of the saved "part-..." file
# copied from the DBFS browser before running the cell.
saved_part_file = 'use your file path here'

# Print the beginning of the saved file.
print(dbutils.fs.head(saved_part_file))