In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from load_clean_data import load_clean_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Obtain the notebook's SparkSession (application name "Task4") via getOrCreate()
spark = (
    SparkSession.builder
    .appName("Task4")
    .getOrCreate()
)


In [3]:
# Load the data from the relative "./data" path using the project helper
# load_clean_data, handing it the active SparkSession.
# NOTE(review): presumably this returns an already-cleaned Spark DataFrame —
# confirm against the load_clean_data module.
df = load_clean_data("./data", spark)


In [4]:
# Per-channel summary of the "_trust" column: one output row per "_channel"
# value, carrying the average and the median of "_trust".
df_grouped = (
    df
    .groupBy("_channel")
    .agg(
        F.avg("_trust").alias("mean_trusting_points"),
        F.median("_trust").alias("median_trusting_points"),
    )
)


In [7]:
# Print every row of df_grouped (row count taken from count()) with
# column contents left untruncated.
df_grouped.show(n=df_grouped.count(), truncate=False)


+----------------+--------------------+----------------------+
|_channel        |mean_trusting_points|median_trusting_points|
+----------------+--------------------+----------------------+
|prizeplank      |1.0                 |1.0                   |
|instagc         |0.9086886074429764  |0.9259                |
|bitcoinget      |0.9577211864406794  |0.9615                |
|tremorgames     |0.8287998293970339  |0.8108                |
|coinworker      |0.8390619450317137  |0.8571                |
|keeprewarding   |0.866699999999999   |0.8667                |
|inboxpounds     |0.9375              |0.9375                |
|neodev          |0.8399505561040375  |0.8378                |
|zoombucks       |0.9283105102818009  |0.9189                |
|cotter          |0.9188999999999999  |0.9189                |
|surveymad       |0.8474142857142861  |0.8182                |
|tasks4dollars   |0.8106617647058824  |0.8125                |
|globalactioncash|0.7999999999999998  |0.8             

## Testing for Task 6 (all code below this point)

In [27]:
from pyspark.sql.types import ArrayType, StringType

# replicating task 2
def top_n_reasons(reasons, counts, n=5):
    """Return the ``n`` reasons with the highest counts, most frequent first.

    Args:
        reasons: Sequence of reason labels; ``reasons[i]`` is paired with
            ``counts[i]``. ``None`` or an empty sequence yields ``[]``.
        counts: Sequence of numeric counts, parallel to ``reasons``.
            ``None`` or an empty sequence yields ``[]``.
        n: Maximum number of reasons to return (default 5). ``None`` means
            "return all"; zero or a negative value yields ``[]``.

    Returns:
        A list of at most ``n`` reason labels ordered by descending count.
        Reasons with equal counts are ordered alphabetically, so the result
        does not depend on the order in which the inputs were supplied.
    """
    if not reasons or not counts:
        return []
    if n is not None and n <= 0:
        return []

    # Sort by count (descending) and then by reason (ascending): the secondary
    # key makes ties deterministic regardless of the incoming list order.
    ranked = sorted(zip(reasons, counts), key=lambda pair: (-pair[1], pair[0]))
    return [reason for reason, _ in ranked[:n]]

# Keep rows whose airline is present (not null and not the literal "NULL")
# and whose primary negative reason is present and not "Unknown".
df_non_unknown = df.filter(
    F.col("airline").isNotNull()
    & (F.col("airline") != "NULL")
    & F.col("negativereason1").isNotNull()
    & (F.col("negativereason1") != "Unknown")
)

# Wrap the Python helper as a Spark UDF that returns an array of strings.
top_n_udf = F.udf(top_n_reasons, ArrayType(StringType()))

# Number of occurrences of each (airline, reason) pair.
top_neg_reasons = (
    df_non_unknown
    .groupBy('airline', 'negativereason1')
    .agg(F.count('negativereason1').alias('reason_count'))
)

# Collapse to one row per airline, gathering its reasons and their counts
# into two parallel list columns.
airline_reasons = (
    top_neg_reasons
    .groupBy('airline')
    .agg(
        F.collect_list('negativereason1').alias('all_reasons'),
        F.collect_list('reason_count').alias('all_counts'),
    )
)

# Apply the UDF (default n=5) to pick each airline's top reasons.
top_5_neg_reasons = airline_reasons.withColumn(
    'top_5_reasons',
    top_n_udf('all_reasons', 'all_counts'),
)

top_5_neg_reasons.select('airline', 'top_5_reasons').show(truncate=False)

+--------------+--------------------------------------------------+
|airline       |top_5_reasons                                     |
+--------------+--------------------------------------------------+
|Delta         |[CSProblem, late, canttell, cancel, lostluggae]   |
|Virgin America|[CSProblem, canttell, booking, badflight, cancel] |
|United        |[CSProblem, late, canttell, cancel, lostluggae]   |
|US Airways    |[CSProblem, late, canttell, cancel, lostluggae]   |
|Southwest     |[CSProblem, cancel, badflight, canttell, late]    |
|American      |[CSProblem, late, cancel, canttell, airplanestaff]|
+--------------+--------------------------------------------------+

