<a href="https://colab.research.google.com/github/06Nandhini/RDD-analysis-in-Pyspark/blob/main/RDD_analysis_in_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Read CSV file
data = sc.textFile("EPL.csv")

# Split by tab (since your sample shows tab-separated)
header = data.first()  # save header
matches_rdd = data.filter(lambda row: row != header).map(lambda row: row.split("\t"))

# Check RDD
matches_rdd.take(5)


[['Sheffield United,Liverpool,1.0,1.0,D,2006-2007'],
 ['Arsenal,Aston Villa,1.0,1.0,D,2006-2007'],
 ['Everton,Watford,2.0,1.0,H,2006-2007'],
 ['Newcastle United,Wigan Athletic,2.0,1.0,H,2006-2007'],
 ['Portsmouth,Blackburn Rovers,3.0,0.0,H,2006-2007']]

In [11]:
header = data.first()  # 'home_team,away_team,...'
matches_rdd = data.filter(lambda row: row != header) \
                  .map(lambda row: row.split(",")) \
                  .filter(lambda x: len(x) == 6)  # keep only valid rows


In [12]:
print("Total rows:", matches_rdd.count())
print("Sample rows:", matches_rdd.take(5))


Total rows: 4560
Sample rows: [['Sheffield United', 'Liverpool', '1.0', '1.0', 'D', '2006-2007'], ['Arsenal', 'Aston Villa', '1.0', '1.0', 'D', '2006-2007'], ['Everton', 'Watford', '2.0', '1.0', 'H', '2006-2007'], ['Newcastle United', 'Wigan Athletic', '2.0', '1.0', 'H', '2006-2007'], ['Portsmouth', 'Blackburn Rovers', '3.0', '0.0', 'H', '2006-2007']]


In [13]:
matches_rdd = matches_rdd.map(lambda x: [
    x[0],                 # home_team
    x[1],                 # away_team
    float(x[2]),          # home_goals
    float(x[3]),          # away_goals
    x[4],                 # result
    x[5]                  # season
])


In [14]:
season_goals = matches_rdd.map(lambda x: (x[5], x[2] + x[3])) \
                          .reduceByKey(lambda a, b: a + b)

highest_season = season_goals.max(key=lambda x: x[1])
lowest_season = season_goals.min(key=lambda x: x[1])

print("Season with highest goals:", highest_season)
print("Season with lowest goals:", lowest_season)


Season with highest goals: ('2011-2012', 1066.0)
Season with lowest goals: ('2006-2007', 931.0)


In [15]:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# 1️⃣ Read data into RDD
data = sc.textFile("EPL.csv")   # Correct file name
header = data.first()

matches_rdd = data.filter(lambda row: row != header) \
                  .map(lambda row: row.split(",")) \
                  .filter(lambda x: len(x) == 6) \
                  .map(lambda x: [
                      x[0],          # home_team
                      x[1],          # away_team
                      float(x[2]),   # home_goals
                      float(x[3]),   # away_goals
                      x[4],          # result
                      x[5]           # season
                  ])

print("Sample rows:", matches_rdd.take(5))

# 2️⃣ Season with highest and lowest total goals
season_goals = matches_rdd.map(lambda x: (x[5], x[2] + x[3])) \
                          .reduceByKey(lambda a, b: a + b)

highest_season = season_goals.max(key=lambda x: x[1])
lowest_season = season_goals.min(key=lambda x: x[1])

print("Season with highest goals:", highest_season)
print("Season with lowest goals:", lowest_season)

# 3️⃣ Team with highest average goals per season
home_goals = matches_rdd.map(lambda x: (x[0], (x[2], 1)))
away_goals = matches_rdd.map(lambda x: (x[1], (x[3], 1)))

total_goals = home_goals.union(away_goals) \
                        .reduceByKey(lambda a, b: (a[0]+b[0], a[1]+b[1]))

team_seasons = matches_rdd.flatMap(lambda x: [(x[0], x[5]), (x[1], x[5])]) \
                          .distinct() \
                          .map(lambda x: (x[0], 1)) \
                          .reduceByKey(lambda a, b: a + b)

team_avg_goals = total_goals.join(team_seasons) \
                            .map(lambda x: (x[0], x[1][0][0] / x[1][1]))

highest_avg_team = team_avg_goals.max(key=lambda x: x[1])
print("Team with highest average goals per season:", highest_avg_team)

# 4️⃣ Probabilities for Manchester United
team = "Manchester United"

home_results = matches_rdd.filter(lambda x: x[0] == team) \
                          .map(lambda x: x[4])

away_results = matches_rdd.filter(lambda x: x[1] == team) \
                          .map(lambda x: 'W' if x[4]=='L' else ('L' if x[4]=='W' else 'D'))

all_results = home_results.union(away_results)
total_matches = all_results.count()

result_counts = all_results.map(lambda x: (x, 1)) \
                           .reduceByKey(lambda a, b: a+b) \
                           .collectAsMap()

p_win = result_counts.get('W', 0) / total_matches
p_lose = result_counts.get('L', 0) / total_matches
p_draw = result_counts.get('D', 0) / total_matches

print(f"P(Manchester United Wins): {p_win:.2f}")
print(f"P(Manchester United Loses): {p_lose:.2f}")
print(f"P(Manchester United Draws): {p_draw:.2f}")


Sample rows: [['Sheffield United', 'Liverpool', 1.0, 1.0, 'D', '2006-2007'], ['Arsenal', 'Aston Villa', 1.0, 1.0, 'D', '2006-2007'], ['Everton', 'Watford', 2.0, 1.0, 'H', '2006-2007'], ['Newcastle United', 'Wigan Athletic', 2.0, 1.0, 'H', '2006-2007'], ['Portsmouth', 'Blackburn Rovers', 3.0, 0.0, 'H', '2006-2007']]
Season with highest goals: ('2011-2012', 1066.0)
Season with lowest goals: ('2006-2007', 931.0)
Team with highest average goals per season: ('Manchester United', 72.25)
P(Manchester United Wins): 0.00
P(Manchester United Loses): 0.00
P(Manchester United Draws): 0.57
