In [1]:
import csv

def parse_got_csv(file_path):
    episodes = {}

    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            season = int(row['Season'])
            episode = int(row['Episode'])
            title = row['Title']
            rating = float(row['Rating'])

            key = f'S{season}E{episode}'
            episodes[key] = {
                'season': season,
                'episode': episode,
                'title': title,
                'rating': rating
            }

    return episodes

# Example usage
file_path = 'got_imdb.csv'
got_episodes = parse_got_csv(file_path)
for key, value in got_episodes.items():
    print(f"{key}: {value}")


S1E1: {'season': 1, 'episode': 1, 'title': 'Winter Is Coming', 'rating': 9.1}
S1E2: {'season': 1, 'episode': 2, 'title': 'The Kingsroad', 'rating': 8.8}
S1E3: {'season': 1, 'episode': 3, 'title': 'Lord Snow', 'rating': 8.7}
S1E4: {'season': 1, 'episode': 4, 'title': 'Cripples, Bastards, and Broken Things', 'rating': 8.8}
S1E5: {'season': 1, 'episode': 5, 'title': 'The Wolf and the Lion', 'rating': 9.1}
S1E6: {'season': 1, 'episode': 6, 'title': 'A Golden Crown', 'rating': 9.2}
S1E7: {'season': 1, 'episode': 7, 'title': 'You Win or You Die', 'rating': 9.2}
S1E8: {'season': 1, 'episode': 8, 'title': 'The Pointy End', 'rating': 9.0}
S1E9: {'season': 1, 'episode': 9, 'title': 'Baelor', 'rating': 9.6}
S1E10: {'season': 1, 'episode': 10, 'title': 'Fire and Blood', 'rating': 9.5}
S2E1: {'season': 2, 'episode': 1, 'title': 'The North Remembers', 'rating': 8.8}
S2E2: {'season': 2, 'episode': 2, 'title': 'The Night Lands', 'rating': 8.5}
S2E3: {'season': 2, 'episode': 3, 'title': 'What Is Dead M

## Statistical Correlation Overview

Statistical correlation is a measure that describes the extent to which two variables change together. In our context, we are interested in how the IMDb scores of 'Game of Thrones' episodes or seasons correlate with other variables (like episode or season number).

### Key Points:

- **Pearson Correlation Coefficient**: This is the most commonly used correlation measure. The Pearson correlation coefficient $ r $ is calculated as follows:

  $$
  r = \frac{\sum (X_i - \bar{X})(Y_i - \bar{Y})}{\sqrt{\sum (X_i - \bar{X})^2 \sum (Y_i - \bar{Y})^2}}
  $$

  Where:
  - $ X_i $ and $ Y_i $ are individual sample points.
  - $ \bar{X} $ and $ \bar{Y} $ are means of the samples $ X $ and $ Y $.
  - The summation is over all sample points.

  The coefficient ranges from -1 to 1, where 1 indicates a perfect positive correlation, -1 indicates a perfect negative correlation, and 0 indicates no correlation.

- **Positive vs Negative Correlation**:
  - *Positive Correlation*: As one variable increases, the other variable also increases.
  - *Negative Correlation*: As one variable increases, the other variable decreases.

- **Statistical Significance**: It’s important to determine whether a calculated correlation is statistically significant, which often requires further statistical testing.

### Usage in Our Analysis:

We will calculate the Pearson correlation coefficient to understand the relationship between IMDb scores and either episode numbers or season numbers. However, it's important to remember that correlation does not imply causation. A high or low correlation does not mean that one variable causes the other to change.


In [3]:
import scipy.stats

def episode_correlation(episodes, values, significance=0.05):
    """
    Calculate the Pearson correlation coefficient between IMDb scores and given values at the episode level.

    Args:
    episodes (list): List of IMDb scores for each episode.
    values (list): Corresponding values (like episode numbers) to check correlation with IMDb scores.
    significance (float): The significance level to determine statistical significance (default is 0.05).

    Returns:
    tuple: (Pearson correlation coefficient, boolean indicating significance)
    """
    if len(episodes) != len(values):
        raise ValueError("Length of episodes and values must be the same.")

    correlation, p_value = scipy.stats.pearsonr(episodes, values)
    is_significant = p_value < significance
    return correlation, is_significant

def season_correlation(seasons, values, significance=0.05):
    """
    Calculate the Pearson correlation coefficient between IMDb scores and given values at the season level.

    Args:
    seasons (list): List of average IMDb scores for each season.
    values (list): Corresponding values (like season numbers) to check correlation with IMDb scores.
    significance (float): The significance level to determine statistical significance (default is 0.05).

    Returns:
    tuple: (Pearson correlation coefficient, boolean indicating significance)
    """
    if len(seasons) != len(values):
        raise ValueError("Length of seasons and values must be the same.")

    correlation, p_value = scipy.stats.pearsonr(seasons, values)
    is_significant = p_value < significance
    return correlation, is_significant

# Example usage:
# episode_scores = [9.1, 8.8, 8.7, ...]  # List of IMDb scores for each episode
# episode_numbers = [1, 2, 3, ...]        # Corresponding episode numbers
# season_scores = [average_score_season_1, average_score_season_2, ...] # Average IMDb scores per season
# season_numbers = [1, 2, 3, ...]         # Corresponding season numbers

# Calculate correlations and test for significance
# episode_corr, episode_significant = episode_correlation(episode_scores, episode_numbers, 0.05)
# season_corr, season_significant = season_correlation(season_scores, season_numbers, 0.05)

# Get the imdb score for each episode from got_imdb.csv and store in a list 

imdb_scores = []
for key, value in got_episodes.items():
    imdb_scores.append(value['rating'])

print(imdb_scores)

episode_correlation(imdb_scores, imdb_scores, 0.05)


[9.1, 8.8, 8.7, 8.8, 9.1, 9.2, 9.2, 9.0, 9.6, 9.5, 8.8, 8.5, 8.8, 8.8, 8.8, 9.1, 8.9, 8.8, 9.7, 9.4, 8.8, 8.6, 8.9, 9.6, 9.0, 8.8, 8.7, 9.0, 9.9, 9.2, 9.1, 9.7, 8.9, 8.8, 8.8, 9.7, 9.1, 9.7, 9.6, 9.7, 8.5, 8.5, 8.5, 8.7, 8.6, 8.0, 9.0, 9.9, 9.5, 9.1, 8.5, 9.4, 8.7, 9.1, 9.7, 8.4, 8.6, 8.4, 9.9, 9.9, 8.6, 8.9, 9.2, 9.8, 8.8, 9.0, 9.4, 7.5, 7.8, 7.4, 5.4, 5.9, 4.0]


(0.9999999999999998, True)