In [1]:
import scipy.stats
import csv

def parse_got_csv(file_path):
    episodes = {}

    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            season = int(row['Season'])
            episode = int(row['Episode'])
            title = row['Title']
            rating = float(row['Rating'])

            key = f'S{season}E{episode}'
            episodes[key] = {
                'season': season,
                'episode': episode,
                'title': title,
                'rating': rating
            }

    return episodes

# Example usage
file_path = 'got_imdb.csv'
got_episodes = parse_got_csv(file_path)
for key, value in got_episodes.items():
    print(f"{key}: {value}")

def episode_correlation(episodes, values, significance=0.05):
    """
    Calculate the Pearson correlation coefficient between IMDb scores and given values at the episode level.

    Args:
    episodes (list): List of IMDb scores for each episode.
    values (list): Corresponding values (like episode numbers) to check correlation with IMDb scores.
    significance (float): The significance level to determine statistical significance (default is 0.05).

    Returns:
    tuple: (Pearson correlation coefficient, boolean indicating significance)
    """
    if len(episodes) != len(values):
        raise ValueError("Length of episodes and values must be the same.")

    correlation, p_value = scipy.stats.pearsonr(episodes, values)
    is_significant = p_value < significance
    return correlation, is_significant

def season_correlation(seasons, values, significance=0.05):
    """
    Calculate the Pearson correlation coefficient between IMDb scores and given values at the season level.

    Args:
    seasons (list): List of average IMDb scores for each season.
    values (list): Corresponding values (like season numbers) to check correlation with IMDb scores.
    significance (float): The significance level to determine statistical significance (default is 0.05).

    Returns:
    tuple: (Pearson correlation coefficient, boolean indicating significance)
    """
    if len(seasons) != len(values):
        raise ValueError("Length of seasons and values must be the same.")

    correlation, p_value = scipy.stats.pearsonr(seasons, values)
    is_significant = p_value < significance
    return correlation, is_significant

# Example usage:
# episode_scores = [9.1, 8.8, 8.7, ...]  # List of IMDb scores for each episode
# episode_numbers = [1, 2, 3, ...]        # Corresponding episode numbers
# season_scores = [average_score_season_1, average_score_season_2, ...] # Average IMDb scores per season
# season_numbers = [1, 2, 3, ...]         # Corresponding season numbers

# Calculate correlations and test for significance
# episode_corr, episode_significant = episode_correlation(episode_scores, episode_numbers, 0.05)
# season_corr, season_significant = season_correlation(season_scores, season_numbers, 0.05)

# Get the imdb score for each episode from got_imdb.csv and store in a list 

imdb_scores = []
for key, value in got_episodes.items():
    imdb_scores.append(value['rating'])

print(imdb_scores)

episode_correlation(imdb_scores, imdb_scores, 0.05)



S1E1: {'season': 1, 'episode': 1, 'title': 'Winter Is Coming', 'rating': 9.1}
S1E2: {'season': 1, 'episode': 2, 'title': 'The Kingsroad', 'rating': 8.8}
S1E3: {'season': 1, 'episode': 3, 'title': 'Lord Snow', 'rating': 8.7}
S1E4: {'season': 1, 'episode': 4, 'title': 'Cripples, Bastards, and Broken Things', 'rating': 8.8}
S1E5: {'season': 1, 'episode': 5, 'title': 'The Wolf and the Lion', 'rating': 9.1}
S1E6: {'season': 1, 'episode': 6, 'title': 'A Golden Crown', 'rating': 9.2}
S1E7: {'season': 1, 'episode': 7, 'title': 'You Win or You Die', 'rating': 9.2}
S1E8: {'season': 1, 'episode': 8, 'title': 'The Pointy End', 'rating': 9.0}
S1E9: {'season': 1, 'episode': 9, 'title': 'Baelor', 'rating': 9.6}
S1E10: {'season': 1, 'episode': 10, 'title': 'Fire and Blood', 'rating': 9.5}
S2E1: {'season': 2, 'episode': 1, 'title': 'The North Remembers', 'rating': 8.8}
S2E2: {'season': 2, 'episode': 2, 'title': 'The Night Lands', 'rating': 8.5}
S2E3: {'season': 2, 'episode': 3, 'title': 'What Is Dead M

(0.9999999999999998, True)

In [2]:
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

# Ensure that NLTK's resources are available
nltk.download('punkt')

# Read the CSV file
got_data = pd.read_csv("Game_of_Thrones_Script.csv", delimiter=';')

# Combine the sentences for each episode
grouped_data = got_data.groupby(['Season', 'Episode'])['Sentence'].apply(' '.join).reset_index()

# Tokenize the text for each episode
grouped_data['Tokenized Sentences'] = grouped_data['Sentence'].apply(word_tokenize)

print(grouped_data.head())

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/roidalsgard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


     Season     Episode                                           Sentence  \
0  Season 1   Episode 1  What do you expect? They're savages. One lot s...   
1  Season 1  Episode 10  Look at me. Look at me! Do you remember me now...   
2  Season 1   Episode 2  You need to drink, child... And eat. Isn't the...   
3  Season 1   Episode 3  Welcome, Lord Stark. Grand Maester Pycelle has...   
4  Season 1   Episode 4  The little Lordo s been dreaming again. We hav...   

                                 Tokenized Sentences  
0  [What, do, you, expect, ?, They, 're, savages,...  
1  [Look, at, me, ., Look, at, me, !, Do, you, re...  
2  [You, need, to, drink, ,, child, ..., And, eat...  
3  [Welcome, ,, Lord, Stark, ., Grand, Maester, P...  
4  [The, little, Lordo, s, been, dreaming, again,...  


In [14]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Ensure that NLTK's resources are available
nltk.download('punkt')

# Read the Game of Thrones script data
got_data = pd.read_csv('Game_of_Thrones_Script.csv', delimiter=';')

# Combine the sentences for each episode and tokenize
grouped_data = got_data.groupby(['Season', 'Episode'])['Sentence'].apply(' '.join).reset_index()
grouped_data['Tokenized Sentences'] = grouped_data['Sentence'].apply(word_tokenize)

# Define a function to calculate the LIX number
def calculate_lix(text):
    words = word_tokenize(text)
    num_words = len(words)
    num_sentences = text.count('.') + text.count('!') + text.count('?')
    long_words = sum(len(word) >= 6 for word in words)

    # Avoid division by zero
    if num_sentences == 0:
        num_sentences = 1
    
    lix = num_words / num_sentences + (long_words / num_words) * 100
    return lix

# Calculate the LIX number for each episode
grouped_data['LIX Number'] = grouped_data['Sentence'].apply(calculate_lix)

# Extract LIX numbers
lix_numbers = grouped_data['LIX Number'].tolist()

grouped_data.head(), lix_numbers

def parse_got_csv(file_path):
    episodes = {}

    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            season = int(row['Season'])
            episode = int(row['Episode'])
            title = row['Title']
            rating = float(row['Rating'])

            key = f'S{season}E{episode}'
            episodes[key] = {
                'season': season,
                'episode': episode,
                'title': title,
                'rating': rating
            }

    return episodes

# Example usage
file_path = 'got_imdb.csv'
got_imdb_data = parse_got_csv('got_imdb.csv')

# Adjusting the script to correctly reference IMDb ratings keys
# Extracting the season and episode numbers from the grouped_data
season_episode_pairs = [(int(season.split()[-1]), int(episode.split()[-1])) for season, episode in zip(grouped_data['Season'], grouped_data['Episode'])]

# Extracting IMDb scores in the correct order
ordered_imdb_scores = [got_imdb_data[season_episode]['rating'] for season_episode in season_episode_pairs]

# Recalculating the correlation between LIX numbers and IMDb ratings
lix_imdb_correlation, lix_imdb_significant = episode_correlation(ordered_imdb_scores, lix_numbers)

lix_imdb_correlation, lix_imdb_significant

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/roidalsgard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'csv' is not defined