In [15]:
#Word-transition (Markov chain) counts for the Avatar: The Last Airbender transcript dataset
import pandas as pd
from collections import defaultdict
import re  # For regular expressions

# Load the dataset from the TidyTuesday repository (needs network access)
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-08-11/avatar.csv"
avatar = pd.read_csv(url)

# Create the list of words by combining character names and full text.
# Each row is rendered as "\nCHARACTER.NAME: <full text> "; spaces inside a
# character name become '.' so the name stays a single token after splitting.
speaker_tags = avatar["character"].str.upper().str.replace(' ', '.', regex=False)
script_lines = "\n" + speaker_tags + ": " + avatar["full_text"] + " "

# Series.str.cat() joins all rows in one pass and skips missing rows, whereas
# Series.sum() concatenates pairwise (quadratic in the total text length).
# Splitting on ' ' only: the leading "\n" stays attached to each speaker tag.
words = script_lines.str.cat().split(' ')

# Define a function to remove punctuation from words (to normalize similar words)
def clean_word(word):
    """Strip punctuation from a token so variants of the same word compare equal.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted; everything else, including any
    embedded newline, is left in place.

    Args:
        word: The raw token (a string).

    Returns:
        The token with all punctuation characters removed. May be the empty
        string if the token consisted only of punctuation.
    """
    punctuation = re.compile(r'[^\w\s]')
    stripped = punctuation.sub('', word)
    return stripped

# Initialize the word usage and transition dictionaries
word_used = defaultdict(int)  # cleaned word -> how often it occurs as the first word of a pair
next_word = defaultdict(lambda: defaultdict(int))  # cleaned word -> {cleaned following word -> count}

# Normalize every token exactly once, instead of cleaning each token twice
# (once as the "current" word and again as the "next" word).
cleaned_words = [clean_word(token) for token in words]

# Build the word occurrence and transition model from consecutive token pairs.
# The final token has no successor, so it is not counted in `word_used`.
for current, following in zip(cleaned_words, cleaned_words[1:]):
    word_used[current] += 1
    next_word[current][following] += 1  # Count the transition current -> following

# The model is case-sensitive, so the key looked up and the label printed must
# be the same spelling; both come from this single variable.
target_word = 'Avatar'

# Get all words that follow the target word and sort them by frequency.
# .get() is used because indexing a defaultdict with a missing key would
# silently insert an empty entry into the model.
avatar_next_words = next_word.get(target_word, {})
sorted_avatar_next_words = sorted(avatar_next_words.items(), key=lambda pair: pair[1], reverse=True)

# Print the sorted word counts (most frequent first; ties keep first-seen order)
print(f"Words following '{target_word}' sorted by frequency:")
for word, count in sorted_avatar_next_words:
    print(f"'{word}': {count},")


Words following 'AVATAR' sorted by frequency:
'State': 67,
'is': 41,
'Roku': 33,
'and': 29,
'
SCENEDESCRIPTION': 19,
'Aang': 16,
'has': 14,
'
AANG': 11,
'Kyoshi': 11,
'
SOKKA': 11,
'I': 11,
'to': 9,
'will': 7,
'
ZUKO': 7,
'before': 6,
'': 6,
'who': 6,
'Day': 6,
'players': 6,
'was': 5,
'would': 5,
'You': 5,
'Rokus': 5,
'looks': 5,
'
KATARA': 4,
'for': 4,
'but': 4,
'stuff': 4,
'He': 4,
'Kuruk': 4,
'Yangchen': 4,
'Cut': 4,
'
AZULA': 4,
'looking': 4,
'World': 3,
'thing': 3,
'But': 3,
'Spirit': 3,
'
IROH': 3,
'standing': 3,
'can': 3,
'The': 3,
'with': 3,
'walking': 3,
'look': 3,
'from': 3,
'we': 3,
'in': 2,
'himself': 2,
'stands': 2,
'If': 2,
'My': 2,
'on': 2,
'huh': 2,
'Turns': 2,
'Sokka': 2,
'its': 2,
'must': 2,
'it': 2,
'Thats': 2,
'Its': 2,
'youre': 2,
'Who': 2,
'powers': 2,
'We': 2,
'walks': 2,
'Now': 2,
'sleeping': 2,
'so': 2,
'reaches': 2,
'Points': 2,
'cheers': 2,
'journey': 2,
'doesnt': 2,
'are': 2,
'line': 2,
'kept': 1,
'mastered': 1,
'Helmsman': 1,
'Zuko': 1,
'master': 1,
'an': 1

In [None]:
#The conversation between the ChatBot and me for Question 3
#https://chatgpt.com/c/66f0ce11-5a14-8001-bf81-4ddb07ae2e9d



#Here's a summary of our conversation:
#1. **Initial Code and Issue**: 
#   - You shared a code that creates a Markov model-like structure based on text from a dataset. The issue you 
#encountered was a `TypeError` when incrementing word transitions (`next_word[word][words[i+1]] += 1`).

#2. **Fixing the `TypeError`**:
#   - The error was due to `next_word[word]` being treated as an `int` instead of a nested dictionary. I helped 
#fix this by using a `defaultdict(lambda: defaultdict(int))`, ensuring that word transitions are counted properly.

#3. **Grouping Words with Punctuation**:
#   - You wanted to group words like `'Roku?'`, `'Roku!'`, and `'Roku;'` under `'Roku'`. I helped by introducing a
#`clean_word()` function using regular expressions to remove punctuation, ensuring variations of the same word are 
#grouped.

#4. **Sorting Word Counts**:
#   - You asked how to sort word counts, such as `'will': 7, 'Helmsman': 1, 'is': 41`, to display them from most to
#least frequent. I provided code to sort and print the word counts in descending order.

#5. **Displaying Words Following a Specific Word**:
#   - You asked how to display and sort all words following the word `'Avatar'`. I helped by extracting the words 
#that follow `'Avatar'` in the Markov model, sorting them by frequency, and printing the results.

#Each step involved fixing issues or enhancing the Markov model to handle the dataset and output meaningful results
#based on word transitions.

In [None]:
#Question6:
#ChatBots are really helpful, since they can quickly understand my questions and help me solve them. Also, they can
#read through and explain a programme for me, which helps me out and saves me time when learning these programmes.
#For my learning later on, I think I will still use a ChatBot to help me understand code, since it can
#save me time. A ChatBot might sometimes give a wrong answer; when that happens, I will ask it again in a different way.



#Question7:
#From the conversations over these few days, I found out that a ChatBot can do a lot more than I thought before. It can
#write code if I give it directions, and it can also explain the errors in my code and give me some
#solutions to fix them. I predict that later on we may be able to pass data to a ChatBot, tell it what result we need,
#and it will write the code by itself, try different statistical models, and give us the best result. It might even
#show the results it produces and analyze why the other models are not good enough.