# running the mario kart data 

In [22]:
# we need the library json as the reddit data is stored in line-delimited json objects
# (one json object in each line, with each line representing a Reddit comment)
import json

# function to load all comment data into a list of strings
# Input: the path of the file including our data
# Output: a list of strings including the body of the Reddit comments
def load_reddit_comment_data(data_directory):

    comments_data = [] # list object that will store the loaded Reddit comments

    # we first open the file that includes our dataset
    with open(data_directory, 'r', encoding='utf-8') as f:
        # iterate the file, reading it line by line
        for line in f:
            # load the data petraining to a line into a json object in memory
            data = json.loads(line)

            # append the comment if not removed
            if data['body']!="[removed]":
                comments_data.append(data['body'])

    # the method returns all the loaded Reddit comments
    return comments_data

In [16]:
import os

path_directory = r"C:\Users\ebbea\OneDrive - Delft University of Technology\DELFT\JAAR 3\Minor\NLP\Data"
json_files = ['mariokart_comments.ndjson', 'mariokart_submissions.ndjson', 'nintendo_comments.ndjson', 'nintendo_submissions.ndjson']

full_paths = [os.path.join(path_directory, filename) for filename in json_files]

In [23]:
mariokart_comments = load_reddit_comment_data(full_paths[0])
# mariokart_submissions = load_reddit_comment_data(full_paths[1])
# nintendo_comments = load_reddit_comment_data(full_paths[2])
# nintendo_submissions = load_reddit_comment_data(full_paths[3])

In [24]:
import nltk
import re
import pandas as pd
from nltk.util import ngrams
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
stop_words = set(stopwords.words('english'))

# function to preprocess the Reddit comments
# Input: a string that includes a text corresponding to a Reddit comment
# Output: a string with the preprocessed Reddit comment
def preprocess(text):
    text = text.lower()  # convert text to lower-case
    text = re.sub('&gt;', '', text) # remove some special characters from the data &gt; corresponds to >
    text = re.sub('&amp;', '', text) # remove some special characters from the data &amp; corresponds to &
    text = re.sub(r'\s+', ' ', text)  # eliminate duplicate whitespaces using regex
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove punctuation (keep only characters and numbers)
    return text

In [25]:
# run our function to preprocess all comments
preprocessed_comments = [preprocess(comment) for comment in mariokart_comments]

In [26]:
preprocessed_comments

['the worlds are a lot more straightforward and you only get 35 moons per world iirc i do like that part of galaxy and how each small level has a traditional platforming focus what i didnt like about odyssey was the lack of guidance on where to find moons not that they should be marked on a map but there was no title or explanation like in previous mario games as a result i explored until i couldnt then turned to the guides and suddenly i had every moon after 55 hours of play',
 'the only thing that gives me hope is that maybe they adopt some of arceus ideas',
 'yeah i hear you to each their own although odyssey does have a map that lists out all the moons and gives a hint of how to find it thats the way i have played it exploring and puzzling over what the hint means was part of the fun',
 'it seems like youre asking a tech support question or are looking for help in a game posts seeking either advice or questions about tech support game help and other helpsupport questions should be 

In [27]:
from textblob import TextBlob

# Example function to get sentiment
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

get_sentiment('i used to love this track but now i love it')


0.5

In [28]:
def tracknames_regex() -> list[str]:
    with open('MarioKart_tracks.txt', 'r') as file:
        tracks = [line.strip() for line in file.readlines()]
    return tracks

In [29]:
def get_track_mentions(comments: list[str], tracks: list[str]):
    filtered_comments = []

    for comment in comments:
        if any(track in comment for track in tracks):
            filtered_comments.append(comment)

    return filtered_comments



In [30]:
get_track_mentions(preprocessed_comments, tracknames_regex())

[]