# Running the analysis on the Mario Kart data

In [22]:
# we need the library json as the reddit data is stored in line-delimited json objects
# (one json object in each line, with each line representing a Reddit comment)
import json

def load_reddit_comment_data(data_directory, excluded_bodies=("[removed]",)):
    """Load the bodies of Reddit comments from a line-delimited JSON file.

    Every non-blank line of the file is parsed as one JSON object and its
    'body' field is collected, unless that body is listed in
    ``excluded_bodies``.

    Args:
        data_directory: the path of the file including our data (despite the
            name, it is passed to ``open`` and must therefore be a file path).
        excluded_bodies: collection of body texts that must be left out of the
            result. The default, ("[removed]",), keeps the original filter.

    Returns:
        A list of strings including the body of the kept Reddit comments, in
        the order in which they appear in the file.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a JSON object has no 'body' field.
    """
    comments_data = []  # list object that will store the loaded Reddit comments

    # we first open the file that includes our dataset
    with open(data_directory, 'r', encoding='utf-8') as f:
        # iterate the file, reading it line by line
        for line in f:
            # skip blank lines (for example an empty line at the end of the
            # file); json.loads would raise an error on them
            if not line.strip():
                continue

            # load the data pertaining to a line into a json object in memory
            data = json.loads(line)

            # append the comment unless its body is one of the excluded texts
            if data['body'] not in excluded_bodies:
                comments_data.append(data['body'])

    # the method returns all the loaded Reddit comments
    return comments_data

In [16]:
import os

# Directory that holds the exported Reddit data. Set the REDDIT_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used.
path_directory = os.environ.get(
    "REDDIT_DATA_DIR",
    r"C:\Users\ebbea\OneDrive - Delft University of Technology\DELFT\JAAR 3\Minor\NLP\Data",
)
# File names of the line-delimited JSON exports; full_paths keeps this order.
json_files = ['mariokart_comments.ndjson', 'mariokart_submissions.ndjson', 'nintendo_comments.ndjson', 'nintendo_submissions.ndjson']

# Full path of every export, in the same order as json_files.
full_paths = [os.path.join(path_directory, filename) for filename in json_files]

In [38]:
# Load the comment bodies of the first export in full_paths (the Mario Kart comments file).
mariokart_comments = load_reddit_comment_data(full_paths[0])
# The other exports are currently not loaded.
# NOTE(review): load_reddit_comment_data reads the 'body' field of every record;
# confirm the submission files contain that field before enabling these lines.
# mariokart_submissions = load_reddit_comment_data(full_paths[1])
# nintendo_comments = load_reddit_comment_data(full_paths[2])
# nintendo_submissions = load_reddit_comment_data(full_paths[3])

In [24]:
import nltk
import re
import pandas as pd
from nltk.util import ngrams
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'stopwords' and 'punkt' resources (quiet=True is passed to keep the download silent).
# NOTE(review): 'punkt' is presumably needed by word_tokenize, which is imported above — confirm.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# The English stopword list, stored as a set.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Preprocess the text of a Reddit comment.

    The text is lower-cased, the HTML entities '&gt;' and '&amp;' are removed,
    text in square brackets and URLs are dropped, every character that is not
    a-z, 0-9 or whitespace is removed, and finally all whitespace is collapsed
    to single spaces with no leading or trailing space.

    Args:
        text: a string that includes a text corresponding to a Reddit comment.

    Returns:
        A string with the preprocessed Reddit comment.
    """
    text = text.lower()  # convert text to lower-case
    text = re.sub('&gt;', '', text)  # remove some special characters from the data &gt; corresponds to >
    text = re.sub('&amp;', '', text)  # remove some special characters from the data &amp; corresponds to &
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove punctuation (keep only characters and numbers)
    # eliminate duplicate whitespaces last: the removals above can leave double
    # or trailing spaces behind, so normalising earlier would not clean them up
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [32]:
# apply the preprocessing function to every loaded comment, keeping the order
preprocessed_comments = list(map(preprocess, mariokart_comments))

In [33]:
# Inspect the preprocessed comments.
# NOTE(review): this echoes the entire list; a slice such as preprocessed_comments[:10] would keep the output small.
preprocessed_comments

['sure buddy pro tip wumao dont admit it it defeats the purpose',
 'i dont even know what that word is but still free history lesson facts are facts',
 'if you look at the mini map the leader didnt get hit by an item they intentionally slowed down to use the horn on op',
 'well there i go trying to give people more credit than they deserve lol failing at being an ass it is',
 'haha i appreciate your positivity did think there was maybe a chance that the leader accidentally drove off road but you can just about see they didnt from the vid so definitely some foul play ',
 'waaa x100',
 'ah yes chinas actual title',
 'i always play pink gold peach and ill use either the pipe frame or biddybuggy with roller wheels and 9 times out of 10 im waxing the floor with waluigi roy and dk so yeah ill stick with my favorite character ',
 'that was awesome',
 'wow is mk8s item thing different than the others or smt since you get some wacky combos like this',
 'runexpectedkingdomhearts',
 'dang that wa

In [27]:
from textblob import TextBlob

# Example function to get sentiment
def get_sentiment(text):
    """Return the polarity that TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Try the sentiment function on a sample sentence; the polarity is shown as the cell output.
get_sentiment('i used to love this track but now i love it')


0.5

In [28]:
def tracknames_regex(path: str = 'MarioKart_tracks.txt') -> list[str]:
    """Read the track-name patterns, one per line, from a text file.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped: the entries are used as regular expressions elsewhere in this
    notebook, and an empty pattern would match every comment.

    Args:
        path: file to read; defaults to 'MarioKart_tracks.txt' in the current
            working directory.

    Returns:
        The non-empty, stripped lines of the file, in file order.
    """
    with open(path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [pattern for pattern in stripped_lines if pattern]

In [34]:
import re


def get_track_mentions(comments: list[str], tracks: list[str]) -> list[str]:
    """Keep the comments that mention at least one track.

    Every entry of ``tracks`` is treated as a regular expression and searched
    for case-insensitively, the same way the patterns are compiled later in
    this notebook. Empty patterns are ignored because they would match every
    comment.

    Args:
        comments: the comment texts to filter.
        tracks: regular-expression patterns, one per track.

    Returns:
        The comments that match at least one pattern, in their original order.

    Raises:
        re.error: if an entry of ``tracks`` is not a valid regular expression.
    """
    # compile every pattern once, instead of once per comment
    patterns = [re.compile(track, re.IGNORECASE) for track in tracks if track]

    return [
        comment
        for comment in comments
        if any(pattern.search(comment) for pattern in patterns)
    ]



In [39]:
# Load the track-name patterns once and reuse them for the filter, instead of
# reading the tracks file a second time.
tracknames = tracknames_regex()
track_comments = get_track_mentions(preprocessed_comments, tracknames)

In [40]:
# one separate, initially empty list of comments for every track name
track_comments_dict: dict[str, list[str]] = dict((track, []) for track in tracknames)

In [41]:
# case-insensitive compiled regular expression for every track name
compiled_patterns = dict(
    (track, re.compile(track, flags=re.IGNORECASE)) for track in tracknames
)

In [43]:
# Iterate over each track name and its compiled pattern and collect the
# comments that contain the track pattern. Each list is rebuilt from scratch,
# so running this cell again does not append the same comments twice.
for track, pattern in compiled_patterns.items():
    track_comments_dict[track] = [
        comment for comment in track_comments if pattern.search(comment)
    ]