In [None]:
import re

import nltk
from nltk.corpus import stopwords
import joblib
from pathlib import Path
from transformers import AutoTokenizer,pipeline
import pandas as pd
import numpy as np
import json
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Directory (relative to the current working directory) handed to joblib as
# the location of its on-disk cache.
cache_dir = Path("./.cache")

# joblib.Memory instance backed by cache_dir; verbose=0 requests no cache logging.
# NOTE(review): nothing in the visible part of this file uses `memory` yet.
memory = joblib.Memory(location=cache_dir, verbose=0)


def load_and_flatten(file: Path) -> pd.DataFrame:
    """
    Load a JSON file and flatten its structure into a pandas DataFrame.

    The JSON document is expected to be an object whose top-level keys
    identify courses and whose values are objects of per-course fields.
    Each top-level key becomes one row; the key itself is stored in the
    ``course_id`` column and the inner fields become the remaining columns.

    Args:
        file (Path): Path to the JSON file (read as UTF-8).
    Returns:
        pd.DataFrame: Flattened DataFrame with one row per top-level key.
    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Use a context manager so the handle is closed deterministically, and
    # decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform's locale default.
    with open(file, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    # DataFrame(dict-of-dicts) puts the outer keys on the columns; transpose
    # so that every outer key (course) becomes a row.
    df = pd.DataFrame(data).T
    # Move the outer keys out of the index into a regular column.
    df.reset_index(inplace=True)
    df.rename(columns={"index": "course_id"}, inplace=True)
    return df


# ASCII punctuation stripped by text_preproccessing (the same characters as
# string.punctuation).
_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
# re.escape keeps "]", "\\", "^" and "-" literal inside the character class;
# left unescaped, the first "]" would close the class early and the rest of
# the pattern would be parsed as ordinary regex syntax.
_PUNCTUATION_RE = re.compile("[" + re.escape(_PUNCTUATION) + "]")

# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# many non-emoji code points (e.g. CJK text). It is kept unchanged here to
# preserve existing behaviour -- confirm this is acceptable for the corpus.
_EMOJI_RE = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U00002702-\U000027b0"
    "\U000024c2-\U0001f251"
    "]+",
    flags=re.UNICODE,
)

# Filled on first use so the stop-word corpus is read once, not once per call.
_stop_words_cache = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words("english"))
    return _stop_words_cache


def text_preproccessing(text: str) -> str:
    """
    Normalise a piece of text for downstream modelling.

    Steps, in order: case-fold, remove ASCII punctuation, remove emoji,
    drop English stop words, and collapse whitespace to single spaces.

    Args:
        text (str): The input text to preprocess.
    Returns:
        str: The preprocessed text (may be empty).
    Raises:
        LookupError: Presumably raised by NLTK when the ``stopwords`` corpus
            has not been downloaded -- verify against the NLTK version in use.
    """
    text = text.casefold()
    text = _PUNCTUATION_RE.sub("", text)
    text = _EMOJI_RE.sub("", text)
    # NOTE(review): punctuation is removed before stop-word filtering, so a
    # contraction such as "don't" becomes "dont" and no longer matches the
    # stop-word list -- confirm whether contractions should be filtered.
    stop_words = _english_stop_words()
    # split() with no argument discards runs of whitespace, so the joined
    # result has no leading, trailing or repeated spaces.
    return " ".join(word for word in text.split() if word not in stop_words)


# Entry point: executed when the file runs as a script (or as a notebook
# cell, where __name__ is "__main__"), skipped when the module is imported.
if __name__ == "__main__":
    # Input path is relative to the current working directory.
    json_file = Path("./course_data.json")
    # `df` is left at module level; the cells below read it.
    df = load_and_flatten(json_file)

In [31]:
# Display the column labels of the flattened course frame.
df.columns

Index(['course_id', 'course_title', 'num_ratings', 'useful', 'easy', 'liked',
       'reviews'],
      dtype='object')

In [None]:
# Build a long-format frame with one row per individual review.
# explode() turns each element of a course's "reviews" list into its own row;
# courses with no reviews yield a null entry, which is filtered out below.
reviews_df = df[["course_id", "reviews"]].explode("reviews").reset_index(drop=True)
has_review = reviews_df["reviews"].notna()
reviews_df = reviews_df.loc[has_review].copy()

# Pull the fields of interest out of each review dict into flat columns.
# Entries that are not dicts produce None instead of raising.
for target_column, review_key in (
    ("review_text", "review_text"),
    ("review_rating", "course_rating"),
):
    reviews_df[target_column] = reviews_df["reviews"].apply(
        lambda review, review_key=review_key: (
            review.get(review_key) if isinstance(review, dict) else None
        )
    )

# The raw review dicts are no longer needed once the fields are extracted.
reviews_df = reviews_df.drop(columns=["reviews"])
reviews_df.head()


Unnamed: 0,course_id,review_text,review_rating
0,CS 115,go to office hours and practice,liked course
1,CS 115,"One of my least favourite courses. Although things were nicely organized, Racket was such an annoying language to use. The one tangible benefit I felt after using Racket was feeling more comfortable with recursion.",disliked course
2,CS 115,"It starts with a very low pace but after midterm, it gets really fast and I cannot understand the content while reaching the end of term. Racket is restricted by so many things so I feel I lose the freedom of coding. The assignments are extremely hard after midterm, I spent a whole day on them but I can just solve the easiest one. The only positive impression that I have is that the design recipe weighs half so I pass this course successfully.",disliked course
3,CS 115,"Took this in 2018 with no programming experience ever. Started out pretty slow, but I didn't really pay attention and was severely behind. Went to a lot of office hours, but nonetheless caught up and did decently well. After taking more cs courses, racket really is a weird language.",liked course
4,CS 115,"I loved everything about cs 115. Great instructors, fair assignments ( can get tricky but there's a huge amount of office hours to attend ), good engagement and interesting topics! It's seriously well structured and I would recommend it to anyone even with no background in coding.",liked course
