<a href="https://colab.research.google.com/github/ChenAndy-7/Professor-Review-Rating-Predictor/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Disable Weights & Biases inside Colab (no external logging)
os.environ["WANDB_DISABLED"] = "true"

# Install required libraries (most are already present in Colab, but this is safe)
!pip install planetterp torch transformers pandas numpy matplotlib scikit-learn


In [None]:
import planetterp
import pandas as pd
import numpy as np
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    confusion_matrix,
    classification_report,
)



In [None]:
# Professors whose PlanetTerp reviews make up the dataset
professors = [
    "Hatice Sahinoglu",
    "Larry Herman",
    "Ilchul Yoon",
    "Clyde Kruskal",
    "Paul Kline",
    "Pedram Sadeghian",
]

# One record (professor, review text, star rating) per fetched review
all_reviews = []
for prof_name in professors:
    try:
        prof = planetterp.professor(name=prof_name, reviews=True)
        reviews = prof.get("reviews") or []
        # Build the records first so a malformed review cannot leave a
        # half-appended professor in all_reviews.
        records = [
            {
                "professor": prof_name,
                "review": review.get("review", ""),
                "rating": review.get("rating", 0),
            }
            for review in reviews
        ]
    except Exception as e:
        # Best effort: keep fetching the other professors, but report which
        # lookup failed and why instead of hiding the cause.
        print(f"Error fetching reviews for {prof_name}: {e!r}")
        continue

    all_reviews.extend(records)
    print(f"Fetched {len(reviews)} reviews for {prof_name}")

# Explicit columns keep the frame usable downstream even if nothing was fetched
df = pd.DataFrame(all_reviews, columns=["professor", "review", "rating"])
print(f"\nTotal reviews collected: {len(df)}")
df.head()


In [None]:
# Data cleansing step:
# remove empty or very short reviews and rows without a valid 1-5 star rating
df = df[df["review"].notna()]
df = df[df["review"].str.len() > 50]   # more than 50 characters

# Ratings that are missing, non-numeric or outside 1..5 (e.g. the 0 placeholder
# used when a review has no rating) would turn into out-of-range class labels.
rating_values = pd.to_numeric(df["rating"], errors="coerce")
valid_rating = rating_values.isin([1, 2, 3, 4, 5])
df = df.loc[valid_rating].assign(rating=rating_values[valid_rating].astype(int))

print(f"Clean dataset: {len(df)} reviews\n")

print("Rating distribution")
print(df["rating"].value_counts().sort_index())


In [None]:
# Fixed seed so the split and the upsampling are reproducible across runs
SEED = 42


def _stratify_or_none(labels):
    """Return `labels` for stratified splitting, or None if a class has < 2 rows."""
    return labels if labels.value_counts().min() >= 2 else None


# Convert ratings (1 to 5) to labels (0 to 4) for the model
df = df.assign(label=df["rating"] - 1)

# Split BEFORE balancing: upsampling duplicates rows, so balancing first would
# put copies of the same review in train, validation and test (data leakage).
train_raw, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=_stratify_or_none(df["label"]),
    random_state=SEED,
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=_stratify_or_none(temp_df["label"]),
    random_state=SEED,
)

# Main issue with the dataset: 1s and 5s are much more likely to appear, since
# students who leave reviews are typically really happy or really upset with
# the professor. Balance the TRAINING split only by upsampling minority classes;
# validation and test keep the natural rating distribution.
groups = [g for _, g in train_raw.groupby("label")]
max_size = max(len(g) for g in groups)

balanced_groups = [
    resample(
        g,
        replace=True,
        n_samples=max_size,
        random_state=SEED + offset,
    )
    for offset, g in enumerate(groups)
]

# Combine the upsampled classes and shuffle so training batches mix all labels
df_bal = pd.concat(balanced_groups).sample(
    frac=1,
    random_state=SEED,
).reset_index(drop=True)

print(df_bal["label"].value_counts().sort_index())

train_df = df_bal
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)



In [None]:
# Checkpoint identifier of the pre-trained model; the same name is used to load
# both the tokenizer (below) and the classification model (later cell)
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Load the tokenizer matching the checkpoint, and the padding collator that is
# handed to the trainer for batching
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Dataset wrapper so every split can be tokenized and fed to the trainer the same way
class ReviewDataset(torch.utils.data.Dataset):
    """Torch dataset yielding tokenized reviews together with their class label.

    Args:
        dataframe: frame with a "review" (text) and a "label" (integer class) column.
        tokenizer: callable tokenizer; invoked with `max_length`, `truncation`,
            `padding` and `return_tensors` keyword arguments.
        max_length: length every review is truncated / padded to.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        # Re-index to 0..n-1 so positional idx values are valid .loc keys
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `input_ids`, `attention_mask` and `labels` for the review at `idx`."""
        review = str(self.data.loc[idx, "review"])
        label = int(self.data.loc[idx, "label"])

        # Tokenize the single review; the tokenizer returns a batch of size 1
        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }


# Build one ReviewDataset per split, all sharing the same tokenizer
train_dataset, val_dataset, test_dataset = (
    ReviewDataset(split_frame, tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

print("Datasets created successfully.")
first_example = train_dataset[0]
print(f"Sample encoding shape: {first_example['input_ids'].shape}")


In [None]:
# Sequence-classification model with a 5-way head: class ids 0-4 map to 1-5 stars
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)



In [None]:
import torch.nn as nn
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Returns a dict with:
        "accuracy": fraction of examples whose arg-max class equals the label.
        "mae": mean absolute error measured in stars (class id + 1).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    # Shift class ids (0-4) back to star ratings (1-5) before measuring the error
    star_truth = labels + 1
    star_guess = predicted + 1

    return {
        "accuracy": accuracy_score(labels, predicted),
        "mae": mean_absolute_error(star_truth, star_guess),
    }



# Trainer whose cross-entropy loss supports optional class weights and honors
# the label smoothing configured in TrainingArguments
class WeightedTrainer(Trainer):
    """Trainer with a custom cross-entropy loss.

    Args:
        class_weights: optional sequence/array/tensor with one weight per class.
            When omitted (the default) every class is weighted equally.
        *args, **kwargs: forwarded unchanged to `Trainer`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = (
            None
            if class_weights is None
            else torch.as_tensor(class_weights, dtype=torch.float)
        )

    def compute_loss(self, model, inputs, return_outputs=False, **e_args):
        """Return the (optionally weighted, label-smoothed) cross-entropy loss.

        Returns `(loss, outputs)` when `return_outputs` is true, else just `loss`.
        Extra keyword arguments passed by the Trainer are accepted and ignored.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # The weight tensor must live on the same device / dtype as the logits
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        # Overriding compute_loss bypasses the Trainer's own label smoother, so
        # apply the configured smoothing factor here instead of dropping it.
        smoothing = getattr(self.args, "label_smoothing_factor", 0.0) or 0.0

        loss_fct = nn.CrossEntropyLoss(weight=weight, label_smoothing=smoothing)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



# Hyperparameters and bookkeeping options for fine-tuning
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation settings
    num_train_epochs=3,     # more than 3 epochs gave diminishing returns
    learning_rate=2e-5,     # tuned over several runs
    weight_decay=0.015,     # tuned as well
    label_smoothing_factor=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; reload the most accurate checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

print("args set")


In [None]:
# Early stopping with a patience of 2 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Wire the model, data splits, collator and metrics into the custom trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Run fine-tuning and keep the returned result object
train_result = trainer.train()




In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict on the validation split so this cell works on a fresh kernel
# (val_labels / val_preds are not defined anywhere earlier in the notebook).
val_output = trainer.predict(val_dataset)
val_preds = np.argmax(val_output.predictions, axis=1)
val_labels = val_output.label_ids

# Fix the class list so the matrix is always 5x5, even if a class is absent
class_ids = list(range(5))
cm = confusion_matrix(val_labels, val_preds, labels=class_ids)

fig, ax = plt.subplots(figsize=(6, 5))
im = ax.imshow(cm)  # default colormap
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_xticks(class_ids)
ax.set_yticks(class_ids)

# Add the count to each cell (rows = true label, columns = predicted label)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, str(cm[i, j]),
                ha='center', va='center')

fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()
