# Import dataset

In [1]:
import pandas as pd

# Step 1: Load the dataset

df = pd.read_csv('train.csv')

# Step 2: Inspect the first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Step 3: Check the dataset info
print("\nDataset info:")
print(df.info())

First 5 rows of the dataset:
     id                                               text     review
0  7961  Honestly the best part of this place is the un...  Excellent
1  4697  Found Indulge on a whim, based on their huge "...  Excellent
2  4459  My take on Mill street is that it's your class...  Very good
3  3714  I think Matt's has had its '5 minutes of fame'...        Bad
4  4744  Nobody likes going to the auto body shop..peri...  Excellent

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7000 non-null   int64 
 1   text    7000 non-null   object
 2   review  7000 non-null   object
dtypes: int64(1), object(2)
memory usage: 164.2+ KB
None


# Preprocessing

In [6]:
# Doing some preprocessing
import re

# convert all comments to lowercase
df['text'] = df['text'].str.lower()

# remove {URLs, numbers, punctuation}
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'<.*?>', '', text)    # remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text)     # remove extra spaces
    return text.strip()

df['text'] = df['text'].apply(clean_text)

In [7]:
# tokenization
df['tokens'] = df['text'].str.split()
df['tokens'].head(5)

0    [honestly, the, best, part, of, this, place, i...
1    [found, indulge, on, a, whim, based, on, their...
2    [my, take, on, mill, street, is, that, its, yo...
3    [i, think, matts, has, had, its, minutes, of, ...
4    [nobody, likes, going, to, the, auto, body, sh...
Name: tokens, dtype: object

In [8]:
# stop words removal
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
print(df['tokens'].head())


0    [honestly, best, part, place, unbelievable, de...
1    [found, indulge, whim, based, huge, glutenfree...
2    [take, mill, street, classic, collegetown, mai...
3    [think, matts, minutes, fame, note, owners, fo...
4    [nobody, likes, going, auto, body, shopperiod,...
Name: tokens, dtype: object


In [9]:
# applying lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print(df['tokens'].head())


0    [honestly, best, part, place, unbelievable, de...
1    [found, indulge, whim, based, huge, glutenfree...
2    [take, mill, street, classic, collegetown, mai...
3    [think, matt, minute, fame, note, owner, food,...
4    [nobody, like, going, auto, body, shopperiod, ...
Name: tokens, dtype: object


In [129]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode labels as integers
le = LabelEncoder()
df['label'] = le.fit_transform(df['review'])  # 0,1,2,3,4

print("Class mapping:")
for i, class_name in enumerate(le.classes_):
    print(i, ":", class_name)

Class mapping:
0 : Bad
1 : Excellent
2 : Good
3 : Very bad
4 : Very good


In [130]:
df.head(10)

Unnamed: 0,id,text,review,tokens,label
0,7961,honestly the best part of this place is the un...,Excellent,"[honestly, best, part, place, unbelievable, de...",1
1,4697,found indulge on a whim based on their huge gl...,Excellent,"[found, indulge, whim, based, huge, glutenfree...",1
2,4459,my take on mill street is that its your classi...,Very good,"[take, mill, street, classic, collegetown, mai...",4
3,3714,i think matts has had its minutes of fame note...,Bad,"[think, matt, minute, fame, note, owner, food,...",0
4,4744,nobody likes going to the auto body shopperiod...,Excellent,"[nobody, like, going, auto, body, shopperiod, ...",1
5,872,on my way to the airport i decided to stop at ...,Good,"[way, airport, decided, stop, ted, bite, eat, ...",2
6,4100,roads are good service not so good this place ...,Bad,"[road, good, service, good, place, tourist, tr...",0
7,8995,ive known my way around sky harbor airport sin...,Bad,"[ive, known, way, around, sky, harbor, airport...",0
8,657,had lunch here today after hearing all the col...,Good,"[lunch, today, hearing, colossal, raf, however...",2
9,2942,they dont serve food they serve sex on a plate...,Excellent,"[dont, serve, food, serve, sex, plate, first, ...",1


In [131]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(df['label'], num_classes = 5)  # One-hot encoded labels


In [132]:
print(y)

[[0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]]


In [133]:
# Convert list of tokens to space-separated string
df['tokens_str'] = df['tokens'].apply(lambda x: ' '.join(x))
texts = df['tokens_str'].values
print(df['tokens_str'].head());

0    honestly best part place unbelievable deal get...
1    found indulge whim based huge glutenfree menu ...
2    take mill street classic collegetown main stri...
3    think matt minute fame note owner food good en...
4    nobody like going auto body shopperiod guy too...
Name: tokens_str, dtype: object


In [134]:
#from sklearn.feature_extraction.text import TfidfVectorizer

# # text vectorization using TF-IDF
# vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,1), stop_words='english')
# X = vectorizer.fit_transform(texts)

from gensim.models import Word2Vec
sentences = df['tokens'].tolist()
w2v_model = Word2Vec(
    sentences,
    vector_size=100,   # size of embedding vectors
    window=5,          # context window
    min_count=1,       # ignore words with frequency < 1
    workers=4,
    sg=1               # skip-gram model
)

def get_review_vector(tokens, model, vector_size=100):
    vec = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count > 0:
        vec /= count
    return vec

# Convert all reviews to vectors
X_w2v = np.array([get_review_vector(tokens, w2v_model) for tokens in df['tokens']])
print("Shape of Word2Vec feature matrix:", X_w2v.shape)



Shape of Word2Vec feature matrix: (7000, 100)


In [136]:
# Training data
X_train = np.array([get_review_vector(tokens, w2v_model) for tokens in df['tokens']])
# Test data
X_test = np.array([get_review_vector(tokens, w2v_model) for tokens in df['tokens']])


In [137]:
# ReLU activation
def relu(Z):
    return np.maximum(0, Z)

# Derivative of ReLU
def relu_derivative(Z):
    return (Z > 0).astype(float)

# Softmax activation
def softmax(Z):
    exp_Z = np.exp(Z - np.max(Z, axis=1, keepdims=True))  # stability trick
    return exp_Z / np.sum(exp_Z, axis=1, keepdims=True)

# Cross-entropy loss
def compute_loss(Y, Y_hat):
    m = Y.shape[0]
    loss = -np.sum(Y * np.log(Y_hat + 1e-9)) / m
    return loss

# Accuracy
def compute_accuracy(Y, Y_hat):
    Y_pred = np.argmax(Y_hat, axis=1)
    Y_true = np.argmax(Y, axis=1)
    return np.mean(Y_pred == Y_true)

In [138]:
class MLP:
    def __init__(self, input_size, hidden_sizes, output_size, learning_rate=0.01):
        self.learning_rate = learning_rate
        
        # Initialize weights and biases
        self.weights = []
        self.biases = []
        
        layer_sizes = [input_size] + hidden_sizes + [output_size]
        for i in range(len(layer_sizes)-1):
            W = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * 0.01
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(W)
            self.biases.append(b)
    
    # Forward pass
    def forward(self, X):
        self.Zs = []
        self.As = [X]
        
        # Hidden layers
        for i in range(len(self.weights)-1):
            Z = self.As[-1] @ self.weights[i] + self.biases[i]
            A = relu(Z)
            self.Zs.append(Z)
            self.As.append(A)
        
        # Output layer
        Z = self.As[-1] @ self.weights[-1] + self.biases[-1]
        A = softmax(Z)
        self.Zs.append(Z)
        self.As.append(A)
        return A
    
    # Backward pass
    def backward(self, Y):
        m = Y.shape[0]
        grads_W = []
        grads_b = []
        
        # Output layer gradient
        dZ = self.As[-1] - Y  # shape: (m, output_size)
        for i in reversed(range(len(self.weights))):
            A_prev = self.As[i]
            dW = (A_prev.T @ dZ) / m
            db = np.sum(dZ, axis=0, keepdims=True) / m
            grads_W.insert(0, dW)
            grads_b.insert(0, db)
            
            if i != 0:
                dA_prev = dZ @ self.weights[i].T
                dZ = dA_prev * relu_derivative(self.Zs[i-1])
        
        # Update weights and biases
        for i in range(len(self.weights)):
            self.weights[i] -= self.learning_rate * grads_W[i]
            self.biases[i] -= self.learning_rate * grads_b[i]
    
    # Training function
    def train(self, X, Y, epochs=50, batch_size=32, X_val=None, Y_val=None):
        for epoch in range(1, epochs+1):
            # Shuffle data
            idx = np.random.permutation(X.shape[0])
            X_shuffled = X[idx]
            Y_shuffled = Y[idx]
            
            # Mini-batch gradient descent
            for i in range(0, X.shape[0], batch_size):
                X_batch = X_shuffled[i:i+batch_size]
                Y_batch = Y_shuffled[i:i+batch_size]
                Y_hat = self.forward(X_batch)
                self.backward(Y_batch)
            
            # Compute loss and accuracy for the epoch
            Y_hat_train = self.forward(X)
            loss = compute_loss(Y, Y_hat_train)
            acc = compute_accuracy(Y, Y_hat_train)
            
            if X_val is not None and Y_val is not None:
                Y_hat_val = self.forward(X_val)
                val_loss = compute_loss(Y_val, Y_hat_val)
                val_acc = compute_accuracy(Y_val, Y_hat_val)
                print(f"Epoch {epoch}: Loss={loss:.4f}, Acc={acc:.4f}, Val_Loss={val_loss:.4f}, Val_Acc={val_acc:.4f}")
            else:
                print(f"Epoch {epoch}: Loss={loss:.4f}, Acc={acc:.4f}")
    
    # Predict function
    def predict(self, X):
        Y_hat = self.forward(X)
        return np.argmax(Y_hat, axis=1)


In [139]:
# from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [119]:
input_size = X_train.shape[1]  # number of TF-IDF features
hidden_sizes = [512, 256, 128, 64]       # example hidden layers
output_size = y_train.shape[1] # 5 classes

mlp = MLP(input_size, hidden_sizes, output_size, learning_rate=0.001)
mlp.train(X_train, y_train, epochs=200, batch_size=150, X_val=X_test, Y_val=y_test)

# Make predictions
y_pred = mlp.predict(X_test)

# Compute final accuracy
accuracy = np.mean(y_pred == np.argmax(y_test, axis=1))
print("Test Accuracy:", accuracy)


Epoch 1: Loss=1.6067, Acc=0.3538, Val_Loss=1.6068, Val_Acc=0.3486
Epoch 2: Loss=1.6040, Acc=0.3538, Val_Loss=1.6041, Val_Acc=0.3486
Epoch 3: Loss=1.6014, Acc=0.3538, Val_Loss=1.6015, Val_Acc=0.3486
Epoch 4: Loss=1.5988, Acc=0.3538, Val_Loss=1.5990, Val_Acc=0.3486
Epoch 5: Loss=1.5963, Acc=0.3538, Val_Loss=1.5965, Val_Acc=0.3486
Epoch 6: Loss=1.5938, Acc=0.3538, Val_Loss=1.5940, Val_Acc=0.3486
Epoch 7: Loss=1.5913, Acc=0.3538, Val_Loss=1.5916, Val_Acc=0.3486
Epoch 8: Loss=1.5888, Acc=0.3538, Val_Loss=1.5892, Val_Acc=0.3486
Epoch 9: Loss=1.5864, Acc=0.3538, Val_Loss=1.5868, Val_Acc=0.3486
Epoch 10: Loss=1.5841, Acc=0.3538, Val_Loss=1.5845, Val_Acc=0.3486
Epoch 11: Loss=1.5818, Acc=0.3538, Val_Loss=1.5822, Val_Acc=0.3486
Epoch 12: Loss=1.5795, Acc=0.3538, Val_Loss=1.5800, Val_Acc=0.3486
Epoch 13: Loss=1.5772, Acc=0.3538, Val_Loss=1.5778, Val_Acc=0.3486
Epoch 14: Loss=1.5750, Acc=0.3538, Val_Loss=1.5756, Val_Acc=0.3486
Epoch 15: Loss=1.5728, Acc=0.3538, Val_Loss=1.5734, Val_Acc=0.3486
Epoc

KeyboardInterrupt: 