Step 0: install dependencies

In [None]:
# Remove any existing numpy/pandas before the pinned versions below are installed.
!pip uninstall -y numpy pandas
# Pin numpy and pandas to exact versions
# (presumably for binary compatibility with gensim on Colab — TODO confirm).
!pip install numpy==1.26.0 pandas==2.2.2
# Remaining third-party packages used by later cells; these are installed unpinned.
# NOTE(review): later cells also import matplotlib, torch and transformers, which are
# not installed here — presumably they ship with the Colab runtime; confirm.
!pip install nltk keras gensim scikit-learn contractions

# Task 2 of Mini Project: Text Classification using Deep Learning
1. Data Loading

In [None]:
# upload your dataset; it must contain the columns "Input" (course descriptions) and "Prediction" (course categories)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from google.colab import files
# Open Colab's upload dialog. The first key of the returned mapping is used as the
# path of the csv to read, so no filename has to be edited by hand.
uploaded = files.upload()
if not uploaded:
    raise ValueError("no file was uploaded; upload a csv file to continue")
data = pd.read_csv(next(iter(uploaded)))

# fail early, with a clear message, if the expected columns are absent
required_columns = ['Input', 'Prediction']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"dataset is missing required column(s): {missing_columns}")

print("dataset overview:")
data.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print("\n sample data:")
print(data.head())

# drop rows lacking a description or a category; they would otherwise reach the
# text preprocessing and label encoding steps as NaN
n_rows_loaded = len(data)
data = data.dropna(subset=required_columns)
n_rows_dropped = n_rows_loaded - len(data)
if n_rows_dropped:
    data = data.reset_index(drop=True)
    print(f"\ndropped {n_rows_dropped} row(s) with a missing 'Input' or 'Prediction' value")

# number of courses per category, reused by both plots below
class_distribution = data['Prediction'].value_counts()

# plot class distribution
plt.figure(figsize=(8,5))
class_distribution.plot(kind='bar', color='skyblue')
plt.title('course category distribution')
plt.xlabel('course category')
plt.ylabel('count')
plt.show()

# split data into train, validation, test sets:
# 20% is held out for test, then 10% of the remainder for validation
# NOTE(review): the split is not stratified, so a rare category can be absent from the
# training split; the label encoder fitted on train_data in section 4 would then fail
# on val/test labels it has never seen.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data  = train_test_split(train_data, test_size=0.1, random_state=42)

# plot pie chart for overall class distribution
plt.figure(figsize=(6,6))
plt.pie(class_distribution, labels=class_distribution.index, autopct='%1.1f%%', startangle=140)
plt.title("course category distribution")
plt.show()


2. Text Preprocessing

In [None]:
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing below: the stopword list,
# the punkt tokenizer data (both resource names) and WordNet for the lemmatizer.
for resource_name in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource_name)

# shared by every call to preprocess_description
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_description(text):
    """Normalise one course description into a space-joined string of lemmas.

    Steps: expand contractions, replace every non-letter character with a space,
    lowercase, tokenize, drop English stopwords and tokens shorter than three
    characters, then lemmatize what is left.

    Args:
        text: raw description. ``None`` and float NaN (how pandas represents an
            empty csv cell) are treated as an empty description; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned description as a single string; ``''`` when nothing remains.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # expand contractions
    text = contractions.fix(text)
    # Replace special characters and digits with a space (not the empty string) so that
    # words joined by punctuation, e.g. "real-world", stay separate tokens instead of
    # being fused into "realworld".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower().strip()
    # tokenize text
    words = word_tokenize(text)
    # remove stopwords and lemmatize; ignore words shorter than 3 characters
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# Add the cleaned text as a new column on each split (train, validation, test).
for split_frame in (train_data, val_data, test_data):
    split_frame['Processed_Description'] = split_frame['Input'].apply(preprocess_description)

# show raw vs. cleaned text side by side for the first training rows
print("\npreprocessed sample:")
print(train_data[['Input', 'Processed_Description']].head())

3. Text Embedding

In [None]:
import gensim
import numpy as np
import torch
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import BertTokenizer, BertModel

# cleaned text of each split, named once and reused by every featurizer below
train_text = train_data['Processed_Description']
val_text   = val_data['Processed_Description']
test_text  = test_data['Processed_Description']

# bag-of-words counts: vocabulary (capped at 5000 terms) is learned from the
# training text only and then applied unchanged to validation and test
bow_vectorizer = CountVectorizer(max_features=5000)
X_train_bow = bow_vectorizer.fit_transform(train_text).toarray()
X_val_bow, X_test_bow = (bow_vectorizer.transform(text).toarray() for text in (val_text, test_text))

# tf-idf weights: same fit-on-train / transform-the-rest pattern as above
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text).toarray()
X_val_tfidf, X_test_tfidf = (tfidf_vectorizer.transform(text).toarray() for text in (val_text, test_text))

# whitespace-split word lists per description, the input format used for
# fasttext/word2vec below
train_tokens, val_tokens, test_tokens = (
    text.map(lambda description: description.split())
    for text in (train_text, val_text, test_text)
)

# dimensionality shared by the fasttext and word2vec models trained below
EMBEDDING_DIM = 100

def _mean_embedding_matrix(keyed_vectors, token_lists):
    """Average the word vectors of each document into one row per document.

    Args:
        keyed_vectors: the ``.wv`` attribute of a trained gensim model.
        token_lists: iterable of token lists, one per document.

    Returns:
        float32 array of shape (n_documents, vector_size). A document with no
        token known to ``keyed_vectors`` gets an all-zero row.
    """
    dim = keyed_vectors.vector_size
    rows = []
    for tokens in token_lists:
        known = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
        rows.append(np.mean(known, axis=0) if known else np.zeros(dim))
    # the reshape keeps the result 2-D even when token_lists is empty
    return np.array(rows, dtype=np.float32).reshape(len(rows), dim)

# fasttext embeddings; trained on our own training corpus only
fasttext_model = gensim.models.FastText(train_tokens, vector_size=EMBEDDING_DIM, window=5, min_count=5)
X_train_fasttext = _mean_embedding_matrix(fasttext_model.wv, train_tokens)
X_val_fasttext   = _mean_embedding_matrix(fasttext_model.wv, val_tokens)
X_test_fasttext  = _mean_embedding_matrix(fasttext_model.wv, test_tokens)

# word2vec embeddings; custom trained on the same training corpus
word2vec_model = gensim.models.Word2Vec(sentences=train_tokens, vector_size=EMBEDDING_DIM, window=5, min_count=5)
X_train_word2vec = _mean_embedding_matrix(word2vec_model.wv, train_tokens)
X_val_word2vec   = _mean_embedding_matrix(word2vec_model.wv, val_tokens)
X_test_word2vec  = _mean_embedding_matrix(word2vec_model.wv, test_tokens)

# bert embeddings
# The later cells read X_train_bert / X_val_bert / X_test_bert, so the sentence
# vectors are actually computed here with the pretrained encoder (no fine-tuning).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(bert_device)
bert_model.eval()  # inference mode: disables dropout

def _bert_embeddings(texts, batch_size=32, max_length=128):
    """Encode texts into fixed-size vectors by mean-pooling BERT's last hidden layer.

    Args:
        texts: iterable of strings (non-strings are converted with ``str()``).
        batch_size: number of texts encoded per forward pass.
        max_length: texts are truncated to this many word-piece tokens.

    Returns:
        float32 array of shape (n_texts, hidden_size); padding positions are
        excluded from the mean via the attention mask.
    """
    texts = [str(text) for text in texts]
    pooled_batches = []
    with torch.no_grad():  # no gradients needed for feature extraction
        for start in range(0, len(texts), batch_size):
            encoded = bert_tokenizer(
                texts[start:start + batch_size],
                padding=True, truncation=True, max_length=max_length, return_tensors='pt',
            )
            encoded = {key: tensor.to(bert_device) for key, tensor in encoded.items()}
            hidden = bert_model(**encoded).last_hidden_state
            mask = encoded['attention_mask'].unsqueeze(-1).type_as(hidden)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(pooled.cpu().numpy())
    if not pooled_batches:
        return np.zeros((0, bert_model.config.hidden_size), dtype=np.float32)
    return np.concatenate(pooled_batches, axis=0).astype(np.float32)

# the same cleaned text that feeds the other feature sets is encoded here
X_train_bert = _bert_embeddings(train_data['Processed_Description'])
X_val_bert   = _bert_embeddings(val_data['Processed_Description'])
X_test_bert  = _bert_embeddings(test_data['Processed_Description'])


4. Model Training with Different Architectures

In [None]:
# model training with deep learning architectures
from keras.models import Sequential, Model
from keras.layers import Dense, Conv1D, MaxPooling1D, LSTM, Bidirectional, Embedding, Flatten, MultiHeadAttention, LayerNormalization, Add, Input
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau

# Encode category names as integer class ids. The mapping is learned from the
# training labels only and then reused, unchanged, for validation and test.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_data['Prediction'])
y_val, y_test = (encoder.transform(frame['Prediction']) for frame in (val_data, test_data))

# define diff models
def cnn_model(input_dim, num_classes):
    """Build and compile a 1-D convolutional classifier over an Embedding layer.

    Args:
        input_dim: used both as the Embedding vocabulary size and as the length
            of each input row.
        num_classes: width of the softmax output layer.

    Returns:
        A compiled keras ``Sequential`` model (Adam, lr 0.001, sparse categorical
        cross-entropy, accuracy metric).

    NOTE(review): the training cell feeds bag-of-words count vectors to this
    model, so the Embedding layer looks counts up as if they were token ids —
    confirm that this is intended.
    """
    model = Sequential([
        # explicit Input layer instead of Embedding's deprecated `input_length` argument
        Input(shape=(input_dim,)),
        Embedding(input_dim, 128),
        Conv1D(filters=64, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(100, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    opt = Adam(learning_rate=0.001)
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def lstm_model(input_dim, num_classes):
    """Build and compile a single-layer LSTM classifier over an Embedding layer.

    Args:
        input_dim: used both as the Embedding vocabulary size and as the length
            of each input row.
        num_classes: width of the softmax output layer.

    Returns:
        A compiled keras ``Sequential`` model ('adam' optimizer, sparse
        categorical cross-entropy, accuracy metric).
    """
    model = Sequential([
        # explicit Input layer instead of Embedding's deprecated `input_length` argument
        Input(shape=(input_dim,)),
        Embedding(input_dim, 128),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def cnn_bilstm_model(input_dim, num_classes):
    """Build and compile a Conv1D + bidirectional-LSTM classifier over an Embedding layer.

    Args:
        input_dim: used both as the Embedding vocabulary size and as the length
            of each input row.
        num_classes: width of the softmax output layer.

    Returns:
        A compiled keras ``Sequential`` model ('adam' optimizer, sparse
        categorical cross-entropy, accuracy metric).
    """
    model = Sequential([
        # explicit Input layer instead of Embedding's deprecated `input_length` argument
        Input(shape=(input_dim,)),
        Embedding(input_dim, 128),
        # convolution + pooling shorten the sequence before the recurrent layer
        Conv1D(filters=64, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def transformer_model(input_dim, num_classes):
    """Build and compile an LSTM + self-attention classifier for dense feature vectors.

    The notebook trains this model on averaged word2vec vectors, which are
    real-valued. An Embedding layer would cast those floats to integer ids and
    discard the information they carry, so the features are instead fed to the
    LSTM directly, one feature per timestep.

    Args:
        input_dim: number of features in each input row.
        num_classes: width of the softmax output layer.

    Returns:
        A compiled keras functional ``Model`` ('adam' optimizer, sparse
        categorical cross-entropy, accuracy metric).
    """
    # imported locally because the cell's import list does not include Reshape
    from keras.layers import Reshape

    inp = Input(shape=(input_dim,))
    # (batch, input_dim) -> (batch, input_dim, 1): each feature becomes one timestep
    seq = Reshape((input_dim, 1))(inp)
    x   = LSTM(100, return_sequences=True)(seq)
    # self-attention over the LSTM outputs, normalised, then added back as a residual
    att = MultiHeadAttention(num_heads=2, key_dim=64)(x, x)
    att = LayerNormalization()(att)
    x   = Add()([x, att])
    x   = Flatten()(x)
    x   = Dense(100, activation='relu')(x)
    out = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def fully_connected_model(input_dim, num_classes):
    """Build and compile a one-hidden-layer dense classifier for dense feature vectors.

    The notebook trains this model on BERT sentence vectors, which are
    real-valued. An Embedding layer would cast those floats to integer ids and
    discard the information they carry, so the dense layers read the features
    directly.

    Args:
        input_dim: number of features in each input row.
        num_classes: width of the softmax output layer.

    Returns:
        A compiled keras ``Sequential`` model ('adam' optimizer, sparse
        categorical cross-entropy, accuracy metric).
    """
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(100, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# set input dims based on embedding type
# upper bound on the bow/tfidf width (the vectorizers' max_features); kept for
# reference, but models below are sized from the actual feature matrices
input_dim_fixed = 5000
num_classes = len(encoder.classes_)

# for vectorizer/transformer generated embeddings use their actual widths
input_dim_bow       = X_train_bow.shape[1]
input_dim_tfidf     = X_train_tfidf.shape[1]
input_dim_fasttext  = X_train_fasttext.shape[1]
input_dim_word2vec  = X_train_word2vec.shape[1]
input_dim_bert      = X_train_bert.shape[1]

# lr scheduler: multiply the learning rate by 0.5 when val_loss has not improved for 2 epochs
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

# train cnn model using bow features (the only model that uses the lr scheduler)
cnn = cnn_model(input_dim_bow, num_classes)
cnn.fit(X_train_bow, y_train, validation_data=(X_val_bow, y_val), epochs=5, batch_size=64, callbacks=[lr_scheduler])

# train lstm model using bow features
# sized with input_dim_bow rather than the fixed 5000: the vectorizer keeps fewer
# than 5000 terms on a small vocabulary, and the model must match the real width
lstm = lstm_model(input_dim_bow, num_classes)
lstm.fit(X_train_bow, y_train, validation_data=(X_val_bow, y_val), epochs=5, batch_size=64)

# train cnn-bilstm model using bow features (same sizing rule as the lstm)
cnn_bilstm = cnn_bilstm_model(input_dim_bow, num_classes)
cnn_bilstm.fit(X_train_bow, y_train, validation_data=(X_val_bow, y_val), epochs=5, batch_size=64)

# train transformer model using word2vec embeddings
transformer_word2vec = transformer_model(input_dim_word2vec, num_classes)
transformer_word2vec.fit(X_train_word2vec, y_train, validation_data=(X_val_word2vec, y_val), epochs=5, batch_size=64)

# train fully connected model using bert embeddings
fully_connected_bert = fully_connected_model(input_dim_bert, num_classes)
fully_connected_bert.fit(X_train_bert, y_train, validation_data=(X_val_bert, y_val), epochs=5, batch_size=64)

print("all models trained succesfully!")

5. Comparative Analysis of Models

In [None]:
from sklearn.metrics import accuracy_score, classification_report

def _evaluate_on_test(label, trained_model, test_features):
    """Score one trained model on the test split and print its metrics.

    Args:
        label: name used as the prefix of the printed lines.
        trained_model: fitted model exposing ``predict``.
        test_features: test feature matrix matching the model's input.

    Returns:
        Tuple of (raw ``predict`` output, accuracy against ``y_test``).
    """
    probabilities = trained_model.predict(test_features)
    predicted_ids = probabilities.argmax(axis=1)  # most probable class per row
    accuracy = accuracy_score(y_test, predicted_ids)
    print(f"{label} acc:", accuracy)
    print(f"{label} clas report:\n", classification_report(y_test, predicted_ids, zero_division=0))
    return probabilities, accuracy

# each model is scored on the test features of the representation it was trained on
y_pred_cnn, cnn_acc = _evaluate_on_test("cnn", cnn, X_test_bow)
y_pred_lstm, lstm_acc = _evaluate_on_test("lstm", lstm, X_test_bow)
y_pred_cnn_bilstm, cnn_bilstm_acc = _evaluate_on_test("cnn-bilstm", cnn_bilstm, X_test_bow)
y_pred_transformer, transformer_acc = _evaluate_on_test("transformer (word2vec)", transformer_word2vec, X_test_word2vec)
y_pred_fc_bert, fc_bert_acc = _evaluate_on_test("fully connected (bert)", fully_connected_bert, X_test_bert)

# summarise results in a table: one row per model with its test accuracy
results = {
    "model": ["cnn", "lstm", "cnn-bilstm", "transformer (word2vec)", "fully connected (bert)"],
    "acc": [cnn_acc, lstm_acc, cnn_bilstm_acc, transformer_acc, fc_bert_acc],
}
results_df = pd.DataFrame(results)

print("\n model performance summary:")
print(results_df)


6. Saving the Best Model

In [None]:

# Persist the cnn, which this notebook treats as the best-performing model.
best_model_path = 'best_online_course_classifier.h5'
cnn.save(best_model_path)
print(f"best model saved as '{best_model_path}'")



7. Prediction on Real-World Input

In [None]:
from keras.models import load_model
import numpy as np

# Restore the classifier written to disk in the previous section.
model = load_model('best_online_course_classifier.h5')

# Example course description to classify; edit as needed.
new_text = "Learn Python programming with real-world projects and interactive coding exercises."

# Apply the same cleaning and the already-fitted bag-of-words vectorizer, so the
# features have the layout of the matrices the saved model was trained on.
processed_text = preprocess_description(new_text)
new_text_features = bow_vectorizer.transform([processed_text]).toarray()

# One input row in, one row of class probabilities out.
predictions = model.predict(new_text_features)
top_class_id = predictions.argmax(axis=1)[0]
# map the integer class id back to its category name
predicted_class = encoder.inverse_transform([top_class_id])[0]
confidence = predictions.max() * 100
print(f"predicted course category: {predicted_class} (confidence: {confidence:.2f}%)")
