In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.regularizers import l2

Prepare the Dataset

In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
filepath_2023 = "/content/gdrive/MyDrive/Colab Notebooks/data/2023 S_S COURSE.xlsx"
filepath_2022 = "/content/gdrive/MyDrive/Colab Notebooks/data/2022 F_S COURSE.xlsx"

In [None]:
df = pd.read_excel(filepath_2023)
df1 = pd.read_excel(filepath_2022)

In [None]:
df

Unnamed: 0,Course Name,Unit,Job Label
0,Political science,Political Science,Diplomat
1,Political science,Political Science,Lawyer
2,Political science,Political Science,Politician
3,Economics,Economics,Accountant
4,Economics,Economics,Bank Staff
...,...,...,...
1537,Cultural Creativity Adding Value,Digital Content and Technologies/B/4,Engineer
1538,Computer Animation,Digital Content and Technologies/B/4,Engineer
1539,Interaction Technologies Research and Discussion,Digital Content and Technologies/B/4,Engineer
1540,Artificial Intelligence and Digital Content,Digital Content and Technologies/B/4,Engineer


In [None]:
df1

Unnamed: 0,Course Name,Unit,Job Label
0,Macroeconomics,Economics,Economist
1,Macroeconomics,Economics,Supply Chain Manager
2,Economics,Economics,Economist
3,Economics,Economics,Supply Chain Manager
4,Public Finance,Public Finance,Bank staff
...,...,...,...
1443,Contemporary Aesthetics and Curation,Digital Content and Technologies/B/4,Engineer
1444,Special Projects on Digital Content and Techno...,Digital Content and Technologies/B/4,Engineer
1445,Video Pos-production and Visual Effects,Digital Content and Technologies/B/4,Engineer
1446,Blockchain Application Development,Digital Content and Technologies/B/4,Engineer


In [None]:
df_combined = pd.concat([df[['Job Label', 'Course Name', 'Unit']], df1[['Job Label', 'Course Name', 'Unit']]])

In [None]:
df_combined['Course Name'] = df_combined['Course Name'].str.lower()
df_combined['Job Label'] = df_combined['Job Label'].str.lower()

In [None]:
df_combined

Unnamed: 0,Job Label,Course Name,Unit
0,diplomat,political science,Political Science
1,lawyer,political science,Political Science
2,politician,political science,Political Science
3,accountant,economics,Economics
4,bank staff,economics,Economics
...,...,...,...
1443,engineer,contemporary aesthetics and curation,Digital Content and Technologies/B/4
1444,engineer,special projects on digital content and techno...,Digital Content and Technologies/B/4
1445,engineer,video pos-production and visual effects,Digital Content and Technologies/B/4
1446,engineer,blockchain application development,Digital Content and Technologies/B/4


Data Processing

In [None]:
encoder = LabelEncoder()
df_combined['encoded_job'] = encoder.fit_transform(df_combined['Job Label'])
num_classes = len(encoder.classes_)

In [None]:
train_df, test_df = train_test_split(df_combined, test_size=0.2, random_state=42)

In [None]:
train_df['Course Name'] = train_df['Course Name'].astype(str)
test_df['Course Name'] = test_df['Course Name'].astype(str)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['Course Name'])
vocab_size = len(tokenizer.word_index) + 1

In [None]:
train_sequences = tokenizer.texts_to_sequences(train_df['Course Name'])
test_sequences = tokenizer.texts_to_sequences(test_df['Course Name'])

In [None]:
max_sequence_length = max(len(seq) for seq in train_sequences)
train_data = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_data = pad_sequences(test_sequences, maxlen=max_sequence_length)

In [None]:
num_features = max_sequence_length
num_classes = len(df_combined['Course Name'].unique())

In [None]:
num_classes = len(encoder.classes_)
train_labels = to_categorical(train_df['encoded_job'], num_classes=num_classes)
test_labels = to_categorical(test_df['encoded_job'], num_classes=num_classes)

Build DNN Model

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_sequence_length))
model.add(LSTM(256, dropout=0.5, recurrent_dropout=0.5, return_sequences=True, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)))
model.add(LSTM(256, dropout=0.5, recurrent_dropout=0.5, return_sequences=True, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)))
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))



In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(train_data, train_labels, epochs=20, batch_size=32, validation_data=(test_data, test_labels))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f5e3c547cd0>

Evaluate Model

In [None]:
test_labels = to_categorical(test_df['encoded_job'], num_classes)
loss, accuracy = model.evaluate(test_data, test_labels, verbose=0)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 2.5960206985473633
Test Accuracy: 0.3193979859352112


Recommendation

In [None]:
unique_job_labels = set(df_combined['Job Label'].str.lower().str.strip()) - {'nan'}

print('=== Job Recommendation Chatbot ===')
print('Hi! Welcome to the Job Recommendation Chatbot.')
print('Please enter one of the following job labels:')

for job_label in unique_job_labels:
    print(job_label)

user_job = input('Enter your job label (or type "exit" to quit): ').lower()

if user_job == 'exit':
    print('Thank you for using the Job Recommendation Chatbot. Goodbye!')
else:
    while True:
        if user_job not in unique_job_labels:
            print('Invalid job label. Please enter a valid job label from the list.')
        else:
            encoded_job = encoder.transform([user_job])
            prediction = model.predict(pad_sequences([encoded_job], maxlen=max_sequence_length))
            predicted_class = prediction.argmax(axis=1)
            recommended_courses = df_combined.loc[df_combined['encoded_job'] == encoded_job[0], ['Course Name', 'Unit']]

            if recommended_courses.empty:
                print('No recommended courses found for the given job label.')
            else:
                random_courses = random.sample(recommended_courses.values.tolist(), k=15)
                print('Recommended Courses:')
                for course, unit in random_courses:
                    print(f'{course} ({unit})')

        user_job = input('Enter your job label (or type "exit" to quit): ').lower()

        if user_job == 'exit':
            print('Thank you for using the Job Recommendation Chatbot. Goodbye!')
            break


=== Job Recommendation Chatbot ===
Hi! Welcome to the Job Recommendation Chatbot.
Please enter one of the following job labels:
journalist
designer
accountant
economist
politician
historian
diplomat
public relations specialist
marketer
sociologist
lawyer
supply chain manager
insurance solicitor
human resources generalist
researcher
consultant
professor
director
entrepreneur
therapist
interpreter
copywriter
finance officer
project manager
engineer
bank staff
bureaucrat
cyber security
architect
real estate
hotel staff
