## Name: Fareed Hassan Khan
## ERP ID: 25367
## Text Analytics Final Project Notebook Code 

Importing Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import openai
from getpass import getpass
from openai.embeddings_utils import get_embedding
import gzip

# davinchi model for chatbot.
EMBEDDINGS_MODEL = "text-embedding-ada-002"

openai.api_key = getpass("Enter your OpenAI API Key")

Scraping Total number of faculty profiles along with their links 

In [None]:
# The base URL for the faculty profiles page
base_url = "https://www.iba.edu.pk/faculty-profiles.php"

faculty_list_with_department = []

links = []

# The available options for faculty type and department
faculty_options = ["Fulltime+Faculty", "Visiting Faculty"]
faculty_options = [s.replace(" ", "+") for s in faculty_options]
department_options = [s.replace(" ", "+") for s in ['Accounting & Law', 'Computer Science', 'Economics', 'Finance', 'Management', 'Marketing', 
                                                    'Mathematical Sciences', 'Social Sciences & Liberal Arts']]

faculty_list_with_department = [f"{base_url}?school=&facultylist={f}&departmentlist={d}&faculty_name=" for f in faculty_options for d in department_options] + \
                                [f"{base_url}?school=&facultylist={f}&departmentlist=&faculty_name=" for f in faculty_options] + \
                                [f"{base_url}?school=&facultylist=&departmentlist={d}&faculty_name=" for d in department_options] + \
                                [f"{base_url}"]

for each_link in faculty_list_with_department:
  response = requests.get(each_link)
  soup = BeautifulSoup(response.content, "html.parser")

  faculty_cards = soup.find_all("div", {"class": "faculty-profile-card"})

  for card in faculty_cards:
      profile_link = card.find("span").find("a")["href"]
      links.append('https://www.iba.edu.pk/' + profile_link)

Printing total number of profiles on IBA website

In [2]:
import pickle

# Load the list back using pickle
with open('profiles_list.pkl', 'rb') as f:
    total_unique_profiles = pickle.load(f)

# total_unique_profiles = list(set(links))
print(f'Total number of profiles are {len(total_unique_profiles)}')

Total number of profiles are 478


Getting complete Information of each profile

In [None]:
names = []
basic_information = []
main_list = []
for each_profile in total_unique_profiles:
    response = requests.get(each_profile)
    soup = BeautifulSoup(response.content, 'html.parser')
    basic_info = soup.find('div', {'class': 'faculty-profile-info'})
    name, position = [elem.text.strip() for elem in basic_info.find_all(['h1', 'h2'])]
    department, specialization, onboard_status, email = [elem.text.strip().replace(f, '') for elem, f in zip(basic_info.find_all('h3'), ['Department:', 'Specialization:', 'Onboard Status:', 'Email:'])]
    output = f"{name} is an {position} at the IBA, Karachi. Specializing in {specialization} and belongs to department of {department}. Currently {onboard_status} for onboard opportunities and can be contacted via email at {email}." if specialization else f'{name} is an {position} at the IBA Karachi. Currently available for onboard opportunities and can be contacted via email at {email}.'
    names.append(name)
    basic_information.append(output)

    info = {}

    for section in ['overview', 'qualification', 'interests', 'courses', 'output', 'projects']:
        if section != 'output':
            items = soup.find('div', {'id': section}).find_all('li')
            if len(items) > 0:
                info[section] = [item.text for item in items]
            else:
                info[section] = [f'No {section} found in the profile provided by IBA']
        if section == 'projects':
            items = soup.find('div', {'id': 'projects'}).find_all('li')
            if len(items) > 0:
                info[section] = [item.get_text(strip=True) for item in items]
            else:
                info[section] = [f'No {section} found in the profile provided by IBA']
        else:
            for h4 in soup.find('div', {'id': section}).find_all('h4')[1:]:
                key = h4.text.strip()
                ul = h4.find_next('ul').find_all('li')
                if len(ul) > 0:
                    for each in range(0,len(ul)):
                        ul[each] = ul[each].text.strip()
                    info[key] = ul
                else:
                    info[key] = [f'No {h4.text.strip()} found in the profile provided by IBA']
    main_list.append(info)

Converting the final output into a Dataframe

In [None]:
df = pd.DataFrame({'names':names, 'basic_information':basic_information, 'profile_link':total_unique_profiles, 'complete information':main_list})

Reading the saved Dataframe

In [2]:
df = pd.read_csv('finalized_dataframe.csv')
df.head()

Unnamed: 0,names,basic_information,profile_link,complete information
0,Syed Ameer Hasan Rizvi,Syed Ameer Hasan Rizvi is an Lecturer at the I...,https://www.iba.edu.pk/faculty-profile.php?fty...,{'overview': ['Lecturer at Institute of Busine...
1,Sayem Ali,"Sayem Ali is an Visiting Faculty at the IBA, K...",https://www.iba.edu.pk/faculty-profile.php?fty...,{'overview': ['- Consultant/Financial Markets ...
2,Dr. Naved Ahmad,"Dr. Naved Ahmad is an Professor at the IBA, Ka...",https://www.iba.edu.pk/faculty-profile.php?fty...,{'overview': ['Professor at Institute of Busin...
3,Summer Qassim,"Summer Qassim is an Lecturer at the IBA, Karac...",https://www.iba.edu.pk/faculty-profile.php?fty...,{'overview': ['Lecturer at Institute of Busine...
4,,is an at the IBA Karachi. Currently availabl...,https://www.iba.edu.pk/faculty-profile.php?fty...,{'overview': ['No overview found in the profil...


Removing any NAN and duplicate values

In [3]:
# drop row that contain NaN in the 'names' column.
df.dropna(subset=['names'], inplace=True)
# drop duplicate rows from names column
df.drop_duplicates(subset=['basic_information'], inplace=True)

Converting datatype to original formats

In [4]:
from ast import literal_eval

def try_literal_eval(s):
    try:
        return literal_eval(s)
    except ValueError:
        return s
    
df['complete information'] = df['complete information'].apply(try_literal_eval)
# remove double spaces using regex on complete dataframe.
df['basic_information'] = df['basic_information'].str.replace('  ', ' ')

Making the information readable for ChatGPT

In [5]:
df['finalized_text'] = df['complete information'].apply(lambda x: {k: '\n'.join(v) for k, v in x.items()})

# define a lambda function to replace 'overview' with 'professional experience'
replace_over_with_prof = lambda x: {k.replace('overview', 'professional experience'): v for k, v in x.items()}

# apply the lambda function to each dictionary in the DataFrame
df['finalized_text'] = df['finalized_text'].apply(replace_over_with_prof)

for each in range(0,df.shape[0]):
    df['finalized_text'].iloc[each]['professional experience'] = df['names'].iloc[each] + ' has following experience:\n' + df['finalized_text'].iloc[each]['professional experience']
    df['finalized_text'].iloc[each]['qualification'] = df['names'].iloc[each] + ' has following qualifications:\n' + df['finalized_text'].iloc[each]['qualification']
    df['finalized_text'].iloc[each]['interests'] = df['names'].iloc[each] + ' has following interests:\n' + df['finalized_text'].iloc[each]['interests']
    df['finalized_text'].iloc[each]['courses'] = df['names'].iloc[each] + ' teaches following courses:\n' + df['finalized_text'].iloc[each]['courses']
    df['finalized_text'].iloc[each]['Journal Publication(s)'] = df['names'].iloc[each] + ' has following Journal Publications:\n' + df['finalized_text'].iloc[each]['Journal Publication(s)']
    df['finalized_text'].iloc[each]['Conference(s)'] = df['names'].iloc[each] + ' has following conferences:\n' + df['finalized_text'].iloc[each]['Conference(s)']
    df['finalized_text'].iloc[each]['Other(s)'] = df['names'].iloc[each] + ' other kind of information:\n' + df['finalized_text'].iloc[each]['Other(s)']
    df['finalized_text'].iloc[each]['Book(s)'] = df['names'].iloc[each] + ' has written following books:\n' + df['finalized_text'].iloc[each]['Book(s)']
    df['finalized_text'].iloc[each]['Case(s)'] = df['names'].iloc[each] + ' has following case studies:\n' + df['finalized_text'].iloc[each]['Case(s)']
    df['finalized_text'].iloc[each]['projects'] = df['names'].iloc[each] + ' has done following projects:\n' + df['finalized_text'].iloc[each]['projects']

Creating a finalized Dataframe for embedding calculation

In [7]:
information_profile = []

for each in range(0,df.shape[0]):
    temporary_data = pd.DataFrame({'information' : df.iloc[each].finalized_text})
    temporary_data.reset_index(inplace=True, drop=True)
    information_profile.append(temporary_data)

finalized_data = pd.concat(information_profile, axis=0, ignore_index=True)

Pricing calculation for embedding

In [107]:
col1_str = ' '.join(finalized_data['information'].astype(str))
total_words = len(col1_str.split())

Tokens = 3/4 * total_words
price_per_thousand_tokens = 0.0004

print(f'Total embedding cost is $0.12 dollar equivalent to 40 PKR') 

Total embedding cost is $0.12 dollar equivalent to 40 PKR


Calculating Embedding

In [None]:
def apply_embedding(row):
    print('done')
    embedding_vector = get_embedding(row, engine='text-embedding-ada-002')
    return embedding_vector

embedded_data = pd.read_csv('finalized_data_with_embedding.csv')
embedded_data['embedding'] = embedded_data['embedding'].apply(try_literal_eval)

Saving information data and embedded vectors in compressed pickle file to save memory **because finalized data is of 150 MB** 

In [5]:
# read embedded dataframe.
embedded_data = pd.read_csv('finalized_data_with_embedding.csv')
embedded_data['embedding'] = embedded_data['embedding'].apply(try_literal_eval)

# saving the information is an array to retrieve it faster
information_array = np.array(embedded_data.information)
embedding_array = np.array(embedded_data.embedding)

Saving both the information and embedded vectors in compressed files and reading it

In [6]:
data_arrays = np.array([embedding_array, information_array])

# # compress the array and save it to disk
with gzip.open('vectorized_data.npy.gz', 'wb') as f:
    np.save(f, data_arrays)

In [None]:
with gzip.open('vectorized_data.npy.gz', 'rb') as f:
    data_array = np.load(f, allow_pickle=True)

Asking the question

In [34]:
question = "who is the program director of MS IBF program?"


question_vector = get_embedding(question, engine='text-embedding-ada-002')

question_vector_transform = np.array(question_vector).reshape(1, -1)

Finding Cosine Similarity of question with information

In [36]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = []

for each in data_array[0]:
    v2 = np.array(each).reshape(1, -1)
    # compute the cosine similarity
    cosinesimilarity = cosine_similarity(question_vector_transform, v2)[0][0]
    similarity.append(cosinesimilarity)

similarity = np.array(similarity)

Sorting the information based on cosine similarity

In [None]:
# Get the sorted indices of y in descending order
sorted_indices = np.argsort(-similarity)

# Sort x and y based on sorted_indices
information_this = ' '.join(data_array[1][sorted_indices[0:3]])
cosinesimilarity_this = similarity[sorted_indices[0]]

Using GPT-3.5 (Text-davinchi model) for answering the question using context and prompt

In [38]:
prompt = f"""Answer only IBA related question using only the context below, Answer it in professional way, if you are not able to answer the question make sure you reply that you have trained on a very small dataset, dont provide any information about the context
Context:
{information_this}

Q: {question}
A:"""

openai.Completion.create(
    prompt=prompt,
    temperature=1,
    max_tokens=500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model='text-davinci-003'
)["choices"][0]["text"].strip(" \n")

'Dr. Irum Saba is the Program Director of MS-IBF at Institute of Business Administration, Karachi (Mar-2021 to Present).'

____________