In [1]:
import pandas as pd
import os
import glob
import openpyxl
import warnings
import re
import tensorflow_hub as hub
import difflib
import numpy as np
import math
from sklearn.metrics.pairwise import linear_kernel
import pickle
import nltk
nltk.download('punkt',quiet=True)
nltk.download("stopwords",quiet=True)
import string 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

warnings.simplefilter(action='ignore', category=UserWarning)

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

class Course_Loader:
    course_names = {}
    course_codes = {}
    names_list = []
    codes_list = []
    df = -1
    newEntries = False
    cosine_sim = -1
    indices = -1

    def __init__(self):
        data = ['Course Name','Course Code','Course Description']
        self.df = pd.DataFrame(columns = data)
        self.course_codes = {}
        self.course_names = {}
        self.names_list = []
        self.codes_list = []
        self.newEntries = False

    def getDataset(self):
        return self.df
    
    def addCourse(self, f):
        newCourse = Course(f)
        self.course_names[newCourse.name] = newCourse
        self.course_codes[newCourse.code] = newCourse
        new_df = pd.DataFrame({"Course Name":[newCourse.name],"Course Code":[newCourse.code], "Course Description":[newCourse.description]})
        self.df = self.df.append(new_df,ignore_index=True)
        self.newEntries = True # Keeps track of if new entries were added after calculating the similarity check 

    def findbyCode(self,code, exact = False):
        ans = []
        if(not exact):
            code = "".join(code.lower().split())
            out = difflib.get_close_matches(code, self.codes_list, cutoff= 0.0000001, n = 10)
            for i in out:
                ans.append(self.course_codes[i])
        if(code in self.course_codes.keys()):
            ans.append(self.course_codes[code])
        return ans
    
    def findbyName(self,name, exact = False):
        ans = []
        if(not exact):
            name = "".join(name.lower().split())
            out = difflib.get_close_matches(name, self.names_list, cutoff= 0.0000001, n = 10)
            for i in out:
                ans.append(self.course_names[i])
        if(name in self.course_names.keys()):
            ans.append(self.course_names[name])
        return ans

    def initialize_recommender(self):
        if(self.newEntries):
            self.newEntries = False
            self.df = self.df.dropna()
            self.df = self.df.drop_duplicates(subset='Course Name', keep='first')
            self.df = self.df.reset_index(drop=True)
            self.names_list = self.df['Course Name'].unique()
            self.codes_list = self.df['Course Code'].unique()
            encodings = embed(self.df['Course Description'])
            matrix = np.vstack(encodings)
            cosine_sim = linear_kernel(matrix, matrix)
            indices = pd.Series(self.df.index, index=self.df['Course Name'])
            self.cosine_sim = cosine_sim
            self.indices = indices

    def get_recommendations(self, title):
        idx = self.indices[title]
        sim_scores = list(enumerate(self.cosine_sim[idx]))
        return sim_scores
    
    def combine_recommendations(self,titles):
        out = []
        self.initialize_recommender()
        for i in range(len(self.df)):
            out.append([i,0])
        for org_title in titles:            #### Names may or may not be present in the dataset, finds the closest title matching in the dataset
            title = difflib.get_close_matches(org_title, self.names_list, cutoff= 0.0000001, n = 1)[0]  #Searching the closest match in the dataset 
            cur_out = self.get_recommendations(title)
            for score in range(len(cur_out)):
                out[score][1] += cur_out[score][1]
        final_score = sorted(out, key=lambda x: x[1], reverse=True)
        df_indices = [i[0] for i in final_score][1:10]
        result = self.df.iloc[df_indices]
        return result
                     
class Course:

    course_name_list = set([])
    file_name = ""
    code = ""
    description = ""
    embedding = []
    similarity_matrix = []
    file_ptr = ""
    name = ""

    def pre_process_text(self, new_s):
        new_s = ".".join(new_s.split("\n"))
        #new_s = " ".join(new_s.split("-"))
        new_s = new_s.lower()
        # translate_table = dict((ord(char), " ") for char in string.punctuation)   
        # new_s = new_s.translate(translate_table)
        # li = word_tokenize(new_s)
        # stop_words = set(stopwords.words("english"))
        # filter_li = []
        # for words in li:
        #     if(words not in stop_words):
        #         filter_li.append(words)
        # ans = " ".join(filter_li)
        ans = new_s
        return ans

    def __init__(self, file_pointer):
        self.file_name = str(f)
        self.file_ptr = file_pointer
        self.description = self.find_description() + " " + self.find_course_plan()
        self.code = self.find_code()
        self.name = self.find_name()


    def find_description(self):
        workbook = openpyxl.load_workbook(self.file_ptr)
        worksheet = workbook.active
        for coll in range(1,6):
            for roww in range(1,20):
                if("description" in "".join(re.sub(r'[^\w\s]','', re.sub(r'\d+','',str(worksheet.cell(row=roww, column=coll).value).lower())).split())):
                    return str(worksheet.cell(row=roww, column=coll+1).value)
        return "-1"
    
    def find_code(self):
        f = self.file_ptr
        workbook = openpyxl.load_workbook(f)
        worksheet = workbook.active
        for coll in range(1,4):
            for roww in range(1,5):
                cell_val = "".join("".join(str(worksheet.cell(row=roww, column=coll).value).lower().strip().split()).split("-"))
                if('code' in cell_val):
                    cell_val = " ".join("".join(str(worksheet.cell(row=roww, column=coll+1).value).lower().strip().split()).split("-"))
                    return cell_val
        #If not found in the sheet, find in file_name 
        file_name = str(f).split("\\")[-1]
        file_name = file_name.split("/")[-1]
        file_name = file_name.split(".")
        if(len(file_name) >= 2):
            file_name = file_name[-2]
        file_name = "".join(file_name.lower().split())
        return file_name 
    
    def find_name(self):
        f = self.file_ptr
        workbook = openpyxl.load_workbook(f)
        worksheet = workbook.active
        for coll in range(1,4):
            for roww in range(1,5):
                cell_val = "".join("".join(str(worksheet.cell(row=roww, column=coll).value).lower().strip().split()).split("-"))
                if('name' in cell_val):
                    cell_val = " ".join("".join(str(worksheet.cell(row=roww, column=coll+1).value).lower().strip().split()).split("-"))
                    return cell_val
        #If not found in the sheet, find in file_name 
        file_name = str(f).split("\\")[-1]
        file_name = file_name.split("/")[-1]
        file_name = file_name.split(".")
        if(len(file_name) >= 2):
            file_name = file_name[-2]
        file_name = "".join(file_name.lower().split())
        return file_name
    
    def find_course_plan(self):
        f = self.file_ptr
        workbook = openpyxl.load_workbook(f)
        worksheet = workbook.active
        lec_cell = (-1,-1)
        for coll in range(1,20):
            for roww in range(1,41):
                cell_val = "".join("".join(str(worksheet.cell(row=roww, column=coll).value).lower().strip().split()).split("-"))
                if(len(cell_val) >= 2 and len(cell_val) <= 30 and 'lecture' in cell_val and 'topic' in cell_val):
                    lec_cell = (roww,coll)
                    break
            if(lec_cell[0]!=-1):
                break
        if(lec_cell[0] == -1):
            return "-1"
        plan = ""
        for i in range(8):
            new_row = lec_cell[0]+i+1
            new_col = lec_cell[1]
            cell_val = str(worksheet.cell(row=new_row, column=new_col).value).lower().strip()
            plan = plan + ". " + cell_val
        plan = self.pre_process_text(plan)
        plan = ". ".join(plan.split("•"))
        plan = plan + "."
        return plan
    
    def compute_embedding(self):
        return embed([self.description])
   
try:                                              #Loading Course_Loader object if it exists
    #print(1/0)
    with open("Course_Loader_Save", "rb") as f:
        c = pickle.load(f)
except:                                           #Creating a new object if it doesn't exist and saving it on the disk

    path = os.getcwd() # Getting the current directory to access the data files
    path = str(path) + "\\Data\\" # Data must be stored inside the Data folder located in the current directory 

    csv_files = glob.glob(os.path.join(path, "*.xlsx")) # Get names of all csv files
    
    c = Course_Loader()

    for f in csv_files:  #Adding files to the course loader object 
        c.addCourse(f)

    with open("Course_Loader_Save", "wb") as f:   #Saving the new object on the disk 
        pickle.dump(c,f)

query = ["operating systems","computer networks"]
c.combine_recommendations(query) #Making a query

Unnamed: 0,Course Name,Course Code,Course Description
288,operatingsystems,cse231,Operating system is the interface between the ...
46,operatingsystem,cse231,Operating system is the interface between the ...
48,computerarchitectureandoperatingsystems,cse234,The overall objective of this course is to pro...
293,programmablenetworking,cse567,There has been a massive revolution in network...
47,networkadministration,cse233,This course is intended for second year B.Tech...
184,advancedembeddedlogicdesign,ece573,Embedded systems consisting of multi-core proc...
72,distributedsystems:concepts&design,cse530,This objective of this course is to train stud...
75,wirelessnetworks,ece538/cse538,This course will cover a variety of mobile sys...
33,computerorganization,cse112,This course considers a computer as a stand al...


In [1]:
from tkinter import *
from PIL import ImageTk,Image
from tkinter import messagebox

def handle_login():
    email = email_input.get()
    password = password_input.get()

    courses = email.split()
    print(1)
    c.combine_recommendations(courses)
    #print()

root = Tk()

root.title('Login Form')
#root.iconbitmap('favicon.ico')

root.geometry('500x500')

root.configure(background='#0096DC')
img = Image.open('background.jpg')
resized_img = img.resize((70,70))
img = ImageTk.PhotoImage(resized_img)

img_label = Label(root,image=img)
img_label.pack(pady=(10,10))

text_label = Label(root,text='Course Similarity Evaluator',fg='white',bg='#0096DC')
text_label.pack()
text_label.config(font=('verdana',24))

email_label = Label(root,text='Enter The Courses(Comma Separated):',fg='white',bg='#0096DC')
email_label.pack(pady=(20,5))
email_label.config(font=('verdana',12))

email_input = Entry(root,width=50)
email_input.pack(ipady=6,pady=(1,15))

password_label = Label(root,text='Enter Password',fg='white',bg='#0096DC')
password_label.pack(pady=(20,5))
password_label.config(font=('verdana',12))

password_input = Entry(root,width=50)
password_input.pack(ipady=6,pady=(1,15))

login_btn = Button(root,text='Login Here',bg='white',fg='black',width=20,height=2,command=handle_login)
login_btn.pack(pady=(10,20))
login_btn.config(font=('verdana',10))

root.mainloop()

1


Exception in Tkinter callback
Traceback (most recent call last):
  File "c:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_7164\3558949399.py", line 11, in handle_login
    c.combine_recommendations(courses)
NameError: name 'c' is not defined


In [1]:
import pandas as pd
import os
import glob
import openpyxl
import warnings
import re
import tensorflow_hub as hub
import difflib
import numpy as np
import math
from sklearn.metrics.pairwise import linear_kernel
import pickle
from tkinter import *
from PIL import ImageTk,Image
from tkinter import messagebox


warnings.simplefilter(action='ignore', category=UserWarning)

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

class Course_Loader:
    course_names = {}
    course_codes = {}
    names_list = []
    codes_list = []
    df = -1
    newEntries = False
    cosine_sim = -1
    indices = -1

    def __init__(self):
        data = ['Course Name','Course Code','Course Description']
        self.df = pd.DataFrame(columns = data)
        self.course_codes = {}
        self.course_names = {}
        self.names_list = []
        self.codes_list = []
        self.newEntries = False

    def addCourse(self, f):
        newCourse = Course(f)
        self.course_names[newCourse.name] = newCourse
        self.course_codes[newCourse.code] = newCourse
        new_df = pd.DataFrame({"Course Name":[newCourse.name],"Course Code":[newCourse.code], "Course Description":[newCourse.description]})
        self.df = self.df.append(new_df,ignore_index=True)
        self.newEntries = True # Keeps track of if new entries were added after calculating the similarity check 

    def findbyCode(self,code, exact = False):
        ans = []
        if(not exact):
            code = "".join(code.lower().split())
            out = difflib.get_close_matches(code, self.codes_list, cutoff= 0.0000001, n = 10)
            for i in out:
                ans.append(self.course_codes[i])
        if(code in self.course_codes.keys()):
            ans.append(self.course_codes[code])
        return ans
    
    def findbyName(self,name, exact = False):
        ans = []
        if(not exact):
            name = "".join(name.lower().split())
            out = difflib.get_close_matches(name, self.names_list, cutoff= 0.0000001, n = 10)
            for i in out:
                ans.append(self.course_names[i])
        if(name in self.course_names.keys()):
            ans.append(self.course_names[name])
        return ans

    def initialize_recommender(self):
        if(self.newEntries):
            self.newEntries = False
            self.df = self.df.dropna()
            self.df = self.df.drop_duplicates(subset='Course Name', keep='first')
            self.df = self.df.reset_index(drop=True)
            self.names_list = self.df['Course Name'].unique()
            self.codes_list = self.df['Course Code'].unique()
            encodings = embed(self.df['Course Description'])
            matrix = np.vstack(encodings)
            cosine_sim = linear_kernel(matrix, matrix)
            indices = pd.Series(self.df.index, index=self.df['Course Name'])
            self.cosine_sim = cosine_sim
            self.indices = indices

    def get_recommendations(self, title):
        idx = self.indices[title]
        sim_scores = list(enumerate(self.cosine_sim[idx]))
        return sim_scores
    
    def combine_recommendations(self,titles):
        out = []
        self.initialize_recommender()
        for i in range(len(self.df)):
            out.append([i,0])
        for org_title in titles:            #### Names may or may not be present in the dataset, finds the closest title matching in the dataset
            title = difflib.get_close_matches(org_title, self.names_list, cutoff= 0.0000001, n = 1)[0]  #Searching the closest match in the dataset 
            cur_out = self.get_recommendations(title)
            for score in range(len(cur_out)):
                out[score][1] += cur_out[score][1]
        final_score = sorted(out, key=lambda x: x[1], reverse=True)
        df_indices = [i[0] for i in final_score][1:10]
        result = self.df.iloc[df_indices]
        return result
                     
class Course:
    course_name_list = set([])
    file_name = ""
    code = ""
    description = ""
    embedding = []
    similarity_matrix = []
    file_ptr = ""
    name = ""

    def __init__(self, file_pointer):
        self.file_name = str(f)
        self.file_ptr = file_pointer
        self.description = self.find_description()
        self.code = self.find_code()
        self.name = self.find_name()
        if(self.description != -1):
            self.embedding = self.compute_embedding()

    def find_description(self):
        workbook = openpyxl.load_workbook(self.file_ptr)
        worksheet = workbook.active
        description_row = -1
        description_col = -1
        for coll in range(1,6):
            for roww in range(1,20):
                if("description" in "".join(re.sub(r'[^\w\s]','', re.sub(r'\d+','',str(worksheet.cell(row=roww, column=coll).value).lower())).split())):
                    description_row = roww
                    description_col = coll
                    return str(worksheet.cell(row=roww, column=coll+1).value)
        return "-1"
    
    def find_code(self):
        f = self.file_ptr
        workbook = openpyxl.load_workbook(f)
        worksheet = workbook.active
        for coll in range(1,4):
            for roww in range(1,5):
                cell_val = "".join("".join(str(worksheet.cell(row=roww, column=coll).value).lower().strip().split()).split("-"))
                if('code' in cell_val):
                    cell_val = " ".join("".join(str(worksheet.cell(row=roww, column=coll+1).value).lower().strip().split()).split("-"))
                    return cell_val
        #If not found in the sheet, find in file_name 
        file_name = str(f).split("\\")[-1]
        file_name = file_name.split("/")[-1]
        file_name = file_name.split(".")
        if(len(file_name) >= 2):
            file_name = file_name[-2]
        file_name = "".join(file_name.lower().split())
        return file_name 
    
    def find_name(self):
        f = self.file_ptr
        workbook = openpyxl.load_workbook(f)
        worksheet = workbook.active
        for coll in range(1,4):
            for roww in range(1,5):
                cell_val = "".join("".join(str(worksheet.cell(row=roww, column=coll).value).lower().strip().split()).split("-"))
                if('name' in cell_val):
                    cell_val = " ".join("".join(str(worksheet.cell(row=roww, column=coll+1).value).lower().strip().split()).split("-"))
                    return cell_val
        #If not found in the sheet, find in file_name 
        file_name = str(f).split("\\")[-1]
        file_name = file_name.split("/")[-1]
        file_name = file_name.split(".")
        if(len(file_name) >= 2):
            file_name = file_name[-2]
        file_name = "".join(file_name.lower().split())
        return file_name

    def compute_embedding(self):
        return embed([self.description])
   
try:                                              #Loading Course_Loader object if it exists
    with open("Course_Loader_Save", "rb") as f:
        c = pickle.load(f)
except:                                           #Creating a new object if it doesn't exist and saving it on the disk

    path = os.getcwd() # Getting the current directory to access the data files
    path = str(path) + "\\Data\\" # Data must be stored inside the Data folder located in the current directory 

    csv_files = glob.glob(os.path.join(path, "*.xlsx")) # Get names of all csv files
    
    c = Course_Loader()

    for f in csv_files:  #Adding files to the course loader object 
        c.addCourse(f)

    with open("Course_Loader_Save", "wb") as f:   #Saving the new object on the disk 
        pickle.dump(c,f)

query = ["operating systems","computer networks"]
c.combine_recommendations(query)    #Making a query

def handle_login():
    email = email_input.get()
    password = password_input.get()

    courses = email.split(",")
    #print(1)
    print(c.combine_recommendations(courses))
    #print()

root = Tk()

root.title('Login Form')
#root.iconbitmap('favicon.ico')

root.geometry('500x500')

root.configure(background='#0096DC')
img = Image.open('background.jpg')
resized_img = img.resize((70,70))
img = ImageTk.PhotoImage(resized_img)

img_label = Label(root,image=img)
img_label.pack(pady=(10,10))

text_label = Label(root,text='Course Similarity Evaluator',fg='white',bg='#0096DC')
text_label.pack()
text_label.config(font=('verdana',24))

email_label = Label(root,text='Enter The Courses(Comma Separated):',fg='white',bg='#0096DC')
email_label.pack(pady=(20,5))
email_label.config(font=('verdana',12))

email_input = Entry(root,width=50)
email_input.pack(ipady=6,pady=(1,15))

password_label = Label(root,text='Enter Password',fg='white',bg='#0096DC')
password_label.pack(pady=(20,5))
password_label.config(font=('verdana',12))

password_input = Entry(root,width=50)
password_input.pack(ipady=6,pady=(1,15))

login_btn = Button(root,text='Login Here',bg='white',fg='black',width=20,height=2,command=handle_login)
login_btn.pack(pady=(10,20))
login_btn.config(font=('verdana',10))

root.mainloop()

                                 Course Name            Course Code  \
46                           operatingsystem                 cse231   
39                 introductiontoprogramming                 cse101   
228                introductionofprogramming                 cse101   
0                        advancedprogramming                 cse201   
48   computerarchitectureandoperatingsystems                 cse234   
287       objectorientedprogramminganddesign        cse600a/ece600a   
56                            computervision  cse344/544,ece344/544   
44   fundamentalsofdatabasemanagementsystems                 cse202   
4                algorithmdesign&analysis(b)                 cse223   

                                    Course Description  
46   Operating system is the interface between the ...  
39   Introduction of Programming is the first progr...  
228  Introduction of Programming is the first progr...  
0    The Advanced Programming is a successor to the...  
48  

In [6]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def generate_similarity_explanation(course1_title, course1_description, course2_title, course2_description):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    prompt = f"Explain why the following two courses are similar:\n\nCourse 1: {course1_title}\n{course1_description}\n\nCourse 2: {course2_title}\n{course2_description}\n\nExplanation: "
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
    explanation = tokenizer.decode(output[0], skip_special_tokens=True)

    return explanation

# Example course descriptions
course1_title = "Introduction to Data Science"
course1_description = "This is a course about science"

course2_title = "Data Science Foundations"
course2_description = "This is a course about maths"

explanation = generate_similarity_explanation(course1_title, course1_description, course2_title, course2_description)
f = open("exp.txt","w+")
f.write(explanation)
f.close()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [8]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

def extract_sentences(course_description):
    sentences = course_description.split(".")
    return [sentence.strip() for sentence in sentences if sentence.strip()]

def get_sentence_embeddings(sentences):
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    embeddings = []
    for sentence in sentences:
        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        with torch.no_grad():
            output = model(input_ids)
        embeddings.append(output[0][:, 0, :].numpy())

    return np.vstack(embeddings)

def compare_courses(course1_description, course2_description):
    course1_sentences = extract_sentences(course1_description)
    course2_sentences = extract_sentences(course2_description)

    course1_embeddings = get_sentence_embeddings(course1_sentences)
    course2_embeddings = get_sentence_embeddings(course2_sentences)

    similarity_matrix = cosine_similarity(course1_embeddings, course2_embeddings)

    most_similar_indices = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)
    most_dissimilar_indices = np.unravel_index(np.argmin(similarity_matrix), similarity_matrix.shape)

    return {
        "similar": (course1_sentences[most_similar_indices[0]], course2_sentences[most_similar_indices[1]]),
        "dissimilar": (course1_sentences[most_dissimilar_indices[0]], course2_sentences[most_dissimilar_indices[1]])
    }

# Example course descriptions
course1_description = "This course covers the fundamentals of data science, including data manipulation, visualization, and basic statistical analysis. Students will learn how to work with various data formats and use Python libraries such as Pandas and Matplotlib."
course2_description = "In this course, students will explore the core concepts of data science, including data cleaning, visualization, and basic statistical techniques. The course will focus on using Python and popular libraries like Pandas and Matplotlib to analyze and visualize data."

comparison = compare_courses(course1_description, course2_description)
print("Similar points:")
print(comparison["similar"])
print("\nDissimilar points:")
print(comparison["dissimilar"])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.pr

Similar points:
('This course covers the fundamentals of data science, including data manipulation, visualization, and basic statistical analysis', 'In this course, students will explore the core concepts of data science, including data cleaning, visualization, and basic statistical techniques')

Dissimilar points:
('This course covers the fundamentals of data science, including data manipulation, visualization, and basic statistical analysis', 'The course will focus on using Python and popular libraries like Pandas and Matplotlib to analyze and visualize data')


In [2]:
import pandas as pd
import sys 
# Load your dataset
data = c.df

# Combine course names and descriptions
data["text"] = data["Course Name"] + ": " + data["Course Description"]
# Save the text data to a file
with open("training_data.txt", "w+", encoding='utf-8') as f:
    for text in data["text"]:
        f.write(text + "\n")

In [3]:
from datasets import load_dataset
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

# Load the dataset
dataset = load_dataset("text", data_files={"train": "training_data.txt"})
train_dataset = dataset["train"]

# Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_dataset = train_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=128), batched=True)

# Load the GPT-Neo model
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm


Downloading and preparing dataset text/default to C:/Users/Administrator/.cache/huggingface/datasets/text/default-c586b4e8b33a86b8/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1800.13it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 139.15it/s]
                                                        

Dataset text downloaded and prepared to C:/Users/Administrator/.cache/huggingface/datasets/text/default-c586b4e8b33a86b8/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 93.68it/s]
Using pad_token, but it is not set yet.
  0%|          | 0/429 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [9]:
from transformers import GPT2Tokenizer, GPTNeoForCausalLM

def generate_similarity_explanation(course1_description, course2_description):
    model_name = "output"  # The output directory from the fine-tuning step
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPTNeoForCausalLM.from_pretrained(model_name)

    prompt = f"Explain why the following two courses are similar based on their descriptions:\n\nCourse 1: {course1_description}\n\nCourse 2: {course2_description}\n\nExplanation: "
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
    explanation = tokenizer.decode(output[0], skip_special_tokens=True)

    return explanation

# Example course descriptions
course1_description = "This course covers the fundamentals of data science, including data manipulation, visualization, and basic statistical analysis. Students will learn how to work with various data formats and use Python libraries such as Pandas and Matplotlib."
course2_description = "In this course, students will explore the core concepts of data science, including data cleaning, visualization, and basic statistical techniques. The course will focus on using Python and popular libraries like Pandas and Matplotlib to analyze and visualize data."

explanation = generate_similarity_explanation(course1_description, course2_description)
print(explanation)


OSError: output is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.