In [1]:
import pandas as pd

# Load your dataset (example with a CSV file)
#df = pd.read_csv("courses1.csv", encoding='utf-8', errors='ignore')  # Skips invalid characters
df = pd.read_csv(r"C:\Users\deepi\analytics vidhya\courses1.csv",encoding='ISO-8859-1')

# Example of dataset structure
print(df.head())


                                               Title  \
0  Improving Real World RAG Systems: Key Challeng...   
1  Framework to Choose the Right LLM for your Bus...   
2                      Generative AI - A Way of Life   
3  Building LLM Applications using Prompt Enginee...   
4   Bagging and Boosting ML Algorithms - Free Course   

                                         Description  \
0  This course explores the key challenges in bui...   
1  This course will guide you through the process...   
2  This course is a transformative journey tailor...   
3  This course will provide you with a hands-on u...   
4  This course will provide you with a hands-on u...   

                                          Curriculum  
0  Improving Real World RAG System\nIntroduction ...  
1  Introduction\nIntroduction\n2\nIt's an LLM Wor...  
2  Introduction to Generative AI\nFundamentals of...  
3  How to build diffferent LLM AppIications?\nInt...  
4  Bagging\nResources to be used in this course\n..

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Example function to create the input prompt
def create_prompt(title, description, curriculum):
    # Define the prompt structure
    prompt = f"Course Title: {title}\nDescription: {description}\nCurriculum: {curriculum}\n"
    prompt += "Summarize this course or recommend it based on the description."
    return prompt

# Dataset class for handling course data
class CourseDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        title = self.df.iloc[idx]['Title']
        description = self.df.iloc[idx]['Description']
        curriculum = self.df.iloc[idx]['Curriculum']
        
        # Generate prompt
        input_text = create_prompt(title, description, curriculum)
        
        # Tokenize input and output
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        # Set the target output as a summary or recommendation task
        labels = self.tokenizer.encode_plus(
            "Summarize this course",
            max_length=100,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        }

# Initialize Dataset and DataLoader
dataset = CourseDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Define training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Train for 3 epochs
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backpropagation
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

print("Training Complete!")


Epoch 1, Loss: 28.3580379486084
Epoch 1, Loss: 28.02680015563965
Epoch 1, Loss: 28.00735855102539
Epoch 2, Loss: 24.201677322387695
Epoch 2, Loss: 22.90985107421875
Epoch 2, Loss: 20.708946228027344
Epoch 3, Loss: 17.513511657714844
Epoch 3, Loss: 18.393674850463867
Epoch 3, Loss: 17.59160804748535
Training Complete!


In [4]:
# Test the fine-tuned model with a new prompt
def generate_course_summary(model, tokenizer, title, description, curriculum, max_length=100):
    model.eval()
    input_text = create_prompt(title, description, curriculum)
    
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    input_ids = input_ids.to(device)
    
    # Generate output (summary or recommendation)
    summary_ids = model.generate(input_ids=input_ids, max_length=max_length, num_beams=2, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

# Example usage
title = "Introduction to Deep Learning"
description = "This course covers the basics of neural networks and deep learning."
curriculum = "Week 1: Neural Networks, Week 2: Backpropagation, Week 3: CNNs"

summary = generate_course_summary(model, tokenizer, title, description, curriculum)
print(f"Generated Summary: {summary}")


Generated Summary: Title: Introduction to Deep Learning Description: This course covers basic neural networks and deep learning. Curriculum: Week 1: Neural Networks, Week 2: Backpropagation, Week 3: CNNs Summarize this course or recommend it based on the description.


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assume you have a dataframe `courses_df` with your courses data
# Sample structure of courses_df:
# courses_df = pd.DataFrame({
#     'title': ["Deep Learning", "Machine Learning", "Data Science"],
#     'description': ["Learn deep learning...", "Understand machine learning...", "Introduction to data science..."],
#     'curriculum': ["Curriculum for deep learning...", "Curriculum for machine learning...", "Curriculum for data science..."]
# })

def recommend_courses(search_query, courses_df, top_n=3):
    # Combine course titles and descriptions for vectorization
    courses_df['combined'] = courses_df['title'] + " " + courses_df['description']

    # Vectorize the search query and the courses
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(courses_df['combined'].tolist() + [search_query])

    # Calculate cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

    # Get the indices of the top_n most similar courses
    similar_indices = cosine_sim[0].argsort()[-top_n:][::-1]
    
    # Return recommended courses
    recommendations = courses_df.iloc[similar_indices]
    return recommendations[['title', 'description']]

# Example usage
search_query = "deep learning and neural networks"
recommended_courses = recommend_courses(search_query, courses_df)

print("Recommended Courses:")
print(recommended_courses)


NameError: name 'courses_df' is not defined

In [5]:
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load dataset (example with a CSV file containing course data)
df = pd.read_csv('courses1.csv')  # Your dataset with course title, description, and curriculum

# Function to create a search input (concatenate title, description, and curriculum)
def create_prompt(title, description, curriculum):
    return f"{title} {description} {curriculum}"

# Search for the most relevant course
def search_courses(df, query, top_n=5):
    # Combine course information into a single text field for TF-IDF search
    df['combined'] = df['title'] + ' ' + df['description'] + ' ' + df['curriculum']
    
    # Vectorize the combined course text using TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['combined'])
    
    # Vectorize the query using the same vectorizer
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity between the query and all courses
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Get top N related courses based on similarity
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    recommendations = df.iloc[top_indices]
    
    return recommendations[['title', 'description', 'curriculum']]

# Generate a course recommendation
def generate_course_recommendation(query, df, top_n=5):
    recommendations = search_courses(df, query, top_n)
    return recommendations

# Example usage
query = "Neural Networks and Deep Learning"
recommendations = generate_course_recommendation(query, df)

print(f"Top {len(recommendations)} recommended courses for '{query}':\n")
print(recommendations)


ModuleNotFoundError: No module named 'sklearn'