In [37]:
from pymongo import MongoClient
import pandas as pd

In [38]:
#connect to mongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client.learning
collection = db.courses

In [39]:
#Fetch data
data = list(collection.find({}, {"_id": 0}))
df = pd.DataFrame(data)

#save to CSV
df.to_csv("courses.csv", index=False)

print(df.head())

              category             title        tutor  duration  \
0             Business  Business Related  Kandy Tutor         3   
1           Technology           sdsdsfd        sdsdv         3   
2       Art and Design           Drawing        Royal         3   
3  Health and Wellness             mafas         mafa         5   

            description                    image                    video  \
0  Hi hellow i am mafas  image-1736362810912.png  video-1736362810915.mp4   
1                 dsvsv  image-1736416029981.png  video-1736416029983.mp4   
2             ajkabskja  image-1736441730633.png  video-1736441730781.mp4   
3                sjankx  image-1736448446219.png  video-1736448446222.mp4   

                    tutorId               createdAt               updatedAt  \
0  677ecaecd60271cc5e433285 2025-01-08 19:00:10.952 2025-01-08 19:00:10.952   
1  677ecaecd60271cc5e433285 2025-01-09 09:25:35.797 2025-01-09 09:47:10.003   
2  677fff4a4b252bc2198f4b95 2025-01-09 16

In [40]:
df = pd.read_csv("courses.csv")

In [41]:
#Drop Irrelevent coulmns
columns_to_drop = ['image','video','createdAt','updatedAt']

df.drop(columns=columns_to_drop, inplace=True)

In [42]:
df

Unnamed: 0,category,title,tutor,duration,description,tutorId,__v
0,Business,Business Related,Kandy Tutor,3,Hi hellow i am mafas,677ecaecd60271cc5e433285,0
1,Technology,sdsdsfd,sdsdv,3,dsvsv,677ecaecd60271cc5e433285,0
2,Art and Design,Drawing,Royal,3,ajkabskja,677fff4a4b252bc2198f4b95,0
3,Health and Wellness,mafas,mafa,5,sjankx,677fff4a4b252bc2198f4b95,0


In [43]:
#Normalized Text Columns

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

In [44]:
df

Unnamed: 0,category,title,tutor,duration,description,tutorId,__v
0,Business,Business Related,Kandy Tutor,3,Hi hellow i am mafas,677ecaecd60271cc5e433285,0
1,Technology,sdsdsfd,sdsdv,3,dsvsv,677ecaecd60271cc5e433285,0
2,Art and Design,Drawing,Royal,3,ajkabskja,677fff4a4b252bc2198f4b95,0
3,Health and Wellness,mafas,mafa,5,sjankx,677fff4a4b252bc2198f4b95,0


In [45]:
text_columns = ['category', 'title', 'tutor', 'description']
for col in text_columns:
    df[col] = df[col].apply(clean_text)

print(df)

              category             title        tutor  duration  \
0             business  business related  kandy tutor         3   
1           technology           sdsdsfd        sdsdv         3   
2       art and design           drawing        royal         3   
3  health and wellness             mafas         mafa         5   

            description                   tutorId  __v  
0  hi hellow i am mafas  677ecaecd60271cc5e433285    0  
1                 dsvsv  677ecaecd60271cc5e433285    0  
2             ajkabskja  677fff4a4b252bc2198f4b95    0  
3                sjankx  677fff4a4b252bc2198f4b95    0  


In [46]:
df

Unnamed: 0,category,title,tutor,duration,description,tutorId,__v
0,business,business related,kandy tutor,3,hi hellow i am mafas,677ecaecd60271cc5e433285,0
1,technology,sdsdsfd,sdsdv,3,dsvsv,677ecaecd60271cc5e433285,0
2,art and design,drawing,royal,3,ajkabskja,677fff4a4b252bc2198f4b95,0
3,health and wellness,mafas,mafa,5,sjankx,677fff4a4b252bc2198f4b95,0


In [47]:
#Handle missing values
df.fillna('', inplace=True)

In [48]:
df

Unnamed: 0,category,title,tutor,duration,description,tutorId,__v
0,business,business related,kandy tutor,3,hi hellow i am mafas,677ecaecd60271cc5e433285,0
1,technology,sdsdsfd,sdsdv,3,dsvsv,677ecaecd60271cc5e433285,0
2,art and design,drawing,royal,3,ajkabskja,677fff4a4b252bc2198f4b95,0
3,health and wellness,mafas,mafa,5,sjankx,677fff4a4b252bc2198f4b95,0


In [49]:
#Ensure numeric columns
df['duration'] = pd.to_numeric(df['duration'], errors='coerce')  # Convert to numeric
df['duration'] = df['duration'].fillna(0)  # Fill missing durations with 0

In [50]:
df

Unnamed: 0,category,title,tutor,duration,description,tutorId,__v
0,business,business related,kandy tutor,3,hi hellow i am mafas,677ecaecd60271cc5e433285,0
1,technology,sdsdsfd,sdsdv,3,dsvsv,677ecaecd60271cc5e433285,0
2,art and design,drawing,royal,3,ajkabskja,677fff4a4b252bc2198f4b95,0
3,health and wellness,mafas,mafa,5,sjankx,677fff4a4b252bc2198f4b95,0


In [51]:
print(df.dtypes)

category       object
title          object
tutor          object
duration        int64
description    object
tutorId        object
__v             int64
dtype: object


In [52]:
#Encode tutorID

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['tutorId'] = encoder.fit_transform(df['tutorId'])

In [53]:
df['combined_text'] = df['category'] + '' + df['title'] + '' + df['description']

In [54]:
df

Unnamed: 0,category,title,tutor,duration,description,tutorId,__v,combined_text
0,business,business related,kandy tutor,3,hi hellow i am mafas,0,0,businessbusiness relatedhi hellow i am mafas
1,technology,sdsdsfd,sdsdv,3,dsvsv,0,0,technologysdsdsfddsvsv
2,art and design,drawing,royal,3,ajkabskja,1,0,art and designdrawingajkabskja
3,health and wellness,mafas,mafa,5,sjankx,1,0,health and wellnessmafassjankx


In [55]:
df.to_csv("cleaned_courses.csv", index=False)

In [56]:
#Recomendation

In [57]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [58]:
#Build the recomendation system   Content Based filtering
#convert text features into numerical representation (e.g. TF-IDF) 

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_text'])

#Compute similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [60]:
def get_recommendations(title, df, cosine_sim):
    if title not in df['title'].values:
        return f"Title '{title}' not found in the dataset."
    
    idx = df[df['title'] == title].index[0]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:6]
    course_indices = [i[0] for i in sim_scores]

    return df['title'].iloc[course_indices]

In [63]:
recommendations = get_recommendations('mafas', df, cosine_sim)

if isinstance(recommendations, str):  
    print(recommendations)
else:
    print("Recommended Courses:")
    print(recommendations)

Recommended Courses:
0    business related
1             sdsdsfd
2             drawing
Name: title, dtype: object
