In [7]:
# import required packages

import pandas as pd
import numpy as np
import nltk
import re
from nltk import corpus
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import wsd
from nltk.corpus import wordnet as wn
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('wordnet2022')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to /root/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!


True

In [2]:
blog_df = pd.read_csv('https://raw.githubusercontent.com/BhavanishDhamnaskar/blog_d/main/Medium%20Blog%20Data.csv')
author_df = pd.read_csv('https://raw.githubusercontent.com/BhavanishDhamnaskar/blog_d/main/Author%20Data.csv')
ratings_df = pd.read_csv('https://raw.githubusercontent.com/BhavanishDhamnaskar/blog_d/main/Blog%20Ratings.csv')

In [3]:
#Content based filtering
blog_df['topic'].value_counts()

ai                      736
blockchain              644
cybersecurity           642
web-development         635
data-analysis           594
cloud-computing         589
security                527
web3                    471
machine-learning        467
nlp                     453
data-science            444
deep-learning           430
android                 426
dev-ops                 384
information-security    374
image-processing        354
flutter                 343
backend                 341
cloud-services          339
Cryptocurrency          331
app-development         322
backend-development     312
Software-Development    309
Name: topic, dtype: int64

In [4]:
blog_df.drop(['author_id','blog_link','blog_img','scrape_time'],axis='columns',inplace=True)

In [5]:
#To remove duplicate blog data
blog_df.drop_duplicates(['blog_title','blog_content'],inplace=True)

In [9]:
#Preprocessing Text Data
lst_stopwords=corpus.stopwords.words('english')
def pre_process_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    text=str(text).lower()
    text=text.strip()
    text = re.sub(r'[^\w\s]', '', text)
    lst_text = text.split()
    if lst_stopwords is not None:
        lst_text=[word for word in lst_text if word not in lst_stopwords]
    if flg_lemm:
        lemmatizer = WordNetLemmatizer()
        lst_text = [lemmatizer.lemmatize(word) for word in lst_text]
    if flg_stemm:
        stemmer = PorterStemmer()
        lst_text = [stemmer.stem(word) for word in lst_text]
    text=" ".join(lst_text)
    return text

In [10]:
blog_df['clean_blog_content'] = blog_df['blog_content'].apply(lambda x: pre_process_text(x,flg_stemm=False,flg_lemm=True,lst_stopwords=lst_stopwords))

In [11]:
#TFIDF
tfidf_vecotorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vecotorizer.fit_transform(blog_df['clean_blog_content'])
print(tfidf_matrix.shape)

(10466, 25167)


In [12]:
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

[[1.         0.         0.         ... 0.02173711 0.         0.        ]
 [0.         1.         0.         ... 0.00452585 0.00905365 0.00985712]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.02173711 0.00452585 0.         ... 1.         0.         0.        ]
 [0.         0.00905365 0.         ... 0.         1.         0.03097127]
 [0.         0.00985712 0.         ... 0.         0.03097127 1.        ]]


In [13]:
# Let us have the blogs rated by user with user id 12
user_rating = ratings_df[ratings_df['userId']==12]

# consider blogs with ratings greater than or equal to 3.5 just for simplification
blogs_to_consider = user_rating[user_rating['ratings']>=3.5]['blog_id']

# Now we need Id's of this blogs in form of a list
high_rated_blogs = blogs_to_consider.values

In [14]:
rated_blogs = blog_df[blog_df['blog_id'].isin(high_rated_blogs)]
rated_blogs

Unnamed: 0,blog_id,blog_title,blog_content,topic,clean_blog_content
198,217,Stream Builder in Flutter,How to use StreamBuilder in Flutter? Flutter i...,flutter,use streambuilder flutter flutter popular open...
1301,1328,April 1st Recommendation on Alignment,"I want to make a basic assumption here, becaus...",ai,want make basic assumption date youre truly ra...
1377,1404,How ServiceNow users can use AI and what AI se...,ServiceNow is a cloud computing company that p...,ai,servicenow cloud computing company provides so...
3375,3402,Realtime Object detection using TensorFlow in ...,Real-time object detection using TensorFlow ca...,ai,realtime object detection using tensorflow ach...
3432,3459,2 ChatGPT (Free) Chrome Extensions so Useful T...,"Save hours on writing emails, googling, learni...",ai,save hour writing email googling learning unle...
8696,8723,Testing and Debugging in Flutter,Testing in Flutter: Testing is crucial for any...,flutter,testing flutter testing crucial app developmen...
8719,8746,Exploring Flutter Stream Builder: A Beginner’s...,"Hello there, little friend! Do you want to lea...",flutter,hello little friend want learn something calle...
8720,8747,Getting Started with Augmented Reality Mobile ...,A Comprehensive Guide to Learning and Building...,flutter,comprehensive guide learning building immersiv...
8722,8749,Mastering Bloc with ‘GetCubit’ in Flutter,"When you’re working with Flutter, you might wa...",flutter,youre working flutter might want manage state ...
8730,8757,Flutter State Management: An In-Depth Explorat...,Learn how to efficiently manage state in your ...,flutter,learn efficiently manage state flutter applica...


In [15]:
def get_similar_blog(high_rated_blogs):
    """
        Args:
            high_rated_blogs : list of blog id's of the blogs rated by the user
        Returns:
            recommended_blogs : list of blog id's of the blogs that are to be recommended
    """

    recommended_blogs = []

    for blog_id in high_rated_blogs:

        # Find out the index value of particular blog
        temp_id = blog_df[blog_df['blog_id'] == blog_id].index.values[0]

        # Find out the index value of all the blogs which have similarity greater than 0.95
        temp_blog_id = blog_df[cosine_sim[temp_id] > 0.95]['blog_id'].index.values

        # Check whether the blog is already recommended or not and also verify that it is not seen by user previously
        for b_id in temp_blog_id:
            if b_id not in recommended_blogs and b_id not in high_rated_blogs:
                recommended_blogs.append(b_id)

    return recommended_blogs

# Generating Recommendation

In [16]:
recommended_blogs=get_similar_blog(high_rated_blogs)

In [17]:
blog_df.iloc[recommended_blogs]

Unnamed: 0,blog_id,blog_title,blog_content,topic,clean_blog_content
3377,3404,Here are the 3 ideas that you can use for Noti...,Leverage Notion using AI. — 3 ideas for Notio...,ai,leverage notion using ai 3 idea notionaiwwwins...
3434,3461,"Reading Herculaneum Scrolls: $250,000 Challeng...",Scientists have announced a contest with a pri...,ai,scientist announced contest prize quarter mill...
1379,1406,Unbabel — The AI-powered Translation Solution ...,"In today’s global economy, language barriers c...",ai,today global economy language barrier major ob...
1303,1330,"ChatGPT tweets, and it’s painful",OpenAI’s ChaptGPT engine has taken the world b...,ai,openais chaptgpt engine taken world storm beco...
8799,8826,10 Widgets Every Flutter Developer Must Master,A Comprehensive Guide to Flutter Widgets for B...,flutter,comprehensive guide flutter widget building am...
8907,8934,Data Persistence in Flutter,Data persistence is an essential aspect of any...,flutter,data persistence essential aspect mobile appli...
8698,8725,The Ultimate Flutter Navigator 2.0 series usin...,"In the first part, you learned about how you c...",flutter,first part learned set auto_route package flut...
8871,8898,Google Pay: A success story of Flutter,Google Pay is a popular digital wallet and onl...,flutter,google pay popular digital wallet online payme...
8836,8863,Flutter vs React Native: Which One is Better?,"When it comes to mobile app development, there...",flutter,come mobile app development two major player m...
200,219,Building beautiful product item widget in Flut...,Product item widgets are a fundamental aspect ...,flutter,product item widget fundamental aspect ecommer...
