In [1]:
import numpy as np
import pandas as pd

In [2]:
data1 = pd.read_csv("data.csv")

In [3]:
data2=data1.dropna(subset=['URL'])
data3 = data2.dropna(subset=['Content'])

In [4]:
from urllib.parse import urlparse
def is_valid_url(URL):
    try:
        result = urlparse(URL)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

valid_url_mask = data3['URL'].apply(is_valid_url)

In [5]:
data4 = data3[valid_url_mask]
data5= data3[valid_url_mask]

In [6]:
def replace_nan_with_string(df, column_name, replacement_string='abc'):
  
    df[column_name] = df[column_name].fillna(replacement_string)
    return df

data6 = replace_nan_with_string(data4, 'Heading')
data6= replace_nan_with_string(data4, 'Content')

In [7]:
from bs4 import BeautifulSoup
import re
def clean_text(text):
  
    text = BeautifulSoup(text, 'html.parser').get_text()
   
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = ' '.join(text.split())

    return text

data6[['Content', 'Heading']] = data6[['Content', 'Heading']].applymap(clean_text)

data6.head(30)

Unnamed: 0,URL,Heading,Content
0,https://www.rbi.org.in/hindi/Home.aspx,abc,
3,https://cms.rbi.org.in,abc,Never respond to Request money options from un...
4,https://www.rbi.org.in/scripts/MANI.aspx,Press Release Public URLs for MANI app Radio J...,Android app is available publicly on httpsplay...
5,https://Rbikehtahai.rbi.org.in,abc,Safe digital banking starts with you Transact ...
6,https://therbimuseum.rbi.org.in,here Accessibility,Reserve a Tour Manage Reservation Cancel Reser...
7,https://fintech.rbi.org.in,Important,India is on the way to becoming Asias top fina...
10,https://www.rbi.org.in/Scripts/BS_PressRelease...,Date Supersession of Board of Directors of Abh...,In exercise of the powers conferred under Sect...
11,https://www.rbi.org.in/Scripts/BS_PressRelease...,Date RBI updates the Alert List of unauthorize...,The Reserve Bank of India RBI has added the fo...
12,https://www.rbi.org.in/Scripts/BS_SpeechesView...,Date Changing Paradigms in the Financial Lands...,Distinguished guests Good evening First of all...
13,https://www.rbi.org.in/Scripts/BS_SpeechesView...,Date Winning in Uncertain Times The Indian Exp...,I am thankful to FICCI and IBA for once again ...


In [8]:
result_df = pd.DataFrame({
    'URL': data6['URL'],
    'RowNumber': data6.index,
    'MergedContent': data6['Heading']
})
result_df.head()

Unnamed: 0,URL,RowNumber,MergedContent
0,https://www.rbi.org.in/hindi/Home.aspx,0,abc
3,https://cms.rbi.org.in,3,abc
4,https://www.rbi.org.in/scripts/MANI.aspx,4,Press Release Public URLs for MANI app Radio J...
5,https://Rbikehtahai.rbi.org.in,5,abc
6,https://therbimuseum.rbi.org.in,6,here Accessibility


In [9]:
result_df['MergedContent'] = result_df['MergedContent'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [10]:
vector_df = pd.DataFrame({
    'RowNumber': result_df.index,
    'Content':result_df['MergedContent']
})
print(vector_df)

     RowNumber                                            Content
0            0                                                abc
3            3                                                abc
4            4  press release public urls for mani app radio j...
5            5                                                abc
6            6                                 here accessibility
..         ...                                                ...
501        501                                       date india75
502        502  survey on foreign liabilities and assets fla o...
503        503      coordinated portfolio investment survey india
504        504  biennial survey on foreign collaboration in in...
505        505  annual survey on computer software information...

[420 rows x 2 columns]


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cv= CountVectorizer(max_features=5000,stop_words="english")

In [13]:
import nltk

In [14]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()

In [15]:
def stem(text):
    y=[]
    for i in text.split():
    
        y.append(ps.stem(i))
    return " ".join(y)
        
        


In [16]:
vector_df['Content']=vector_df['Content'].apply(stem)

In [17]:
raw_documents = vector_df['Content'].tolist()

In [18]:
vectorized_data = cv.fit_transform(raw_documents).toarray()

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



def find_similar_documents(input_text, cv, vectorized_data, result_df):
  
    input_vector = cv.transform([input_text]).toarray()

    similarities = cosine_similarity(input_vector, vectorized_data)
    top_indices = np.argsort(similarities[0])[::-1][:5]
    similar_documents = result_df.iloc[top_indices]

    return similar_documents



In [27]:
input_text = "Special Long-Term Repo Operations (SLTRO) for Small Finance Banks (SFBs)"
similar_documents = find_similar_documents(input_text, cv, vectorized_data, result_df)


print(similar_documents)

                                                                       URL  \
124                     https://www.rbi.org.in/Scripts/FAQView.aspx?Id=134   
344                https://www.rbi.org.in/scripts/FS_FAQs.aspx?Id=134&fn=6   
106  https://www.rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx?prid=49628   
110  https://www.rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx?prid=49583   
103  https://www.rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx?prid=49671   

     RowNumber  \
124        124   
344        344   
106        106   
110        110   
103        103   

                                                                  MergedContent  
124                                   targeted long term repo operations tltros  
344                                   targeted long term repo operations tltros  
106   date reserve bank announces third targeted long term repo operation tltro  
110       date reserve bank announces targeted long term repo operations tltros  
103  date re

In [22]:
pd.set_option('display.max_colwidth', None)


In [23]:
result_df.iloc[276]

URL              https://www.rbi.org.in/scripts/FS_Speeches.aspx?Id=1359&fn=6
RowNumber                                                                 336
MergedContent         date financial sector as an enabler for developed india
Name: 336, dtype: object

In [25]:
row_number_to_access = 276
desired_row = result_df.loc[result_df['RowNumber'] == row_number_to_access]


print(desired_row)

                                                                   URL  \
276  https://rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx?prid=38524   

     RowNumber  \
276        276   

                                                                 MergedContent  
276  date rbi issues 500 banknotes inset letter e in mahatma gandhi new series  
