<a href="https://colab.research.google.com/github/ClarenceKaranja/FUTURE-TECH-IMPACT-INDEX/blob/main/RECOMMENDER_SYSTEM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard-library modules never need installing, so they stay outside the guard.
import importlib
import re
import subprocess
import sys

# Third-party imports are attempted together so that ANY missing package
# (nltk included) reaches the installer below instead of raising first.
try:
    import nltk
    import numpy
    import pandas as pd
    from nltk.stem.snowball import SnowballStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
except ImportError:
    print('You are missing some packages! '
          'We will try installing them before continuing!')
    # Install with the interpreter that runs this kernel so the packages land
    # in the right environment. The distribution providing the `sklearn`
    # module is published as "scikit-learn"; the "sklearn" name on PyPI is a
    # deprecated placeholder and must not be used here.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install',
                           'numpy', 'pandas', 'scikit-learn', 'nltk'])
    # Make the freshly installed packages visible to the running interpreter.
    importlib.invalidate_caches()
    import nltk
    import numpy
    import pandas as pd
    from nltk.stem.snowball import SnowballStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer data needed by nltk.word_tokenize. Older NLTK releases load
# 'punkt'; newer releases look up 'punkt_tab' instead, so ensure both exist.
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        print(f'Downloading {resource}...')
        nltk.download(resource)

print('Done!')


In [None]:
# English Snowball stemmer, shared by every clean_tokenize() call
stemmer = SnowballStemmer("english")

# Read the patents CSV, keep only the 'ABSTRACTS' column and discard rows
# where it is missing (the surviving rows keep their original index labels)
PATH_PATENTS = "/content/drive/MyDrive/content_recommender_data/ABSTRACTS.csv"
patents = pd.read_csv(PATH_PATENTS).loc[:, ['ABSTRACTS']].dropna()

# Plain Python list of abstract texts, in the same row order as `patents`
abstracts = patents['ABSTRACTS'].to_list()

# Clean and tokenize the abstracts
def clean_tokenize(document):
    """Normalize a text into a single string of space-separated stemmed tokens.

    Every character that is not a word character, underscore, whitespace or
    hyphen is replaced by a space; the result is tokenized with
    ``nltk.word_tokenize`` and each token is stemmed with the module-level
    ``stemmer``.

    Args:
        document: The text to clean (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Raw string: \w and \s are regex character classes, not Python string
    # escapes. Without the r-prefix they are invalid escape sequences, which
    # newer Python versions report as a SyntaxWarning.
    document = re.sub(r'[^\w_\s-]', ' ', document)  # remove punctuation marks and other symbols
    tokens = nltk.word_tokenize(document)  # split the text into word tokens
    cleaned_abstract = ' '.join(stemmer.stem(item) for item in tokens)  # stem each token
    return cleaned_abstract

# One cleaned, stemmed string per abstract, in the same order as `abstracts`
cleaned_abstracts = [clean_tokenize(abstract) for abstract in abstracts]

In [5]:

# Ask the user for the search keywords as one comma-separated line
user_keywords = input(
    "Enter keywords separated by commas: "
)


Enter keywords separated by commas: doctor


In [6]:

# Break the raw input on commas and trim the whitespace around each keyword
user_keywords_list = list(map(str.strip, user_keywords.split(',')))

# Re-join the keywords into one query string and run it through the same
# cleaning/stemming step that was applied to the abstracts
cleaned_user_keywords = clean_tokenize(' '.join(user_keywords_list))


In [7]:

# Build the TF-IDF representation. Note that `tfidf_matrix` is the vectorizer
# object itself; the actual matrices are the two results below.
tfidf_matrix = TfidfVectorizer(
    min_df=2,
    stop_words='english',
)

# Learn the vocabulary from the cleaned abstracts and vectorize them ...
abstract_tfidf_matrix = tfidf_matrix.fit_transform(cleaned_abstracts)

# ... then map the cleaned user query onto that same vocabulary
user_keywords_tfidf_vector = tfidf_matrix.transform(
    [cleaned_user_keywords]
)


In [8]:

# Cosine similarity of every abstract vector against the user query vector
patents_similarity_score = cosine_similarity(
    abstract_tfidf_matrix,
    user_keywords_tfidf_vector,
)

# Row positions of the abstracts ordered from highest to lowest similarity
recommended_patents_id = patents_similarity_score.ravel().argsort()[::-1]


In [9]:

# Number of top-ranked patents to show (the N in "Top N Recommended Patents");
# the display cell uses it to slice recommended_patents_id and in its heading
num_recommendations = 5


In [10]:

# Display the top N recommended patents with full abstract content.
#
# recommended_patents_id contains row POSITIONS in the TF-IDF matrix, which was
# built from `patents` after dropna(). That frame still carries its original
# index labels, so positions and labels differ as soon as any row was dropped.
# Selecting by position with .iloc picks the right rows, and it also keeps them
# in best-match-first order, which a boolean isin() mask would throw away.
top_positions = recommended_patents_id[:num_recommendations]
recommended_patents = patents['ABSTRACTS'].iloc[top_positions]

print(f'\nTop {num_recommendations} Recommended Patents:')
# `idx` is the patent's index label in `patents`, printed as its ID
for idx, abstract in zip(recommended_patents.index, recommended_patents):
    print(f'\nPatent ID: {idx}\nAbstract: {abstract}\n{"-"*50}')



Top 5 Recommended Patents:

Patent ID: 41878
Abstract: Doctor blade (8) for an apparatus for additive manufacturing, adapted to move transversally on a platform (6b, 6b&#39;) housing a powder bed (6), in a direction parallel to the plane in which said powder bed (6) lies, wherein said doctor blade (8) is provided with at least one illuminator (52) arranged in the lower part of the doctor blade (8) itself, said at least one illuminator (52) being an emitter with an emission spectrum centered in the spectral region from 300 to 1000 nm.
--------------------------------------------------

Patent ID: 59503
Abstract: A doctor blade assembly for an electrophotographic printer comprises a first bracket having a first width dimension and a first height dimension, a second bracket having a second width dimension and a second height dimension, a doctor blade disposed between the front bracket and the rear bracket, the doctor blade having a cantilever length, the cantilever length varying from a 