# Load Libraries and data:

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


# Read the preprocessed news articles, discard rows that have no cleaned text,
# and renumber the remaining rows 0..n-1 so that index labels coincide with
# positional row numbers (later cells look rows up by position).
df = (
    pd.read_csv("../data/news_data_preprocessed_final.csv")
    .dropna(subset=['clean_text'])
    .reset_index(drop=True)
)

# Creating the Search Index (vectorization)

In [10]:
# 2. Build the search index: one TF-IDF vector per article.
# The vocabulary is capped at 5000 terms and English stop words are dropped
# to limit noise.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

print("‚è≥ Generating the Search Index (TF-IDF Matrix)...")
# Learn the vocabulary from the cleaned text and vectorize it in one call
# (rows = articles, columns = terms).
tfidf_matrix = tfidf.fit_transform(df.loc[:, 'clean_text'])

print(f"‚úÖ Index created. Matrix Shape: {tfidf_matrix.shape}")

‚è≥ Generating the Search Index (TF-IDF Matrix)...
‚úÖ Index created. Matrix Shape: (10456, 5000)


In [20]:
def get_recommendations(query, tfidf_vectorizer, tfidf_matrix, dataset, top_n=3):
    """
    Content-based recommendation: rank articles by cosine similarity to a query.

    Args:
        query (str): The user's input text (e.g., "I am interested in space exploration").
        tfidf_vectorizer: Vectorizer whose vocabulary is already fitted; only its
            ``transform`` method is called here.
        tfidf_matrix: The article vectors, one row per article. Row ``i`` is
            assumed to describe ``dataset.iloc[i]`` (positional alignment is
            not verified by this function).
        dataset: pandas DataFrame containing at least the ``title`` and
            ``category_level_1`` columns.
        top_n (int): Maximum number of recommendations to return. Values
            less than or equal to zero yield an empty result.

    Returns:
        DataFrame: Up to ``top_n`` rows with the columns ``title``,
        ``category_level_1`` and ``similarity_score``, ordered from the
        highest score to the lowest. The index labels of ``dataset`` are kept.
        No minimum score is applied, so rows with a score of 0 can appear.
    """

    # 1. Transform the user query into a vector.
    # .transform() (not .fit_transform()) keeps the vocabulary that was fixed
    # when the index was built.
    query_vec = tfidf_vectorizer.transform([query])

    # 2. Compare the query vector against ALL article vectors, then flatten
    # the result once into a 1-D array holding one score per article.
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # 3. Pick the positions of the best-scoring articles.
    # argsort() orders positions from the smallest score to the largest;
    # [::-1] flips that so the best match comes first; [:limit] keeps the top.
    # Slicing from the front (rather than [-top_n:]) matters: with top_n == 0
    # the slice [-0:] would select *every* article instead of none.
    limit = max(top_n, 0)
    top_indices = scores.argsort()[::-1][:limit]

    # 4. Retrieve the matching rows and attach their scores.
    results = dataset.iloc[top_indices][['title', 'category_level_1']].copy()
    results['similarity_score'] = scores[top_indices]

    return results

In [22]:
# --- TEST THE RECOMMENDER ---

# Example 1: free-text query mentioning education, science and technology
user_query = "I love education hating science and technology, what i should read?"
print(f"üîç User Search: '{user_query}'\n")

# Ask for the three closest articles in the TF-IDF index
recommendations = get_recommendations(
    query=user_query,
    tfidf_vectorizer=tfidf,
    tfidf_matrix=tfidf_matrix,
    dataset=df,
    top_n=3,
)

# Show the heading, then render the result table
print("üöÄ Top Recommended Articles:")
display(recommendations)

üîç User Search: 'I love education hating science and technology, what i should read?'

üöÄ Top Recommended Articles:


Unnamed: 0,title,category_level_1,similarity_score
7387,Why Science? Employers across all sectors valu...,education,0.325424
7647,EdTechX: The role of AI and deep learning in t...,education,0.2794
192,Today’s Schools Are Yesterday’s Streetcars: Ho...,education,0.275621
