In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV loaded in the next cell is read
# from a path under this mount point (Colab-only, may prompt for authorization).
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd


# Read the article table (with its keyphrase column) from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/Summer_group_project/updated_file_with_keyphrases.csv'
data = pd.read_csv(csv_path)

# Quick look at the first rows to confirm the file loaded as expected.
preview = data.head()
print(preview)


                                               Title  \
0                      Suresh Gopi: The Saffron star   
1                   Geniben Thakor: The giant killer   
2           Chandrashekhar Azad: The Dalit commander   
3  INDIA bloc leaders to jointly decide on attend...   
4     N. Chandrababu Naidu: Return of the kingmaker    

                                                Link  \
0  https://www.thehindu.com/news/national/suresh-...   
1  https://www.thehindu.com/news/national/geniben...   
2  https://www.thehindu.com/news/national/chandra...   
3  https://www.thehindu.com/news/national/india-b...   
4  https://www.thehindu.com/news/national/n-chand...   

                                         Description  \
0  The actor-turned politician made history in Ke...   
1  The Congress leader denied the BJP a hat-trick...   
2  The leader of the Azad Samaj Party, who won fr...   
3  Sources confirm to The Hindu that by June 8 ev...   
4  The ‘CEO of Andhra Pradesh’ is back in powe

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Build one text field per article by joining its keyphrases and its category.
# Missing values are replaced with '' first: concatenating with NaN yields NaN,
# and TfidfVectorizer refuses NaN documents, so a single empty cell in either
# column would otherwise abort the whole fit.
data['text_features'] = (
    data['Keyphrases'].fillna('') + ' ' + data['Category'].fillna('')
)

# Learn the vocabulary and produce the sparse (n_articles, n_terms) TF-IDF matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['text_features'])

# Sanity check: one row per article, one column per vocabulary term.
print(tfidf_matrix.shape)


(500, 2474)


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Seed Python, NumPy and TensorFlow so that the dummy labels, the weight
# initialisation and dropout masks are the same on every Restart & Run All.
from tensorflow.keras.utils import set_random_seed

SEED = 42
set_random_seed(SEED)

# Convert TF-IDF matrix to dense format (Keras Dense layers expect dense input)
X = tfidf_matrix.toarray()

# Create dummy target variable (in practice, this should be user interaction data).
# Drawn from a dedicated seeded generator so the labels are reproducible;
# `integers(0, 2)` yields values in {0, 1} (upper bound exclusive).
rng = np.random.default_rng(SEED)
y = rng.integers(0, 2, size=X.shape[0])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Define the model: two hidden ReLU layers with dropout, sigmoid output for a
# binary target.
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. NOTE: the held-out split doubles as validation data here, and
# the labels are random, so the reported accuracy carries no real signal.
# The History object is kept in a variable instead of being echoed as the cell output.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x781b003bbc10>

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_articles(article_index, num_recommendations=5):
    """Return the articles most similar to a given article.

    Similarity is the cosine similarity between rows of the dense TF-IDF
    matrix ``X``; the result rows are taken from ``data`` by position.

    Args:
        article_index: Row position of the query article in ``X`` / ``data``.
            Negative positions count from the end.
        num_recommendations: Maximum number of articles to return.

    Returns:
        A DataFrame slice of ``data`` with up to ``num_recommendations`` rows,
        ordered from most to least similar. The query article itself is never
        included.

    Raises:
        IndexError: If ``article_index`` is outside the range of ``X``.
    """
    n_articles = X.shape[0]
    # Normalise negative positions and fail fast on out-of-range ones.
    query_pos = range(n_articles)[article_index]

    # Get the feature vector for the input article
    article_vector = X[query_pos].reshape(1, -1)

    # Compute cosine similarity between the input article and all articles
    similarities = cosine_similarity(article_vector, X)[0]

    # Rank from most to least similar; a stable sort keeps tied articles in row
    # order so the output is deterministic.
    ranked = (-similarities).argsort(kind='stable')

    # Drop the query article by position instead of assuming it sorts last:
    # with duplicate or all-zero feature vectors it is not guaranteed to be the
    # single top hit, and slicing off "the last one" would then return the
    # query itself and discard a genuine neighbour.
    ranked = ranked[ranked != query_pos]

    # Return the recommended articles
    return data.iloc[ranked[:max(num_recommendations, 0)]]

# Demo: recommend the 5 articles most similar to the article at row position 498.
recommended_articles = recommend_articles(498, num_recommendations=5)
columns_to_show = ['Title', 'Description']
print(recommended_articles[columns_to_show])


                                                 Title  \
482  Archives in the atelier | Fashion designers in...   
476  Falguni and Shane Peacock show us around their...   
429  Fashion label Verandah is India’s first brand ...   
420  Kalaripayattu to Raja Ravi Varma... this Kochi...   
407  Breitling’s CEO on the brand’s new watch techn...   

                                           Description  
482  Textile has been doing it for a while, but now...  
476  Falguni and Shane Peacock’s latest flagship st...  
429  The brand has joined luxury brands like Louis ...  
420  The Kochi-based clothing brand taps into Keral...  
407  What makes the 140-year-old watch brand resona...  


In [13]:
def search_articles_by_query(query, num_recommendations=5):
    """Return articles whose keyphrases contain the query text.

    The match is a case-insensitive *literal* substring test against the
    ``Keyphrases`` column of ``data``; rows with missing keyphrases never match.

    Args:
        query: Text to look for inside each article's keyphrases.
        num_recommendations: Maximum number of matching rows to return.

    Returns:
        A DataFrame with the first ``num_recommendations`` matching rows of
        ``data``, in their original row order (fewer if there are fewer matches).
    """
    # regex=False: treat the user's text literally. By default str.contains
    # interprets the pattern as a regular expression, so queries such as "c++"
    # or "(" would raise or match the wrong rows.
    # case=False already makes the comparison case-insensitive, so the query
    # does not need to be lower-cased beforehand.
    is_match = data['Keyphrases'].str.contains(query, case=False, na=False, regex=False)

    # head(n) already returns every row when there are n or fewer matches.
    return data[is_match].head(num_recommendations)

# Example: Search for articles based on a user query
# Demo: look up articles whose keyphrases mention a sample search term.
user_query = "bengaluru"
recommended_articles = search_articles_by_query(user_query, num_recommendations=5)
columns_to_show = ['Title', 'Description', 'Keyphrases']
print(recommended_articles[columns_to_show])

                                                 Title  \
18   It is people’s aspiration to see me in the Uni...   
326     TNCA plans to build a state-of-the-art complex   
364  Wolvaardt to lead SA in Test, ODIs during Indi...   
431  Textile waste turns into felt courtesy the Has...   
498  Enter Biologique Recherche’s first Ambassade i...   

                                           Description  \
18   The MP-elect from Bengaluru Rural says his vic...   
326  The idea is to build four grounds in the same ...   
364  The tour will start with three ODIs in Bengalu...   
431  How two organisations have come together to co...   
498  The luxury skincare brand has opened its first...   

                                            Keyphrases  
18   mp-elect, bengaluru, bengaluru rural, bengalur...  
326  like ksca, ksca, alur, alur grounds, alur grou...  
364  tour, bengaluru, chennai, one-off, one-off tes...  
431  convert bengaluru, bengaluru, textile, textile...  
498  luxury skinc