In [1]:
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def main(csv_path='C:/Users/ARCHANA/Downloads/Article Recommondation/shared_articles.csv'):
    """Interactively recommend articles similar to a title typed by the user.

    Reads the shared-articles CSV, keeps the English rows whose event type is
    'CONTENT SHARED', builds a TF-IDF matrix from the article text, prints the
    available titles, reads one title from standard input and prints the
    result of ``get_recommendations`` for it.

    Args:
        csv_path: Location of the CSV file. Defaults to the path the script
            was originally written against, so ``main()`` keeps working.
    """
    raw_df = pd.read_csv(csv_path)

    # Keep only articles that are currently shared and written in English.
    keep = (raw_df['eventType'] == 'CONTENT SHARED') & (raw_df['lang'] == 'en')
    articles_df = raw_df[keep].reindex(
        columns=['contentId', 'authorPersonId', 'url', 'title', 'text'])

    # A missing text would make both create_soup and the vectorizer raise;
    # an empty document simply ends up with an all-zero TF-IDF row.
    articles_df['text'] = articles_df['text'].fillna('')

    # NOTE: the 'soup' column is not consumed by the vectorizer below, which
    # is fitted on the raw 'text' column.
    articles_df['soup'] = articles_df.apply(create_soup, axis=1)

    # TF-IDF vectorizer that ignores English stop words such as 'the', 'a'.
    tfidf = TfidfVectorizer(stop_words='english')

    # Fit the vocabulary and transform every article into a TF-IDF row.
    tfidf_matrix = tfidf.fit_transform(articles_df['text'])

    # Pairwise cosine similarity between articles: the smaller the angle
    # between two TF-IDF vectors, the higher the similarity.
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix, dense_output=True)

    # After reset_index the row positions of `metadata` line up with the
    # rows/columns of `cosine_sim`.
    metadata = articles_df.reset_index()

    # Reverse map: article title -> row position. Series.drop_duplicates()
    # would compare the (already unique) positions and drop nothing, so the
    # duplicated *titles* are removed through the index instead; the first
    # article carrying a given title wins.
    indices = pd.Series(metadata.index, index=metadata['title'])
    indices = indices[~indices.index.duplicated(keep='first')]

    print(articles_df['title'])

    get_a_title = input()

    try:
        recommendations = get_recommendations(get_a_title, indices, cosine_sim, metadata)
    except KeyError:
        # The typed title is not one of the known article titles.
        print('No article titled {!r} was found.'.format(get_a_title))
        return

    print(recommendations)

# Builds the "soup" (one plain string of text) for a single article row.
def create_soup(x):
    """Return the text of row ``x`` as a single string.

    Args:
        x: Mapping-like row (e.g. a DataFrame row) with a ``'text'`` entry.

    Returns:
        The text unchanged when it is already a string, the items joined by
        single spaces when it is a sequence of tokens, and ``''`` when the
        text is missing (``None`` or NaN).
    """
    text = x['text']

    # Joining a plain string would iterate over its characters and insert a
    # space between every one of them, so strings are returned as they are.
    if isinstance(text, str):
        return text

    # Missing value: None, or NaN (the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    return ' '.join(str(token) for token in text)

# Function that takes in an article title as input and outputs the most similar articles
def get_recommendations(title, indices, cosine_sim, data, top_n=10):
    """Return the titles and URLs of the articles most similar to ``title``.

    Args:
        title: Title of the article to find neighbours for.
        indices: Mapping (pandas Series) from title to row position.
        cosine_sim: Square similarity matrix; row ``i`` holds the similarity
            of article ``i`` to every article.
        data: DataFrame with ``'title'`` and ``'url'`` columns whose row
            positions match ``cosine_sim``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A tuple ``(titles, urls)`` of two Series, ordered from most to least
        similar. The queried article itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    # Get the position of the article that matches the title. When several
    # articles share the title the lookup yields a Series; use the first one.
    match = indices[title]
    idx = int(match.iloc[0]) if hasattr(match, 'iloc') else int(match)

    # Pair every *other* article with its similarity to the queried one.
    # The queried article is excluded explicitly: relying on it being first
    # after sorting fails when another article ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; sorted() is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar articles.
    article_indices = [position for position, _ in sim_scores[:top_n]]

    return (data['title'].iloc[article_indices], data['url'].iloc[article_indices])

# Run the interactive recommender only when executed as a script.
if "__main__" == __name__:
    main()


1       Ethereum, a Virtual Currency, Enables Transact...
2       Bitcoin Future: When GBPcoin of Branson Wins O...
3                            Google Data Center 360° Tour
4       IBM Wants to "Evolve the Internet" With Blockc...
5       IEEE to Talk Blockchain at Cloud Computing Oxf...
                              ...                        
3116     Swift Top 10 Articles For The Past Year (v.2017)
3118    Amazon takes on Skype and GoToMeeting with its...
3119                          Code.org 2016 Annual Report
3120    JPMorgan Software Does in Seconds What Took La...
3121                 The 2017 Acquia Partners of the Year
Name: title, Length: 2211, dtype: object
Google Data Center 360° Tour
(121     Google shares data center security and design ...
714     YouTube's New Messenger Means You'll Never Hav...
538                          This year's Founders' Letter
207     Google Cloud Platform: The smart person's guid...
439     Top 5 GCP NEXT breakout sessions on YouTube (s...
6