In [None]:
import pandas as pd                # Pandas is essential for dealing with tabular data structures  
import numpy as np                 # Importing numpy for mathematical analysis 
np.random.seed(30)                 # Generated Pseudo Random Number 
from scipy import stats            # Used to deal with statistics and probability
import datetime as dt              # Importing datetime class to create dates 
from pymongo import MongoClient    # Used to interact with the Mongo Database i.e, a NO-SQL based databased for storing non-relational data

In [None]:
# Create a MongoClient for the MongoDB server at localhost:27017
# PyMongo reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html
client = MongoClient("mongodb://localhost:27017/")      

# Handle to the "News_Recommender" database on that server
abc = client["News_Recommender"]  

In [None]:
# Handle to the "articles" collection of the News_Recommender database
articles = abc["articles"] 

# Fetch every document of the collection (find() is called without a filter),
# materialise the results as a list and load that list into a pandas DataFrame
# PyMongo reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html
corpus_df = pd.DataFrame(list(articles.find())) 

In [None]:
# Show the article row(s) of corpus_df whose "Datetime" value equals the given timestamp string
corpus_df.loc[corpus_df["Datetime"].eq("2021-01-28 07:35:55")]

Unnamed: 0,_id,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link,Mean_Time,article_idx
0,606b292e3c6b2f2a08aad299,2021-01-28 07:35:55,Sports,Badminton,Patient pays… almost: Srikanth tries to stay i...,Kidambi Srikanth gritted it out and though he ...,In Test cricket’s season of spectacular stubbo...,Shivani naik,https://indianexpress.com/article/sports/badmi...,65,Sp_0


In [None]:
# Rename the "News" category to "Miscellaneous" in the Category column of corpus_df
# Reference - https://pandas.pydata.org/docs/reference/api/pandas.Series.replace.html
corpus_df['Category'] = corpus_df['Category'].replace({'News': 'Miscellaneous'})

def modify(corpus_df, max_categories=10, other_label='Miscellaneous'):
    """Add reading-time and id columns to the corpus and cap the number of categories.

    The frame is modified IN PLACE and the very same object is returned, so
    code that keeps using the variable that was passed in sees the new columns
    and the merged categories too.

    Steps, in this order:
      1. ``Mean_Time`` becomes a quarter of the character length of each
         ``Summary``, as a whole number (integer division). A missing summary
         counts as length 0.
      2. ``article_idx`` becomes ``"<first two characters of Category>_<row label>"``,
         built from the category as it is *before* the merge in step 3.
      3. If there are more than ``max_categories`` distinct categories, the
         ``max_categories`` most frequent ones are kept and every other
         category is renamed to ``other_label``. Ties in frequency are broken
         alphabetically, so the result is deterministic. If ``other_label`` is
         not itself among the kept categories it appears as one extra label.

    Parameters
    ----------
    corpus_df : pandas.DataFrame
        Must contain the text columns ``Summary`` and ``Category``.
    max_categories : int, default 10
        Number of most frequent categories that keep their own name.
    other_label : str, default 'Miscellaneous'
        Name given to all remaining categories.

    Returns
    -------
    pandas.DataFrame
        The same ``corpus_df`` object that was passed in.
    """
    # Whole-column assignment instead of the former per-row chained indexing
    # (corpus_df['Mean_Time'][i] = ...), which raises SettingWithCopyWarning
    # and is not guaranteed to write into the frame.
    summary_lengths = corpus_df['Summary'].fillna('').str.len()
    corpus_df['Mean_Time'] = (summary_lengths // 4).astype('int64')

    # Built before the categories are merged so the prefix reflects the
    # article's original category (e.g. "Tr_13283" for a merged article).
    corpus_df['article_idx'] = [
        str(category[:2]) + '_' + str(row_label)
        for category, row_label in zip(corpus_df['Category'], corpus_df.index)
    ]

    category_counts = corpus_df['Category'].value_counts()
    if len(category_counts) > max_categories:
        # Rank by frequency (descending), then by name. Selecting the kept
        # categories by rank - rather than matching on equal counts - means a
        # frequent category is never merged just because a rare one happens
        # to have the same number of articles.
        ranked = sorted(category_counts.items(), key=lambda item: (-item[1], item[0]))
        kept = [name for name, _ in ranked[:max_categories]]
        is_kept = corpus_df['Category'].isin(kept)
        corpus_df['Category'] = corpus_df['Category'].where(is_kept, other_label)

    return corpus_df

In [None]:
# Apply modify() (defined in the cell above) to the corpus: it fills the Mean_Time and
# article_idx columns and, when there are more than 10 distinct categories, renames the
# least frequent ones to 'Miscellaneous'.
# modify() assigns into the frame it receives and returns that same object, so after this
# call reform_corpus_df and corpus_df refer to the same DataFrame.
reform_corpus_df = modify(corpus_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df['Mean_Time'][i]= float(len(corpus_df['Summary'][i])/4)


In [None]:
# Display the frame returned by modify(). Even where the set of columns looks the same as
# before, Mean_Time and article_idx were recomputed by modify() and Category may have been merged.
reform_corpus_df

Unnamed: 0,_id,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link,Mean_Time,article_idx
0,606b292e3c6b2f2a08aad299,2021-01-28 07:35:55,Sports,Badminton,Patient pays… almost: Srikanth tries to stay i...,Kidambi Srikanth gritted it out and though he ...,In Test cricket’s season of spectacular stubbo...,Shivani naik,https://indianexpress.com/article/sports/badmi...,65,Sp_0
1,606b292e3c6b2f2a08aad29a,2021-01-27 16:51:08,Sports,Badminton,BWF World Tour Finals: Fighting PV Sindhu lose...,This was PV Sindhu's 16th defeat to Tai Tzu Yi...,World champion shuttler P V Sindhu went down f...,Pti,https://indianexpress.com/article/sports/badmi...,41,Sp_1
2,606b292e3c6b2f2a08aad29b,2021-01-27 08:30:22,Sports,Badminton,"World Tour Finals Preview: PV Sindhu, recharge...",With the Indian having played more matches tha...,Carolina Marin (50 total) played 24 tournament...,Shivani naik,https://indianexpress.com/article/sports/badmi...,28,Sp_2
3,606b292e3c6b2f2a08aad29c,2021-01-24 19:03:06,Sports,Badminton,Satwiksairaj’s offence gets neutralised by sav...,Satwiksairaj Rankireddy uses big smash to kill...,One would have to be blind to not figure that ...,Shivani naik,https://indianexpress.com/article/sports/badmi...,49,Sp_3
4,606b292e3c6b2f2a08aad29d,2021-01-23 19:58:44,Sports,Badminton,Dream run of Indian doubles pairs end with sem...,"Up against the world number three Thai pair, S...",The Indian mixed doubles pair of Satwiksairaj ...,Pti,https://indianexpress.com/article/sports/badmi...,52,Sp_4
...,...,...,...,...,...,...,...,...,...,...,...
22748,606b29323c6b2f2a08ab2b75,2021-02-06 00:17:00,Entertainment,television,"Bigg Boss 14 February 5, 2021, Written Update:...",Bigg Boss 14: Devoleena lashed out at Arshi fo...,"On the 125th day in the Bigg Boss house, Devol...",Aakanksha Raghuvanshi,https://www.ndtv.com/entertainment/bigg-boss-1...,42,En_22748
22749,606b29323c6b2f2a08ab2b76,2021-02-03 12:23:00,Entertainment,television,A Tour Of Shaheer Sheikh And Ruchikaa Kapoor's...,Pictures from Shaheer and Ruchikaa's swanky ap...,"TV actors Shaheer Sheikh and Ruchikaa Kapoor, ...",Aakanksha Raghuvanshi,https://www.ndtv.com/entertainment/a-tour-of-s...,21,En_22749
22750,606b29323c6b2f2a08ab2b77,2021-02-01 23:56:00,Entertainment,television,"Bigg Boss 14: Nikki Tamboli, Rahul Vaidya's Fi...","After a war of words, Nikki Tamboli agreed to ...",Monday's episode of Bigg Boss 14 was fulll of ...,Nilanjana Basu,https://www.ndtv.com/entertainment/bigg-boss-1...,18,En_22750
22751,606b29323c6b2f2a08ab2b78,2021-01-29 14:28:00,Entertainment,television,Identify The Comedy Star In This Pic From 28 Y...,The young Kapil Sharma in the photo is barely ...,"Every now and then, actor-comedian Kapil Sharm...",Nilanjana Basu,https://www.ndtv.com/entertainment/heres-what-...,31,En_22751


In [None]:
# List the distinct categories present in the modified corpus
# Series.unique() returns the values in order of first appearance, not sorted
# https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html
reform_corpus_df["Category"].unique()

array(['Sports', 'Business', 'Technology', 'Entertainment',
       'Miscellaneous', 'Society', 'India', 'World', 'Lifestyle',
       'Education'], dtype=object)

In [None]:
# Lookup table: cluster index (1-10) -> category name, covering the ten categories
# returned by the unique() call above
cluster_idx_dict = dict(enumerate(
    ["Entertainment", "Sports", "Technology", "Business", "World",
     "India", "Society", "Education", "Lifestyle", "Miscellaneous"],
    start=1,
))

In [None]:
# Build the clustered corpus: one block of rows per category, in the order given by
# cluster_idx_dict. Each block is sorted ascending by "Datetime" and renumbered from 0.
# Rows whose Category is not a value of cluster_idx_dict are left out.
frames = []

for clu in cluster_idx_dict.values():
    print(clu)

    # Rows of this category; ignore_index=True restarts the row labels at 0 so they
    # can serve as the per-cluster article index (art_idx)
    frames.append(corpus_df[corpus_df["Category"] == clu].sort_values(by = "Datetime", ignore_index = True))

# Stack the blocks under a two-level row index (clu_idx, art_idx). The outer keys are
# taken from the dictionary itself so they cannot drift from the loop above.
reform_corpus_df = pd.concat(frames, keys = list(cluster_idx_dict), names = ["clu_idx", "art_idx"])

Entertainment
Sports
Technology
Business
World
India
Society
Education
Lifestyle
Miscellaneous


In [None]:
# Display the clustered corpus, indexed by (clu_idx, art_idx)
reform_corpus_df

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link,Mean_Time,article_idx
clu_idx,art_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,606b292f3c6b2f2a08aae1f7,2015-09-04 23:31:00,Entertainment,gogglebox-uk,The girl who makes you giggle on Gogglebox: He...,Gogglebox returns to our screens this Friday a...,"Across the nation, sofas are being plumped, ke...",By Jan Moir for the Daily Mail,https://www.dailymail.co.uk/debate/article-322...,24,En_3934
1,1,606b292f3c6b2f2a08aae1f6,2016-08-02 00:32:00,Entertainment,gogglebox-uk,Red-faced Gogglebox stars apologise for ‘light...,"Umar, 38, Raza, 31, and Baasit Siddiqui, 33, l...",The British Asian stars of Gogglebox have apol...,"By Abe Hawken and Martin Robinson, Uk Chief Re...",https://www.dailymail.co.uk/news/article-37190...,50,En_3933
1,2,606b292f3c6b2f2a08aae1f5,2016-08-03 08:29:00,Entertainment,gogglebox-uk,EXCLUSIVE: 'They are very silly boys!' Goggleb...,The mother of three Gogglebox stars who joked ...,Silly: The mother of the British Asian stars o...,By Tracey Kandohla For Mailonline,https://www.dailymail.co.uk/news/article-37200...,46,En_3932
1,3,606b292f3c6b2f2a08aae1f4,2016-08-03 23:18:00,Entertainment,gogglebox-uk,Jeremy Corbyn looks like he lives in a squat a...,If ever there was a programme that illustrated...,If ever there was a programme that illustrated...,By Jim Shelley for MailOnline,https://www.dailymail.co.uk/tvshowbiz/article-...,46,En_3931
1,4,606b29323c6b2f2a08ab2b4b,2017-05-05 17:04:00,Entertainment,hollywood,How Guardians Of The Galaxy Vol 2 Almost Lost ...,In Guardians of the Galaxy Vol 2 opening credi...,"PromotedListen to the latest songs, only on Ji...","Michael Cavna, The Washington Post",https://www.ndtv.com/entertainment/how-guardia...,51,En_22706
...,...,...,...,...,...,...,...,...,...,...,...,...
10,1724,606b29303c6b2f2a08ab067c,2021-03-16 17:58:00,Miscellaneous,,Toddler falls from moving car on busy road in ...,In a video that has gone viral on social media...,It was nothing short of a shocking moment for ...,Raya ghosh,https://www.indiatoday.in/trending-news/story/...,30,Tr_13283
10,1725,606b29303c6b2f2a08aafdf5,2021-03-16 21:13:00,Miscellaneous,,Fact Check: Viral claim saying all BJP MLAs in...,A viral post on social media claims that Punja...,"The BJP, which has been facing criticism over ...",Dheeshma puzhakkal,https://www.indiatoday.in/fact-check/story/fac...,43,Fa_11100
10,1726,606b29303c6b2f2a08aafdf4,2021-03-16 22:45:00,Miscellaneous,,Fact Check: This newspaper clipping on hike in...,"At a time, the Centre is facing flak over high...","At a time, the Centre is facing flak over high...",Chayan kundu,https://www.indiatoday.in/fact-check/story/fac...,39,Fa_11099
10,1727,606b29303c6b2f2a08ab067b,2021-03-17 09:16:00,Miscellaneous,,Viral video of Bobby Deol's dance moves proves...,A fan account of Bobby Deol shared a compilati...,Internet never fails when it comes to serving ...,Krishna priya pallavi,https://www.indiatoday.in/trending-news/story/...,36,Tr_13282


# Bot-1 (New-User Bot)

In [None]:
# Article age is measured in seconds (difference of POSIX timestamps) and divided by
# time_resol_set, so with 3600 the age weight below is -0.5 per 3600 seconds of age.
time_resol_set = 3600
dt_wt = -0.5/time_resol_set
# Weight applied to the rating total of each article
rat_wt = 1.2
def bot1baseScoring(dt_corpus_df, user_rating_df):
    """Pick the best-scoring article of every cluster, best cluster first.

    Score of an article = ``dt_wt * age_in_seconds + rat_wt * rating_total``,
    where ``rating_total`` is the column sum of ``user_rating_df`` for that
    article and the age is measured against a fixed reference time.

    Because the same reference time enters every article's score, changing it
    shifts all scores by the same amount; it does not change which article
    wins a cluster or the order of the result (floating-point rounding aside).

    Parameters
    ----------
    dt_corpus_df : pandas.DataFrame or pandas.Series
        Either a frame with a ``Datetime`` column or the datetime Series
        itself. Its index must have a level named ``clu_idx``.
    user_rating_df : pandas.DataFrame
        One column per article; the column labels must include every label of
        ``dt_corpus_df.index``. Rows are summed per column.

    Returns
    -------
    pandas.Index
        One index label per cluster (the label of its highest-scoring
        article), ordered by descending score.
    """
    # NOTE(review): despite the "avg" in the name this is the SUM of the ratings
    # over all rows of user_rating_df - confirm that a mean is not intended.
    avg_art_rat = user_rating_df[dt_corpus_df.index].sum(axis = 0)

    time_now = dt.datetime(2021, 3, 17).timestamp() # Temporarily
    # time_now = dt.datetime.now().timestamp()        Actually (Permanently)

    # isinstance (rather than an exact type comparison) so that Series
    # subclasses are also treated as "the datetime column itself".
    if isinstance(dt_corpus_df, pd.Series):
        published = dt_corpus_df
    else:
        published = dt_corpus_df["Datetime"]

    # Age of every article in seconds; keeps the index of dt_corpus_df
    age_seconds = time_now - pd.to_datetime(published).apply(dt.datetime.timestamp)
    score_df = dt_wt*age_seconds + rat_wt*avg_art_rat

    # Best article (index label and score) of each cluster
    max_idx_list = []
    max_score_list = []
    for _, clu_scores in score_df.groupby(level = "clu_idx"):
        max_idx = clu_scores.idxmax()
        max_idx_list.append(max_idx)
        max_score_list.append(clu_scores[max_idx])

    new_user_base_recomm = pd.Series(max_score_list, index = max_idx_list, name = "Max_Scores", dtype = "float64").sort_values(ascending=False)
    return new_user_base_recomm.index

## Bot-1 Trial Run

In [None]:
# Synthetic ratings for the trial run: 50 rows (one per simulated user) and one column per
# article of reform_corpus_df (columns carry the same (clu_idx, art_idx) labels as its index).
# Every cell is an independent np.random.randint(-1, 2) draw, i.e. one of -1, 0 or 1
# (the upper bound is exclusive), taken from NumPy's global random state.
trial_user_rating_df = pd.DataFrame([[np.random.randint(-1, 2) for i in range(len(reform_corpus_df))] for user in range(50)], columns = reform_corpus_df.index)

In [None]:
# Display the synthetic rating matrix (rows: simulated users, columns: articles)
trial_user_rating_df

clu_idx,1,1,1,1,1,1,1,1,1,1,...,10,10,10,10,10,10,10,10,10,10
art_idx,0,1,2,3,4,5,6,7,8,9,...,1719,1720,1721,1722,1723,1724,1725,1726,1727,1728
0,0,0,0,0,-1,-1,1,0,0,1,...,-1,1,-1,-1,0,1,1,1,0,0
1,-1,-1,0,1,0,0,1,1,1,-1,...,-1,1,-1,-1,0,0,0,-1,0,0
2,0,1,-1,-1,1,0,-1,-1,0,-1,...,1,1,0,1,1,0,1,1,1,0
3,-1,0,1,1,0,-1,1,0,1,0,...,0,0,-1,0,1,0,-1,1,1,0
4,0,-1,0,-1,1,-1,-1,-1,-1,1,...,-1,0,0,1,1,0,1,-1,1,0
5,-1,0,1,-1,-1,0,-1,-1,0,-1,...,1,0,-1,1,-1,0,-1,-1,-1,0
6,1,1,-1,0,0,1,-1,-1,-1,0,...,-1,0,1,1,0,0,-1,0,-1,1
7,1,0,-1,1,1,0,-1,0,0,1,...,0,1,1,0,0,0,1,1,1,0
8,0,-1,-1,1,-1,-1,0,0,1,0,...,0,1,1,-1,-1,0,0,0,1,-1
9,0,1,-1,0,0,0,-1,0,-1,-1,...,-1,0,-1,1,1,0,1,-1,1,1


In [None]:
# Run Bot-1 on the trial ratings and look up the returned (clu_idx, art_idx) labels in the
# corpus: one recommended article per cluster, ordered by descending score
reform_corpus_df.loc[bot1baseScoring(reform_corpus_df, trial_user_rating_df)]


Unnamed: 0_level_0,Unnamed: 1_level_0,_id,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link,Mean_Time,article_idx
clu_idx,art_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,4552,606b29323c6b2f2a08ab2a88,2021-03-19 17:45:00,Entertainment,bollywood,"Popular Opinion: Who Cares About ""Real Or Fake...",We sort of agree with the Internet,A thought that caught Hrithik Roshan's attenti...,Pranita Chaubey,https://www.ndtv.com/entertainment/popular-opi...,8,En_22511
4,2787,606b29323c6b2f2a08ab2a21,2021-03-19 15:31:00,Business,latest,Aarti Drugs Rallies After Board Approves Rs 60...,"Aarti Drugs share buyback price of Rs 1,000 is...",Shares of drug maker Aarti Druges rose as much...,Abhishek Vasudev,https://www.ndtv.com/business/aarti-drugs-shar...,29,Bu_22408
2,4356,606b29313c6b2f2a08ab1ecd,2021-03-17 16:46:00,Sports,Other-sports,"Von Miller Net Worth, NFL 2021 Salary And Late...","Von Miller net worth: According to sources, th...",The Denver Broncos will apparently be exercisi...,Devika pawar,https://www.republicworld.com/sports-news/othe...,37,Sp_19508
3,3860,606b29313c6b2f2a08ab222f,2021-03-17 15:46:00,Technology,Mobile,Samsung Brings Android 11 And One UI 3.1 Updat...,Samsung is one of the biggest Android smartpho...,Samsung is one of the most reliable and popula...,Sakshat kolhatkar,https://www.republicworld.com/technology-news/...,37,Te_20374
6,1605,606b29313c6b2f2a08ab0c25,2021-03-17 15:04:00,India,Education,NHM MP Recruitment 2021: Apply Now For 102 Vac...,The National Health Mission of Madhya Pradesh ...,NHM MP Recruitment: The National Health Missio...,Disha kandpal,https://www.republicworld.com/india-news/educa...,39,In_14732
5,1797,606b29313c6b2f2a08ab12f6,2021-03-17 17:58:00,World,Rest-of-the-world-news,Blinken Meets SKorean Counterpart Chung Eui-yong,"Fresh off a stop in Tokyo, President Joe Biden...","Fresh off a stop in Tokyo, President Joe Biden...",Associated press television news,https://www.republicworld.com/world-news/rest-...,68,Wo_16477
9,473,606b29303c6b2f2a08ab0193,2021-03-17 10:24:00,Lifestyle,Celebrity,Milind Soman shares loved-up selfie with Ankit...,Ankita Konwar recently shared that she went fo...,Ankita Konwar and Milind Soman never take thei...,Krishna priya pallavi,https://www.indiatoday.in/lifestyle/celebrity/...,44,Li_12026
10,1720,606b29303c6b2f2a08ab07d6,2021-03-16 15:55:00,Miscellaneous,New-launches,"2021 Bentley Bentayga launched in India, price...",The new Bentayga is the first car launched und...,Bentley has been at the forefront of luxury au...,Pratik rakshit,https://www.indiatoday.in/auto/new-launches/st...,45,Au_13629
8,696,606b29303c6b2f2a08ab04a2,2021-03-16 11:36:00,Education,Notification,"NIOS Class 10, 12 results announced, check det...",NIOS has declared the results for the public e...,The National Institute of Open Schooling (NIOS...,India today web desk,https://www.indiatoday.in/education-today/noti...,43,Ed_12809
7,774,606b292f3c6b2f2a08aae381,2021-03-13 01:47:00,Society,politics,Kerala Polls: First ever woman candidate of IU...,As per a study conducted by Inter-Parliamentar...,It took Indian Union Muslim League (IUML) 25 y...,Manpriya Khurana,https://www.ibtimes.co.in/kerala-polls-first-e...,49,So_4328
