In [43]:
import pandas as pd

# Read the articles CSV into a DataFrame.
# The encoding is passed explicitly so the file is decoded as Latin-1
# instead of pandas' UTF-8 default.
df = pd.read_csv(
    filepath_or_buffer='articles_075040.csv',
    encoding='latin-1',
)


In [44]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Title,URL,Content
0,Chevrons lose t20i series opener in bangladesh,https://www.chronicle.co.zw/chevrons-lose-t20i...,ZIMBABWE senior mens national cricket team go...
1,Vice president mohadi arrives for public priva...,https://www.chronicle.co.zw/vice-president-moh...,Rutendo Nyeve and Nqobile Bhebhe Chronicle Wri...
2,President mnangagwa arrives at the zimbabwe mi...,https://www.chronicle.co.zw/president-mnangagw...,"Patrick Chitumba, Online Reporter PRESIDENT Mn..."
3,Four players to serve one psl game suspension ...,https://www.chronicle.co.zw/four-players-to-se...,"Fungai Muderere, Senior Sports Reporter FOUR C..."
4,Divine lunga wins another psl title in south a...,https://www.chronicle.co.zw/divine-lunga-wins-...,"Fungai Muderere, Senior Sports Reporter ZIMBAB..."


In [45]:
# Show the column labels; get_recommendations() looks up the 'Title' column
# by this exact (case-sensitive) name.
print(df.columns)

Index(['Title', 'URL', 'Content'], dtype='object')


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Replace missing article bodies with '' so the vectorizer never receives NaN.
df['Content'] = df['Content'].fillna('')

# Build TF-IDF features from the article text, dropping English stop words.
# Row i of tfidf_matrix corresponds to row i of df.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['Content'])

In [47]:
# Pairwise article similarity. linear_kernel is a plain dot product, which
# equals cosine similarity here because TfidfVectorizer L2-normalises every
# row by default (no `norm` override was given when `tfidf` was created).
# With a single argument the matrix is compared against itself.
cosine_sim = linear_kernel(tfidf_matrix)


In [48]:
# Display the similarity matrix: entry [i, j] is the similarity between the
# Content of article i and article j (the diagonal is each article vs. itself).
cosine_sim

array([[1.00000000e+00, 7.46614427e-03, 6.50733847e-03, 6.85188956e-02,
        2.60775204e-02, 6.86641433e-03, 2.30816713e-03, 7.46614427e-03,
        8.56002191e-03, 1.23689172e-02, 2.44003810e-02, 7.56186392e-03,
        2.72660327e-02, 7.59516380e-03, 1.10674502e-02, 1.35399267e-02,
        8.83196429e-03, 8.40153182e-03, 5.52791525e-03, 1.71605374e-02,
        1.58732077e-02],
       [7.46614427e-03, 1.00000000e+00, 6.76621697e-02, 1.43072884e-02,
        0.00000000e+00, 5.28681470e-02, 6.03354045e-01, 1.00000000e+00,
        3.05377423e-02, 3.93323306e-02, 1.52655696e-02, 5.44561364e-02,
        5.70574751e-02, 6.71293051e-02, 1.56361511e-02, 8.92550944e-02,
        1.10826868e-02, 1.33837492e-02, 7.65900758e-02, 3.49066950e-02,
        5.34213968e-04],
       [6.50733847e-03, 6.76621697e-02, 1.00000000e+00, 2.17613072e-02,
        1.50285138e-02, 9.46164861e-01, 4.34119870e-02, 6.76621697e-02,
        1.65556752e-02, 1.27944207e-02, 2.84642407e-02, 2.42331186e-02,
        3.7964

In [50]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the articles most similar to the one named ``title``.

    Parameters
    ----------
    title : str
        Article title to look up. It is compared case-insensitively with
        ``df['Title']``; if several rows share the title, the first one is used.
    cosine_sim : array-like, optional
        Square similarity matrix in which row/column ``i`` refers to the
        ``i``-th row (by position) of ``df``. Defaults to the module-level
        ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return. Defaults to 10.

    Returns
    -------
    pandas.Series or str
        The ``Title`` values of the ``top_n`` most similar articles, ordered
        from most to least similar and never containing the queried article
        itself. If the title is not present, the string
        ``"Title not found in the dataset."`` is returned; any other failure
        is returned as the exception's message string.
    """
    try:
        # Work with row *positions*, not index labels: cosine_sim rows and
        # .iloc below are positional, so this stays correct even when df has
        # a non-default index (e.g. after filtering without reset_index).
        matches = (df['Title'].str.lower() == title.lower()).to_numpy()
        positions = matches.nonzero()[0]
        if len(positions) == 0:
            return "Title not found in the dataset."
        idx = int(positions[0])

        # Drop the queried article explicitly rather than assuming it sorts
        # first: an exact duplicate also scores ~1.0 and can outrank it, in
        # which case slicing off element 0 would discard the duplicate and
        # recommend the queried article to itself.
        ranked = sorted(
            ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
            key=lambda pair: pair[1],
            reverse=True,
        )
        article_indices = [pos for pos, _ in ranked[:top_n]]

        return df['Title'].iloc[article_indices]
    except Exception as e:
        # Preserve the original contract: failures are reported as a string
        # instead of being raised.
        return str(e)

In [52]:
# Example lookup: the title is matched case-insensitively, and the result is
# either a Series of similar article titles or an error-message string.
print(get_recommendations('President mnangagwa presides over parade of regular officer cadet course'))

2     President mnangagwa arrives at the zimbabwe mi...
1     Vice president mohadi arrives for public priva...
7     Vice president mohadi arrives for public priva...
6                 Vp mohadi to headline business indaba
10    Delta corporation launches smart drinking and ...
11    Zimplow holdings mealie brand capacitation pro...
12    Zimbabwe investment summit kicks off in south ...
16                           Econet breakthrough launch
3     Four players to serve one psl game suspension ...
14                        Zimplats retrench 67 in april
Name: Title, dtype: object
