In [None]:
import numpy as np

# Two 13-element binary vectors; a 1 marks a position that is "on" for that document.
doc1 = np.array([1] * 5 + [0] * 8)
doc4 = np.array([0, 0, 0, 1, 1] + [0] * 6 + [1, 1])

# For 0/1 vectors the dot product counts the positions that are 1 in both.
dot_product = doc1 @ doc4
print(dot_product)

2


## Numpy - Data creation: Numerical Python
## Pandas - Data manipulation: filtering, grouping, indexing, slicing, 2D (structured data)
## Matplotlib - Data visualization
## Scikit-learn - Predictive modeling: ML algorithms

In [None]:
# %pip install scikit-learn  # uncomment if scikit-learn is missing; %pip installs into this kernel's environment

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Toy corpus: four documents, each described by a space-separated list of keywords.
data = dict(
    doc_id=[1, 2, 3, 4],
    text=[
        "ninja fighting power friendship adventure training",
        "fighting power supersaiyan soul",
        "fighting reaper sword spirit",
        "friendship adventure pirate treasure",
    ],
)

# Tabular view of the corpus: one row per document, columns doc_id and text.
df = pd.DataFrame(data)

In [None]:
#df = pd.read_csv(".csv")
# pd.read_excel("")  # Excel (.xlsx) files are read with read_excel; pandas has no read_xlsx

In [None]:
df

Unnamed: 0,doc_id,text
0,1,ninja fighting power friendship adventure trai...
1,2,fighting power supersaiyan soul
2,3,fighting reaper sword spirit
3,4,friendship adventure pirate treasure


In [None]:
# Fit the vectorizer on the text column and encode every document in one step.
# The result is a sparse matrix with one row per document and one column per term.
corpus = df["text"]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

In [None]:
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 18 stored elements and shape (4, 13)>
  Coords	Values
  (0, 3)	0.4838099584718287
  (0, 1)	0.3088096294175862
  (0, 5)	0.3814413283612338
  (0, 2)	0.3814413283612338
  (0, 0)	0.3814413283612338
  (0, 11)	0.4838099584718287
  (1, 1)	0.3667466683744504
  (1, 5)	0.4530050977381881
  (1, 9)	0.5745795256791941
  (1, 7)	0.5745795256791941
  (2, 1)	0.3457831381910465
  (2, 6)	0.5417361046803605
  (2, 10)	0.5417361046803605
  (2, 8)	0.5417361046803605
  (3, 2)	0.43779123108611473
  (3, 0)	0.43779123108611473
  (3, 4)	0.5552826649411127
  (3, 12)	0.5552826649411127


In [None]:
# Terms learned by the vectorizer; feature_names[j] is used below as the label
# for column j of tfidf_matrix.
feature_names = vectorizer.get_feature_names_out()

In [None]:
feature_names

array(['adventure', 'fighting', 'friendship', 'ninja', 'pirate', 'power',
       'reaper', 'soul', 'spirit', 'supersaiyan', 'sword', 'training',
       'treasure'], dtype=object)

In [None]:
# Print every non-zero TF-IDF weight as "(document row, term) weight".
# nonzero() returns parallel arrays of row and column indices, so zipping them
# yields one (row, col) pair per non-zero entry; no running counter is needed.
for row, col in zip(*tfidf_matrix.nonzero()):
    print(f"({row}, {feature_names[col]}) {tfidf_matrix[row, col]}")

(0, ninja) 0.4838099584718287
(0, fighting) 0.3088096294175862
(0, power) 0.3814413283612338
(0, friendship) 0.3814413283612338
(0, adventure) 0.3814413283612338
(0, training) 0.4838099584718287
(1, fighting) 0.3667466683744504
(1, power) 0.4530050977381881
(1, supersaiyan) 0.5745795256791941
(1, soul) 0.5745795256791941
(2, fighting) 0.3457831381910465
(2, reaper) 0.5417361046803605
(2, sword) 0.5417361046803605
(2, spirit) 0.5417361046803605
(3, friendship) 0.43779123108611473
(3, adventure) 0.43779123108611473
(3, pirate) 0.5552826649411127
(3, treasure) 0.5552826649411127


In [None]:
# Dense, labelled view of the TF-IDF matrix: rows are documents, columns are terms.
display_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
display_df

Unnamed: 0,adventure,fighting,friendship,ninja,pirate,power,reaper,soul,spirit,supersaiyan,sword,training,treasure
0,0.381441,0.30881,0.381441,0.48381,0.0,0.381441,0.0,0.0,0.0,0.0,0.0,0.48381,0.0
1,0.0,0.366747,0.0,0.0,0.0,0.453005,0.0,0.57458,0.0,0.57458,0.0,0.0,0.0
2,0.0,0.345783,0.0,0.0,0.0,0.0,0.541736,0.0,0.541736,0.0,0.541736,0.0,0.0
3,0.437791,0.0,0.437791,0.0,0.555283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555283


In [None]:
# Pairwise cosine similarity between all documents. With a single argument the
# matrix is compared against itself, giving a square documents-by-documents result.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
cosine_sim.shape

(4, 4)

In [None]:
# Label both axes of the similarity matrix with document ids so that rows and
# columns can be looked up by doc_id instead of by position.
cosine_df = pd.DataFrame(
    data=cosine_sim,
    index=df["doc_id"],
    columns=df["doc_id"],
)

In [None]:
cosine_df

doc_id,1,2,3,4
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,0.28605,0.106781,0.333983
2,0.28605,1.0,0.126815,0.0
3,0.106781,0.126815,1.0,0.0
4,0.333983,0.0,0.0,1.0


In [None]:
def recommend(doc_id, top_n=2, include_self=True):
    """Return the documents most similar to ``doc_id``, best match first.

    Similarity scores are read from the module-level ``cosine_df`` frame, whose
    rows and columns are both labelled by document id.

    Parameters
    ----------
    doc_id : label
        Id of the query document; must be a row label of ``cosine_df``.
    top_n : int, default 2
        Number of *other* documents to return. Values larger than the number
        of available documents simply return all of them.
    include_self : bool, default True
        When True the query document's own score is placed first, followed by
        the ``top_n`` matches (``top_n + 1`` entries in total). Pass False to
        get only the recommendations.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by document id, in descending order of
        similarity (after the optional leading query entry).

    Raises
    ------
    KeyError
        If ``doc_id`` is not a row label of ``cosine_df``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    scores = cosine_df.loc[doc_id]

    # Rank only the *other* documents. Removing the query by label (rather than
    # assuming it sorts to position 0) stays correct when a duplicate document
    # ties with, or through rounding exceeds, the self-similarity score.
    # mergesort is stable, so tied scores keep their original order.
    ranked = scores.drop(labels=doc_id).sort_values(ascending=False, kind="mergesort")
    labels = list(ranked.index[:top_n])

    if include_self:
        labels.insert(0, doc_id)
    return scores.loc[labels]

In [None]:
print(recommend(1, top_n=2))

doc_id
1    1.000000
4    0.333983
2    0.286050
Name: 1, dtype: float64
