In [25]:
import pandas as pd

# import asyncio
# import nest_asyncio
# nest_asyncio.apply()

from request_tv_maze import request_shows, request_search

from sklearn.feature_extraction.text import TfidfVectorizer

# import plotly

In [26]:
def vectorizer_func(
    df: pd.DataFrame,
    column: str = "features",
    max_features: int = 50,
    stop_words="english",
):
    """Turn one text column of ``df`` into a dense TF-IDF matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the text to vectorize.
    column : str, default "features"
        Name of the text column to read. The default matches the column this
        notebook builds before calling the function.
    max_features : int, default 50
        Forwarded to ``TfidfVectorizer(max_features=...)``.
    stop_words : default "english"
        Forwarded to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    tuple
        ``(matrix, feature_names)`` where ``matrix`` is the dense array
        produced by ``fit_transform(...).toarray()`` (one row per row of
        ``df``) and ``feature_names`` is the result of
        ``vectorizer.get_feature_names_out()``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words)

    # A NaN entry is not a valid document for the vectorizer, so missing text
    # is treated as an empty string instead of aborting the whole fit.
    documents = df[column].fillna("").astype(str)

    tfidf_matrix = vectorizer.fit_transform(documents)

    return tfidf_matrix.toarray(), vectorizer.get_feature_names_out()


In [27]:
# Settings for the catalogue download in the next cell.
# The fetched frame reports 200 rows, i.e. pages x limit.
extended_level = "full"  # passed through to request_shows unchanged
limit = 10
pages = 20

In [28]:
# Download the show catalogue using the settings from the configuration cell.
shows = request_shows(
    pages=pages,
    limit=limit,
    extended_level=extended_level,
)

# Flatten the returned records into a tabular frame for the steps below.
shows_df = pd.json_normalize(shows)

In [29]:
# Preview the first rows of the freshly fetched catalogue.
shows_df.head()

Unnamed: 0,title,language,genres,summary
0,The Day of the Jackal,English,"[drama, action, adventure, mystery]",An unrivalled and highly elusive lone assassin...
1,From,Korean,"[mystery, fantasy, science-fiction, suspense, ...",Unravel the mystery of a nightmarish town in m...
2,Silo,English,"[drama, science-fiction]","In a ruined and toxic future, thousands live i..."
3,Dune: Prophecy,English,"[fantasy, science-fiction, drama, action, adve...",Ten thousand years before the ascension of Pau...
4,Yellowstone,English,"[drama, western]","Follow the violent world of the Dutton family,..."


In [30]:
# Column dtypes and non-null counts. In the captured output `language` and
# `summary` have fewer non-null entries than rows, which is why the feature
# text built later fills missing values with "".
shows_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     200 non-null    object
 1   language  180 non-null    object
 2   genres    200 non-null    object
 3   summary   199 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [31]:
# Title to get recommendations for. It is lower-cased once here because the
# final cell compares it against lower-cased catalogue titles.
media_type = "shows"
query = "Dandadan".lower()

# Look the title up and flatten the result into a one-row frame that has the
# same shape of columns as the catalogue (plus `type`, per the output below).
new_show = request_search(query, media_type)
new_show_df = pd.json_normalize(new_show)

new_show_df.head()

{'id': 73023, 'url': 'https://www.tvmaze.com/shows/73023/dandadan', 'name': 'Dandadan', 'type': 'Animation', 'language': 'Japanese', 'genres': [], 'status': 'Running', 'runtime': 24, 'averageRuntime': 24, 'premiered': '2024-10-03', 'ended': None, 'officialSite': 'https://anime-dandadan.com', 'schedule': {'time': '00:26', 'days': ['Thursday']}, 'rating': {'average': 7.3}, 'weight': 97, 'network': None, 'webChannel': {'id': 135, 'name': 'AbemaTV', 'country': {'name': 'Japan', 'code': 'JP', 'timezone': 'Asia/Tokyo'}, 'officialSite': None}, 'dvdCountry': None, 'externals': {'tvrage': None, 'thetvdb': 432832, 'imdb': None}, 'image': {'medium': 'https://static.tvmaze.com/uploads/images/medium_portrait/537/1344868.jpg', 'original': 'https://static.tvmaze.com/uploads/images/original_untouched/537/1344868.jpg'}, 'summary': '<p>When high schooler Momo, from a family of spirit mediums, first meets her classmate Okarun, an occult geek, they argue—Momo believes in ghosts but denies aliens, and Okar

Unnamed: 0,title,type,language,genres,summary
0,Dandadan,Animation,Japanese,[],"When high schooler Momo, from a family of spir..."


In [32]:
# Append the searched show to the catalogue unless it is already listed.
#
# `value in series` tests the Series *index labels*, not its values, so the
# previous `not in shows_df.title` check was always true and re-running the
# cell appended the show again. `isin` compares against the stored titles.
already_listed = shows_df["title"].isin([new_show["title"]]).any()

if not already_listed:
    shows_df = pd.concat([shows_df, new_show_df], ignore_index=True, axis=0)
else:
    # Keep the column layout identical to the appended case: later cells read
    # columns that only the search result provides (e.g. "type").
    extra_columns = [col for col in new_show_df.columns if col not in shows_df.columns]
    shows_df = shows_df.reindex(columns=[*shows_df.columns, *extra_columns])

# Shown at top level so the notebook actually renders it; an expression nested
# inside the `if` body is never displayed.
shows_df.tail()

In [33]:
def _genres_to_text(value):
    """Return a genres entry as one lower-cased, space-separated string."""
    if isinstance(value, list):
        return " ".join(value).lower()
    if isinstance(value, str):
        return value.lower()
    # Anything else (e.g. a missing value) contributes no genre text.
    return ""


# Free-text part of the feature string; missing values become "".
text_parts = shows_df[["type", "language", "summary"]].fillna("")
features = text_parts["type"] + " " + text_parts["language"] + " " + text_parts["summary"]

# Replace the genres column with its flattened text form.
genres_unpacked = shows_df["genres"].apply(_genres_to_text)
shows_df["genres"] = genres_unpacked

# Final feature text: type, language, summary, then genres.
features = features + " " + genres_unpacked

print(features)

shows_df["features"] = features

0       English An unrivalled and highly elusive lone...
1       Korean Unravel the mystery of a nightmarish t...
2       English In a ruined and toxic future, thousan...
3       English Ten thousand years before the ascensi...
4       English Follow the violent world of the Dutto...
                             ...                        
196      In a mysterious shop that sells lamps, the d...
197     English After the fall of the Galactic Empire...
198     English Hundreds of cash-strapped players acc...
199     English The adventures of a late-20th-century...
200    Animation Japanese When high schooler Momo, fr...
Length: 201, dtype: object


In [34]:
# Re-check the frame after the feature step: `genres` now holds flattened
# strings and the `type` and `features` columns are present.
shows_df.head()

Unnamed: 0,title,language,genres,summary,type,features
0,The Day of the Jackal,English,drama action adventure mystery,An unrivalled and highly elusive lone assassin...,,English An unrivalled and highly elusive lone...
1,From,Korean,mystery fantasy science-fiction suspense horro...,Unravel the mystery of a nightmarish town in m...,,Korean Unravel the mystery of a nightmarish t...
2,Silo,English,drama science-fiction,"In a ruined and toxic future, thousands live i...",,"English In a ruined and toxic future, thousan..."
3,Dune: Prophecy,English,fantasy science-fiction drama action adventure,Ten thousand years before the ascension of Pau...,,English Ten thousand years before the ascensi...
4,Yellowstone,English,drama western,"Follow the violent world of the Dutton family,...",,English Follow the violent world of the Dutto...


In [35]:
# TF-IDF encode the combined feature text: one row per show, one column per term.
vectorized_features, feature_names = vectorizer_func(shows_df)

# Sanity check: the retained vocabulary and the weights of the first show.
print(feature_names)

print(vectorized_features[0])

# Wrap the matrix in a frame labelled by term, then place it next to the
# metadata columns so each show carries its own weights.
vectorized_df = pd.DataFrame(data=vectorized_features, columns=feature_names)

shows_df_concat = pd.concat([shows_df, vectorized_df], axis="columns")

shows_df_concat.head()

['action' 'adventure' 'animation' 'brilliant' 'cases' 'city' 'comedy'
 'country' 'crime' 'criminal' 'day' 'drama' 'english' 'family' 'fantasy'
 'father' 'fbi' 'fiction' 'follow' 'follows' 'future' 'group' 'help'
 'high' 'horror' 'life' 'lives' 'make' 'man' 'mystery' 'new' 'old'
 'people' 'police' 'reality' 'school' 'science' 'series' 'set' 'story'
 'team' 'thriller' 'town' 'war' 'way' 'work' 'world' 'years' 'york'
 'young']
[0.52822352 0.54169886 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.30061367
 0.2503892  0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.52390737
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


Unnamed: 0,title,language,genres,summary,type,features,action,adventure,animation,brilliant,...,team,thriller,town,war,way,work,world,years,york,young
0,The Day of the Jackal,English,drama action adventure mystery,An unrivalled and highly elusive lone assassin...,,English An unrivalled and highly elusive lone...,0.528224,0.541699,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,From,Korean,mystery fantasy science-fiction suspense horro...,Unravel the mystery of a nightmarish town in m...,,Korean Unravel the mystery of a nightmarish t...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.393279,0.0,0.379606,0.0,0.0,0.0,0.0,0.0
2,Silo,English,drama science-fiction,"In a ruined and toxic future, thousands live i...",,"English In a ruined and toxic future, thousan...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Dune: Prophecy,English,fantasy science-fiction drama action adventure,Ten thousand years before the ascension of Pau...,,English Ten thousand years before the ascensi...,0.312665,0.320641,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411837,0.0,0.0
4,Yellowstone,English,drama western,"Follow the violent world of the Dutton family,...",,English Follow the violent world of the Dutto...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.397322,0.0,0.0,0.0


In [36]:
from sklearn.neighbors import NearestNeighbors

# Fit on the TF-IDF columns only. `vectorized_df` is exactly that matrix, so it
# is used directly instead of slicing `shows_df_concat` by position: a
# positional slice such as `.iloc[:, 6:]` silently drops a term column or pulls
# in a text column whenever the number of metadata columns changes.
knn_df = vectorized_df

# Cosine distance compares the direction of the TF-IDF vectors.
rec = NearestNeighbors(metric = 'cosine')
rec.fit(knn_df)

In [37]:
# Query the model with the same matrix it was fitted on. In the printed
# results each row's first neighbour is the row itself (distance ~0), so
# n_neighbors=10 leaves nine other shows per row.
distances, indices = rec.kneighbors(knn_df, n_neighbors=10) # Number of recommendations

In [38]:
# Cosine distances to each row's neighbours, one row per show, ascending.
print(distances)

[[0.00000000e+00 2.03307605e-01 2.45249713e-01 ... 4.03331811e-01
  4.19446314e-01 4.23131363e-01]
 [0.00000000e+00 2.06674781e-01 3.06487681e-01 ... 4.72083299e-01
  4.72083299e-01 4.89278037e-01]
 [0.00000000e+00 2.35843537e-01 2.74113111e-01 ... 4.45416406e-01
  5.09555068e-01 5.18957295e-01]
 ...
 [0.00000000e+00 2.89634142e-01 3.41606438e-01 ... 4.81118174e-01
  4.88228197e-01 5.08434862e-01]
 [1.11022302e-16 4.36298349e-01 4.85743471e-01 ... 5.21598572e-01
  5.32989527e-01 5.45517888e-01]
 [0.00000000e+00 3.95271749e-01 4.36734103e-01 ... 5.58048162e-01
  5.66678820e-01 6.02889502e-01]]


In [39]:
# Row positions of each show's neighbours, aligned element-wise with `distances`.
print(indices)

[[  0  84  43 ...  50  95 180]
 [  1  95 177 ... 134 197 139]
 [  2  96  78 ... 115 152 104]
 ...
 [198  30 104 ...  84 151  43]
 [199  10 168 ...  22  57  96]
 [200  66  35 ... 140 118  15]]


In [40]:
results = shows_df["title"].copy()

# Dump the neighbour list of every show; row i of `indices` and `distances`
# describes the show stored at position i of `shows_df`.
for i in range(len(indices)):
    neighbor_indices, neighbor_distances = indices[i], distances[i]
    show_title = shows_df["title"].iloc[i]
    print(f"Nearest neighbors for '{show_title}':")
    for idx, dist in zip(neighbor_indices, neighbor_distances):
        neighbor_title = shows_df["title"].iloc[idx]
        print(f"  Neighbor: {neighbor_title} (Distance: {dist:.2f})")
    print()

Nearest neighbors for 'The Day of the Jackal':
  Neighbor: The Day of the Jackal (Distance: 0.00)
  Neighbor: Outer Banks (Distance: 0.20)
  Neighbor: Lost (Distance: 0.25)
  Neighbor: S.W.A.T. (Distance: 0.26)
  Neighbor: House of the Dragon (Distance: 0.29)
  Neighbor: NCIS: Los Angeles (Distance: 0.29)
  Neighbor: Squid Game (Distance: 0.35)
  Neighbor: The Curse of Oak Island (Distance: 0.40)
  Neighbor: Stranger Things (Distance: 0.42)
  Neighbor: The Umbrella Academy (Distance: 0.42)

Nearest neighbors for 'From':
  Neighbor: From (Distance: 0.00)
  Neighbor: Stranger Things (Distance: 0.21)
  Neighbor: American Horror Story (Distance: 0.31)
  Neighbor: Dark (Distance: 0.32)
  Neighbor: Agatha All Along (Distance: 0.36)
  Neighbor: Teacup (Distance: 0.39)
  Neighbor: Supernatural (Distance: 0.43)
  Neighbor: Star Trek: Deep Space Nine (Distance: 0.47)
  Neighbor: The Mandalorian (Distance: 0.47)
  Neighbor: The Walking Dead: Daryl Dixon (Distance: 0.49)

Nearest neighbors for 'Si

In [41]:
# import pandas as pd
# 
# # Create a DataFrame for visualization
# neighbors_df = pd.DataFrame({
#     "Item": knn_df.index.repeat(indices.shape[1]),
#     "Neighbor": indices.flatten(),
#     "Distance": distances.flatten()
# })
# 
# # Create a pivot table
# pivot_table = neighbors_df.pivot_table(index="Item", columns="Neighbor", values="Distance", aggfunc="mean")
# 
# print(pivot_table)

In [42]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# 
# # Using the pivot table created above
# plt.figure(figsize=(20, 10))
# sns.heatmap(pivot_table, cmap="coolwarm", annot=False, cbar=True)
# plt.title("KNN Distances Heatmap")
# plt.show()

In [43]:
# neighbors_data = []
# 
# for i, (neighbor_indices, neighbor_distances) in enumerate(zip(indices, distances)):
#     neighbors_data.append({
#         "Title": shows_df["title"].iloc[i],
#         "Neighbors": [(shows_df["title"].iloc[idx], dist) for idx, dist in zip(neighbor_indices, neighbor_distances)]
#     })
# 
# neighbors_df = pd.DataFrame(neighbors_data)
# print(neighbors_df)


In [44]:
# from matplotlib import pyplot as plt
# 
# point_index = 0  # Replace with the desired index
# neighbor_titles = [shows_df["title"].iloc[idx] for idx in indices[point_index]]
# neighbor_distances = distances[point_index]
# 
# plt.figure(figsize=(10, 6))
# plt.scatter(neighbor_titles, neighbor_distances, color='skyblue')
# plt.xticks(rotation=45, ha='right')
# plt.xlabel("Neighbor Titles")
# plt.ylabel("Distances")
# plt.title(f"Distances to Nearest Neighbors for '{shows_df['title'].iloc[point_index]}'")
# plt.tight_layout()
# plt.show()


In [45]:
# Print recommendations only for the show(s) whose title matches `query`.
for idx, show_index in enumerate(indices):

    # Case-insensitive match: `query` was lower-cased when it was defined.
    if shows_df.iloc[idx]["title"].lower() == query:
        target = shows_df_concat.iloc[idx]

        print(f"Similar items to show {target['title']}: ")
        print(str(target["features"]))
        print()

        # Position 0 is skipped (the closest hit, normally the show itself);
        # any remaining entry with the same title is filtered out as well.
        for neighbor_idx, neighbor_distance in zip(show_index[1:], distances[idx][1:]):
            neighbor = shows_df_concat.iloc[neighbor_idx]
            if neighbor["title"] == target["title"]:
                continue
            print(f"{neighbor['title']}: {str(neighbor['genres'])} (Distance: {neighbor_distance:.3f})")
        print()

Similar items to show Dandadan: 
Animation Japanese When high schooler Momo, from a family of spirit mediums, first meets her classmate Okarun, an occult geek, they argue—Momo believes in ghosts but denies aliens, and Okarun believes in aliens but denies ghosts. When it turns out both phenomena are real, Momo awakens a hidden power and Okarun gains the power of a curse. Together, they must challenge the paranormal forces threatening their world. 

American Dad!: animation comedy (Distance: 0.395)
Family Guy: animation comedy (Distance: 0.437)
Yellowstone: drama western (Distance: 0.493)
The Simpsons: animation comedy family (Distance: 0.547)
Bob's Burgers: animation comedy (Distance: 0.549)
King of the Hill: animation comedy family (Distance: 0.557)
Bluey: animation children comedy (Distance: 0.558)
Peaky Blinders: drama crime (Distance: 0.567)
Modern Family: comedy (Distance: 0.603)

