In [1]:
import pandas as pd
import numpy as np


### Data reading

In [2]:
df=pd.read_csv('podcasts_csv.csv')
df.head()

Unnamed: 0,uuid,title,image,description,language,categories,website,author,itunes_id
0,8d62d3880db2425b890b986e58aca393,"Ecommerce Conversations, by Practical Ecommerce",http://is4.mzstatic.com/image/thumb/Music6/v4/...,Listen in as the Practical Ecommerce editorial...,English,Technology,http://www.practicalecommerce.com,Practical Ecommerce,874457373
1,cbbefd691915468c90f87ab2f00473f9,Eat Sleep Code Podcast,http://is4.mzstatic.com/image/thumb/Music71/v4...,On the show we’ll be talking to passionate peo...,English,Tech News | Technology,http://developer.telerik.com/,Telerik,1015556393
2,73626ad1edb74dbb8112cd159bda86cf,SoundtrackAlley,http://is5.mzstatic.com/image/thumb/Music71/v4...,A podcast about soundtracks and movies from my...,English,Podcasting | Technology,https://soundtrackalley.podbean.com,Randy Andrews,1158188937
3,0f50631ebad24cedb2fee80950f37a1a,The Tech M&A Podcast,http://is1.mzstatic.com/image/thumb/Music71/v4...,The Tech M&A Podcast pulls from the best of th...,English,Business News | Technology | Tech News | Business,http://www.corumgroup.com,Timothy Goddard,538160025
4,69580e7b419045839ca07af06cf0d653,"The Tech Informist - For fans of Apple, Google...",http://is4.mzstatic.com/image/thumb/Music62/v4...,The tech news show with two guys shooting the ...,English,Gadgets | Tech News | Technology,http://techinformist.com,The Tech Informist,916080498


# Preprocessing

In [3]:
df.shape

(121175, 9)

In [4]:
df.columns

Index(['uuid', 'title', 'image', 'description', 'language', 'categories',
       'website', 'author', 'itunes_id'],
      dtype='object')

In [5]:
# removing unwanted data
df2=df[['uuid','title','description','language','categories']]


In [6]:
# ealing with null values
df2.isnull().sum()

uuid              0
title             2
description    1343
language          0
categories        0
dtype: int64

In [7]:
df2.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.dropna(inplace=True)


In [8]:
df2.isnull().sum()

uuid           0
title          0
description    0
language       0
categories     0
dtype: int64

In [9]:
df2.shape

(119830, 5)

In [10]:
# manipulating duplicate data
df3=df2.drop_duplicates(subset='title', keep='first')
df3.shape

(117049, 5)

### cleaning our data

In [11]:
df3=df3[df3.title != 'No Title']

In [12]:
df3=df3[~df3['title'].str.contains('<', case=False)]

In [13]:
# we keep only english podcasts
df3=df3[df3['language'] == 'English']

In [14]:
# in this step we remove all rows data have title containing alphabets other than english ones like arabis, chinese...etc
english_alphabet_pattern =r'^[a-zA-Z0-9*|\-+#~()éèêôû`,.:;’»<>/!?&_@%–\[\]\'\"\s]+$'
df4=df3[df3['title'].str.contains(english_alphabet_pattern, case=False, na=False)]


In [15]:
df4=df4[~df4['title'].str.contains('www', case=False, na=False)]


In [16]:
df4.sample(11)


Unnamed: 0,uuid,title,description,language,categories
60870,37ba4dd2acad4e998e20602210e6c43a,Podcasts – Resurgence | Official Website,"In the summer of 2008, in Edmonton, Alberta, C...",English,Religion & Spirituality | Christianity | Arts
38280,e33a2dc009894a64b9e465b24a3c13ed,Sermons – Pine Valley Church,the message and ministry podcast of Pine Valle...,English,Christianity | Religion & Spirituality
4895,8ca13784ab1f470aa686ca772e051901,Backyard Booking Podcast,A weekly podcast hosted by Jake and Thoma. Thi...,English,Video Games | Games & Hobbies | Sports & Recre...
74780,c55f6b9eec054a43b259f2e779881478,Wildfood from the Rangelands,Sustainable agriculture meets Aboriginal land ...,English,Natural Sciences | Comedy | Personal Journals ...
4815,a79ac4eec52a4bf4ab4a0d6645f97bb4,52weeks52books52women,"So many books, so little time. Companion podc...",English,Arts
76718,65f4f69f004b4434b30563e90b07ac5e,That Drummer Guy,That Drummer Guy Presents\n\nEVERYDAY at 4-6AM...,English,Society & Culture
15850,57980f7a30bc42609db06729c288c647,BocaLead Podcast,BocaLead works with the business community to ...,English,Business | Religion & Spirituality | Careers
45776,78e59df65d88438887e98605deac9842,Redeemer City Church - Sunday Messages,Weekly Podcast from Redeemer City Church. Rede...,English,Christianity | Religion & Spirituality
37463,60d75f9e7c0c48c3bc7ccb8b559790c5,Wednesday Night Gentlemen Official Podcast,"A show about movies, tv shows, comic books, vi...",English,TV & Film
46761,a82c2840c0b448be805f3d1d15e8cb95,Let Me Ask You A Question,A show where we always pose various inquiries ...,English,Personal Journals | Society & Culture | Perfor...


In [17]:
df4['categories'] = df4['categories'].apply(lambda val: val.lower())

In [18]:
df4.categories.value_counts()

religion & spirituality | christianity                                                     8533
christianity | religion & spirituality                                                     7032
music                                                                                      5406
comedy                                                                                     5016
society & culture                                                                          4315
                                                                                           ... 
other | religion & spirituality | spirituality | society & culture | science & medicine       1
literature | society & culture | arts | education | performing arts                           1
business | arts | food | society & culture                                                    1
technology | gadgets | games & hobbies | video games | other games                            1
tv & film | arts | podcasting | technolo

In [19]:
df4['categories'] = df4['categories'].apply(lambda val: val.replace('&','|'))

In [20]:
df4.categories.value_counts()

religion | spirituality | christianity                                                     8533
christianity | religion | spirituality                                                     7032
music                                                                                      5406
comedy                                                                                     5016
society | culture                                                                          4315
                                                                                           ... 
other | religion | spirituality | spirituality | society | culture | science | medicine       1
literature | society | culture | arts | education | performing arts                           1
business | arts | food | society | culture                                                    1
technology | gadgets | games | hobbies | video games | other games                            1
tv | film | arts | podcasting | technolo

In [21]:
df4['categories'] = df4['categories'].apply(lambda val: 'religion | spirituality | christianity' if val == 'christianity | religion | spirituality' else val)

In [22]:
df4.shape

(93792, 5)

In [23]:
df5=df4.copy()
# this line create another if column instead of uuid column 
df5['id'] = range(1, len(df5) + 1)

In [24]:
df5.drop('uuid',axis='columns',inplace=True)

In [25]:
df5=df5[['id','title','language','categories','description']]


In [26]:
# spliting the categories by | and return them as list, so categories columns will contain list of posdcast's categories
df6 = df5.copy()
df6['categories']=df6.categories.apply(lambda val: val.split('|'))

#For every row in the dataframe, iterate through the list of category and place a 1 into the corresponding column
for index, row in df6.iterrows():
    for cat in row['categories']:
        df6.at[index, cat.strip()] = 1
#Filling in the NaN values with 0 to show that a podcast doesn't have that column's category
df6 = df6.fillna(0)
df6=df6.drop('categories',axis=1)
df6.head()

Unnamed: 0,id,title,language,description,technology,tech news,podcasting,business news,business,gadgets,...,language courses,sexuality,educational technology,automotive,amateur,fashion,beauty,aviation,islam,regional
0,1,"Ecommerce Conversations, by Practical Ecommerce",English,Listen in as the Practical Ecommerce editorial...,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Eat Sleep Code Podcast,English,On the show we’ll be talking to passionate peo...,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,SoundtrackAlley,English,A podcast about soundtracks and movies from my...,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,The Tech M&A Podcast,English,The Tech M&A Podcast pulls from the best of th...,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,"The Tech Informist - For fans of Apple, Google...",English,The tech news show with two guys shooting the ...,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Content-Based recommendation system

In [27]:
#building recommendation system
# Let's begin by creating an input user to recommend podcasts to
user_data = [
    { 'title':'Faith Covenant Church Podcast','rating':4},
    { 'title':'What We Do','rating':2},
    { 'title':'Stark Reflections on Writing and Publishing','rating':5},
]
input_podcasts=pd.DataFrame(user_data)
input_podcasts

Unnamed: 0,title,rating
0,Faith Covenant Church Podcast,4
1,What We Do,2
2,Stark Reflections on Writing and Publishing,5


In [28]:
#Filtering out the podcasts from the input
#We're going to start by learning the input's preferences, 
#so let's get the subset of pods that the input has watched from the Dataframe 
#containing categories defined with binary values.
userPodcasts = df6[df6['title'].isin(input_podcasts['title'].tolist())]
userPodcasts


Unnamed: 0,id,title,language,description,technology,tech news,podcasting,business news,business,gadgets,...,language courses,sexuality,educational technology,automotive,amateur,fashion,beauty,aviation,islam,regional
52105,40813,What We Do,English,"""It isn't what we say or think that defines us...",0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
90611,70543,Stark Reflections on Writing and Publishing,English,Perspectives and reflections on the writing an...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103606,80563,Faith Covenant Church Podcast,English,"Faith Covenant Church in St. Petersburg, FL. ...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#We'll only need the actual categories table, so let's clean this up a bit 
#by  dropping the 'id','title','language','description' columns.
userPodcasts.drop(['id','title','language','description'],axis='columns',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  userPodcasts.drop(['id','title','language','description'],axis='columns',inplace=True)


In [30]:
#Dot produt to get weights
userProfile = userPodcasts.transpose().dot(input_podcasts['rating'].values)
#The user profile
userProfile

technology       0.0
tech news        0.0
podcasting       0.0
business news    0.0
business         4.0
                ... 
fashion          4.0
beauty           4.0
aviation         0.0
islam            0.0
regional         0.0
Length: 78, dtype: float64

In [31]:
# Now, we have the weights for every of the user's 
# preferences. This is known as the User Profile. 
# Using this, we can recommend podcasts that satisfy the user's preferences.
#Now let's get the categories of every podcast in our original dataframe
#And drop the unnecessary information
podcasts_categories=df6.set_index(df6['id'])
podcasts_categories = podcasts_categories[~podcasts_categories['title'].isin(input_podcasts['title'].tolist()) ]
podcasts_categories=df6.drop(['id','title','description','language'],axis='columns')
podcasts_categories.head()

Unnamed: 0,technology,tech news,podcasting,business news,business,gadgets,management,marketing,news,politics,...,language courses,sexuality,educational technology,automotive,amateur,fashion,beauty,aviation,islam,regional
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# With the input's profile and the complete list of pods 
# and their categories in hand, we're going to take the weighted 
# average of every pod based on the input profile and recommend the top twenty  that most satisfy it.
recommendationTable_df = ((podcasts_categories*userProfile).sum(axis=1))/(userProfile.sum())


In [33]:
#Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
#Just a peek at the values
recommendationTable_df.head()

43322     0.698113
112695    0.698113
87241     0.698113
67790     0.679245
76415     0.660377
dtype: float64

# Now here's the top 20 podcast recommendations

In [34]:
#The final recommendation table
recommendation_table = df5.loc[df5['id'].isin(recommendationTable_df.head(20).keys())]
recommendation_table

Unnamed: 0,id,title,language,categories,description
29187,22975,The B.A.R. Podcast,English,religion | spirituality | christianity,The B. A. R. (Biblical and Reformed) podcast p...
29462,23124,Financial Education,English,business,My name is Jeremy and I teach and talk about S...
34972,27431,Hornet Squadron Radio,English,games | hobbies,"Hornet Squadron Radio- Charlotte, NC / X-Wing ..."
43515,34131,U Up?,English,society | culture | comedy | tv | film,"Insightful, obnoxious, and borderline offensiv..."
47359,37376,12 O'Clock High-a Podcast on Business Leadership,English,business | business news | management | marketing,Leadership for business professionals
48059,38043,KWHI.com,English,news | politics,"KWHI 1280 Brenham, Texas"
54925,43322,Handsome Boys Comics Hour,English,arts | visual arts,"One writer, one artist, one weekly discussion ..."
55574,43937,Project Censored,English,news | politics,Mickey Huff is co-host of the Project Censored...
58611,45950,On the Same Page,English,education,Join the Marian University Writing Center as w...
62810,49500,Mindset for Success with Jimmy Petruzzi,English,self-help | health,NLP Centre of Excellence is recognised worldwi...
