In [1]:
import pandas as pd
import numpy as np
import os
import glob
import nltk.data
from __future__ import division  # Python 2 users only
import nltk, re, pprint
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
%matplotlib inline

In [2]:
translations = glob.glob('/Users/sheldon/completed_podcasts/*/*.txt')

In [3]:
translations = filter(lambda x: 'DONE' not in x, translations)
translations = filter(lambda x: 'speech_notebook' not in x, translations)

In [4]:
translations

['/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens040516_cms591992_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens040716_cms592303_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens041216_cms593915_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens041916_cms596223_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens042616_cms598229_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens050316_cms600651_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens051016_cms602605_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens051716_cms619373_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/2DopeQueens/dopequeens052416_cms621397_pod.mp3_translation.txt',
 '/Users/sheldon/completed_podcasts/CriminalShow/Criminal.Ep14.FifthSuspectFINAL.mp3_transl

In [5]:
episode = [i.split('/')[5] for i in translations]
series = [i.split('/')[4] for i in translations]
locations = translations
transcribed = [open(i).read() for i in translations]

In [6]:
df = pd.DataFrame(data={'episode':episode,'series':series,'locations':locations,'transcribed':transcribed})

In [7]:
df['id'] = df.index

In [132]:
stop = set(stopwords.words('english'))

def tokenize_and_lower(textfile):
    tokens = word_tokenize(textfile)
    lower = [w.lower() for w in tokens]
    filtered_words = [word for word in lower if word not in stop]
    remove_contractions = [word for word in filtered_words if "'" not in word]
    remove_periods = [word for word in remove_contractions if "." not in word]
    count = Counter(remove_periods)
    return count
    
#df['trans_token'] = df.transcribed.apply(tokenize_and_lower)
df['removed_stop_transcribed'] = df.transcribed.apply(tokenize_and_lower)
tf = TfidfVectorizer(stop_words=stop)
tfidf_matrix = tf.fit_transform(df['transcribed'])

In [133]:
tfidf_matrix

<151x35028 sparse matrix of type '<type 'numpy.float64'>'
	with 264535 stored elements in Compressed Sparse Row format>

In [134]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


In [143]:
def get_related_podcasts(podcast_number,number_of_similarities):
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
    related_pod_index = cosine_similarities.argsort()[podcast_number][::-1]
    pod_dict = dict(zip(range(0, len(related_pod_index)),related_pod_index))
    pod_dict = pd.DataFrame({'rank':pod_dict.keys()},index=pod_dict.values())
    related_podcasts_df = pd.DataFrame.join(pod_dict, df, how='inner')
    final_df = related_podcasts_df.sort_values('rank')[0:number_of_similarities+1][['rank','episode','series']]
    return final_df

def get_related_podcasts_query(query, number_of_similarities):
    query = query.lower()
    query = query.split()
    tfidf_matrix_test = tf.fit_transform(query)
    tfidf_matrix_train = tf.transform(df['transcribed'])
    tfidf_matrix_train.todense()
    tfidf_matrix_test.todense()
    query_similarities = linear_kernel(tfidf_matrix_test, tfidf_matrix_train)
    query_similarities = query_similarities.argsort()[0][::-1]
    pod_dict = dict(zip(range(0, len(query_similarities)),query_similarities))
    pod_dict = pd.DataFrame({'rank':pod_dict.keys()},index=pod_dict.values())
    related_podcasts_df = pd.DataFrame.join(pod_dict, df, how='inner')
    final_df = related_podcasts_df.sort_values('rank')[0:number_of_similarities+1][['rank','episode','series']]
    return final_df

In [144]:
get_related_podcasts_query('economics math statistics',5)

Unnamed: 0,rank,episode,series
20,0,freakonomics_mppodcast112812.mp3_translation.txt,freakonomics
53,1,4399611-4-6-16-mark-levin-audio-rewind.mp3_tra...,MLAR
22,2,freakonomics_podcast010213.mp3_translation.txt,freakonomics
21,3,freakonomics_mppodcast121212.mp3_translation.txt,freakonomics
18,4,freakonomics_mppodcast100511.mp3_translation.txt,freakonomics
27,5,freakonomics_podcast061115.mp3_translation.txt,freakonomics


In [122]:
get_related_podcasts(17,5)

Unnamed: 0,rank,episode,series
17,0,freakonomics_mppodcast071012.mp3_translation.txt,freakonomics
148,1,WS030416.mp3_translation.txt,wrestling_soup
150,2,WS091715.mp3_translation.txt,wrestling_soup
32,3,freakonomics_podcast111815.mp3_translation.txt,freakonomics
135,4,05-Tim_Ferriss_Show-Chase_Jarvis_128.mp3_trans...,thetimferrissshow
149,5,WS052616.mp3_translation.txt,wrestling_soup


## Compute for queries

In [124]:
query = ['python tim ferris']
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_test = tf.fit_transform(query)
tfid_matrix_train = tfidf_matrix.todense()
tfidf_matrix_test.todense()
cosine_similarities = linear_kernel(tfidf_matrix_test, tfidf_matrix_train)
cosine_similarities= cosine_similarities.argsort()[::-1]
cosine_similarities

array([[  0,  98,  97,  96,  95,  94,  93,  92,  91,  90,  89,  88,  87,
         86,  85,  83,  82,  81,  80,  79,  78,  77,  76, 149,  74,  73,
         72,  71,  70,  69,  99, 100, 101, 102, 146, 145, 144, 143, 142,
        138, 133, 132, 131, 129, 128, 127, 126, 125,  68, 124, 122, 120,
        117, 115, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 123,
         67,  75,  65,  30,  29,  28,  27,  26,  66,  24,  23,  22,  21,
         20,  19,  18,  17,  16,  15,  14,  13,  12,  11,  10,   9,   8,
          7,   6,   5,   4,   2,   1,  33,  34,  25,  36,  57,  58,  49,
         59,  60,  48,  61,  47,  53,  46,  56,  44,  43,  64,  41,  40,
         39,  38,  37,  63,  62, 141, 134, 147, 148, 130,   3,  52,  54,
         32,  31,  42,  45, 113, 114, 116, 118,  55, 150,  84,  50,  51,
         35, 119, 136, 135, 140, 137, 121, 139]])