In [4]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
ps = PorterStemmer()
def convert_book(book):
    return ["b"+str(book)]
def convert_chapter(chapter):
    return ["chapter"+str(chapter)]
def convert_versets(verse):
    return ["verse"+str(verse)]
def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [6]:
df = pd.read_csv("../datasets/t_kjv.csv")
df['b'] = df['b'].apply(convert_book)
df['c'] = df['c'].apply(convert_chapter)
df['v'] = df['v'].apply(convert_versets)
df['t'] = df["t"].apply(lambda x: x.split())

data = pd.DataFrame()
data['labels'] = df['b'] + df['c'] + df['v']
data['tags'] = df['b'] + df['c'] + df['v'] + df['t']

data["labels"] = data["labels"].apply(lambda x: ' '.join(x))
data["tags"] = data["tags"].apply(lambda x: ' '.join(x))
data["tags"] = data["tags"].apply(lambda x: x.lower())
data["tags"] = data["tags"].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data["tags"] = data["tags"].apply(stem)

In [7]:
data.to_csv("dataa.csv")

In [6]:
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(data["tags"]).toarray()
DF = pd.DataFrame(vectors)
DF.to_csv("data.csv")

### Recommendation using cosine similarity

In [8]:
vectors = pd.read_csv("data.csv")

In [9]:
similarity = cosine_similarity(vectors)

In [26]:
key_df = pd.read_csv("../datasets/key_english.csv")
def verse_in(fullverse):
    temp = fullverse.split()
    if len(temp) == 4:
        book = " ".join(temp[0:2])
        chap = temp[2]
        verse = temp[3]
    else:
        book, chap, verse = temp
    book = "b" + str(key_df[key_df['n'] == book]["b"].index[0] + 1)
    return " ".join([book, chap, verse])

def verse_out(fullverse):
    book, chap, verse = fullverse.split()
    chap = chap.replace("chapter", "")
    verse = verse.replace("verse", "")
    book = book.replace("b", "")
    book = key_df.loc[key_df['b'] == int(book),'n'].values[0]
    return book + " " + chap + ":" + verse

def recommend(verse, data):
    verse = data[data['labels']==verse_in(verse)]
    if verse.empty:
        return False
    verse_index = verse.index[0]
    distances = similarity[verse_index]
    verse_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:100]
    verse_list = [data.iloc[i[0]].labels for i in verse_list]
    verse_list = [verse_out(i) for i in verse_list]
    return verse_list

In [28]:
recommend("1 John chapter1 verse10", data)

['1 John 1:8',
 '3 John 1:15',
 'Jude 1:22',
 'Colossians 1:14',
 '1 John 1:6',
 '1 John 1:4',
 'Galatians 1:24',
 'Galatians 1:5',
 'Ephesians 1:23',
 'Luke 24:8',
 'Romans 7:20',
 'Romans 7:17',
 'Romans 6:7',
 'Romans 3:10',
 '1 Corinthians 1:29',
 '1 Corinthians 1:15',
 'Jude 1:19',
 '1 John 5:17',
 '1 John 2:10',
 '1 John 2:4',
 '1 John 2:2',
 'James 1:16',
 'Luke 1:62',
 'Hebrews 10:18',
 'Hebrews 10:17',
 'Philemon 1:24',
 'Titus 3:10',
 '2 Timothy 2:17',
 'Philemon 1:10',
 'Philippians 1:30',
 'Philippians 1:24',
 'Philippians 1:5',
 'John 1:24',
 'Ephesians 1:16',
 'Ephesians 1:12',
 'Mark 9:40',
 'Revelation 2:28',
 '2 John 1:11',
 '3 John 1:8',
 '2 John 1:2',
 'Romans 1:31',
 '1 John 3:13',
 'Romans 1:12',
 'Romans 1:6',
 '1 John 2:9',
 '1 John 1:9',
 'Acts 24:9',
 '1 Peter 2:22',
 'Hebrews 13:25',
 'Hebrews 12:29',
 'James 1:8',
 'James 1:18',
 'Hebrews 7:7',
 'Hebrews 7:10',
 'Philemon 1:23',
 'Philemon 1:2',
 'Titus 1:10',
 '1 Thessalonians 5:25',
 '1 Thessalonians 5:24',