In [355]:
import json
from elasticsearch import Elasticsearch
import elasticsearch
from IPython.display import HTML, display
from pathlib import Path
import pandas as pd
import re
import random
import time

PATH = str(Path.cwd().resolve())

def read_csv(path):
    return pd.read_csv(path)

def print_files(path=PATH):
    from os import listdir
    from os.path import isfile, join
    onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]

class GenerateFromList:
    def __init__(self, l):
        self.counter = 0
        self.arr = l
    def __iter__(self):
        return self
    def __next__(self):
        value = self.arr[self.counter]
        if self.counter > len(self.arr)-1:
            raise StopIteration
        else:
            self.counter += 1
            return value

In [43]:
# GET INPUT DATA
input_data_mentors = read_csv(PATH + "/data/input/mentors.csv")
input_data_startups = read_csv(PATH + "/data/input/startups.csv")

In [44]:
input_data_startups.keys(), input_data_mentors.keys()

(Index(['acronym', 'gmt', 'description', 'social_problem', 'exact_problem'], dtype='object'),
 Index(['mentor_id', 'gmt', 'first_pref', 'second_pref', 'third_pref',
        'fourth_pref', 'fifth_pref', 'sixth_pref', 'seventh_pref',
        'eighth_pref', 'skills', 'social_problems', 'personal_background'],
       dtype='object'))

In [45]:
input_data_mentors.head(2)

Unnamed: 0,mentor_id,gmt,first_pref,second_pref,third_pref,fourth_pref,fifth_pref,sixth_pref,seventh_pref,eighth_pref,skills,social_problems,personal_background
0,1,0.0,HD,BF,VWMC,,,,,,"Considering my professional experience, I can ...",Sustainability and public policies. I really b...,"First of all, I will just introduce myself. I'..."
1,2,0.0,PIS,BF,KCC,,,,,,"Finance, Project Management, Business Partneri...","Women, Poverty alleviation, Empowering small e...",Motivation - Help facilitate the empowering of...


In [46]:
input_data_startups.head(2)

Unnamed: 0,acronym,gmt,description,social_problem,exact_problem
0,BF,-4,Pruda Integrated Solutions Int'l Ltd is a heal...,Bridging the gap in quality health solutions b...,Need to adopt a sustainable financing model to...
1,HD,0,Reduce St. Lucia s overall food-import bill by...,Helen's Daughters seeks to reduce St. Lucia's ...,The concept of social enterprise in the Easter...


### Load into Elasticsearch


In [47]:
# Connect elasticsearch
# ! curl -XGET host.docker.internal:9200
def connect_elasticsearch(protocol='http', host='host.docker.internal', port=9200):
    _es = None
    _es = Elasticsearch(f"{protocol}://{host}:{port}")
    if _es.ping():
        _es.info(pretty=True)
        print('Elasticsearch connected')
    else:
        print('Elasticsearch NOT connected')
    return _es

print(elasticsearch.__version__)

es = connect_elasticsearch()

(8, 2, 0)
Elasticsearch connected


In [48]:
mentors_mapping = {
    "index": "mentors",
    "mappings": {
        "properties": {
            "social_problems": {
                "type": "text"
            },
            "personal_background": {
                "type": "text"
            },
            "skills": {
                "type": "text"
            },
            "mentor_id": {
                "type": "text"
            }
        }
    } 
}

startups_mapping = {
    "index": "startups",
    "mappings": {
        "properties": {
            "social_problem": {
                "type": "text"
            },
            "exact_problem": {
                "type": "text"
            },
            "description": {
                "type": "text"
            },
            "acronym": {
                "type": "text"
            }
        }
    }
}

indices = [
    {
        "name": "startups",
        "data": input_data_startups,
        "mapping": startups_mapping
    },
    {
        "name": "mentors",
        "data": input_data_mentors,
        "mapping": mentors_mapping
    },  
]

# Delete and create indices
for ind in indices:
    es.indices.delete(index=ind['name'], ignore=[400, 404])
    es.indices.create(**ind['mapping'])

  es.indices.delete(index=ind['name'], ignore=[400, 404])


In [49]:
def store_record(name, record):
    es.index(
        index=name,
        document=record
    )

# Load startups data into elasticsearch
for _, record in indices[0]['data'].iterrows():
    store_record('startups', record.to_dict())

# Load all data into elasticsearch
# for _, datum in ind['data'].iterrows():
#     print(ind['name'], datum.to_dict())
#     es.index(
#         index=ind['name'],
#         document=datum.to_dict()
#     )

# Matching Engine

In [103]:
matching_criteria = [
    {
        "startups": "description",
        "mentors": "personal_background"
    },
    {
        "startups": "social_problem",
        "mentors": "social_problems"
    },
    {
        "startups": "exact_problem",
        "mentors": "skills"
    }
]

## TF-IDF

In [None]:
Yes, it returns you a tf - term frequency (you had both term frequency for this field, and ttf - which is total term frequency, e.g. sum of all tf's across all fields) and df - document frequency (you also had it in the response). You need to decide which tf-idf you want to calculate across only your field, or all fields. To compute tf-idf you need to do the following:

N = doc_count
idf = log (N / df)
tf_idf = tf * idf

## BM25

In [79]:
# Example of single query without tuning

# Source: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html
# Resources:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-boosting-query.html
# https://www.elastic.co/guide/en/app-search/current/relevance-tuning-guide.html?baymax=rec&rogue=rec-1&elektra=guide

matching_startups = es.search(index="startups", body={
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "social_problem": {
              "query": "Sustainability and public policies. I really believe that we need to improve our process, independent of which sector the company is focused. We have a huge opportunity to do more using less, amplifying the quality of the services and how we interact with our natural capital."
            }
          }
        },
        {
          "match": {
            "exact_problem": "Considering my professional experience, I can contribute with any public interactions that the startups have to deal, such as licensing, taxation or public funding. Also, if the area of the company underpins with participatory actions (policies, development, social business), I believe that my backgrounds can help a new point of view about their practices. Furthermore, my specialisation in business management allows me to give advice in any area related to strategy, basic finances and human resources management, canvas, business plan, etc. To finish, the development of my Masters already permits me support any company that works with sustainability, energy and related areas."
          }
        },
        {
          "match": {
            "description": "First of all, I will just introduce myself. I'm Brazilian, 26 years old and doing an MSc in Economics, Policy of Energy and the Environment at UCL. I'm also a Chevening scholar , a programme created by the UK Government to allow leaders from developing countries doing Masters in the UK with the focus of fostering the growth of their areas after concluding the studies. \r\n\r\nMy backgrounds are in Public Administration, in which I have a bachelor, and Business Management, in which I have a specialisation. As a specialist in public policies and governmental management, I have worked for 4 years in my state government. Considering that my career was created to be a generalist expert in public policies, I had the opportunity to work in many areas, such as education, culture, economic development and participatory policies. In addition, I was volunteer in AIESEC (a platform to develop young leadership through exchanges) which helped me to improve my skills as a manager. I believe that I have the potential to help the Bridges for Enterprises with my past experience and knowledge. Although my professional background is in the public sector, I had the possibility to work with many companies in the private sector, and a fast experience in the startup programme of my state (SEED-MG). My willingness is to help the companies to foster their development, adding my expertise, and also learning with them new models of business."
          }
        }
      ]
    }
  }
}, size=10)

pd.DataFrame(dict(matching_startups)['hits']['hits']).head(2)

  matching_startups = es.search(index="startups", body={


Unnamed: 0,_index,_id,_score,_source
0,startups,VmHRloEBzbWb8Ney4FP8,55.850548,"{'acronym': 'PEA', 'gmt': -4, 'description': '..."
1,startups,U2HRloEBzbWb8Ney4FPl,54.982094,"{'acronym': 'HD', 'gmt': 0, 'description': 'Re..."


In [484]:
def generate_tf_idf_matches(query_index_data, query_index_name, index_name, size=10, matching_criteria=matching_criteria):
    shouldQuery = []
    
    for criterion in matching_criteria:
        shouldQuery.append({
            "match": {
                criterion[index_name]: {
                    "query": query_index_data[criterion[query_index_name]]
                }
            }
        })
    return { "bool": { "should": shouldQuery } }

def startups_for_mentor(mentor):
    return generate_matches(query_index_data=mentor, query_index_name='mentors', index_name="startups")

def get_matching_startups_list(query):
    return es.search(index="startups", body={ "query": query })

In [485]:
startups_queries = []
for _, mentor in input_data_mentors.iterrows():
    startups_queries.append((mentor['mentor_id'], startups_for_mentor(mentor)))
iterator = GenerateFromList(startups_queries)

In [486]:
mentor = iterator.__next__()
matching_startups = get_matching_startups_list(mentor[1])
hits = pd.DataFrame(dict(matching_startups)['hits']['hits'])
print("mentor_id", mentor[0])
for idx, x in enumerate(hits['_source']):
    score = hits['_score'][idx]
    print('acronym:', x['acronym'], 'score:', score)
hits.head()

mentor_id 1
acronym: PEA score: 55.850548
acronym: HD score: 54.982094
acronym: VWMC score: 48.646976
acronym: KCC score: 39.83787
acronym: LGI score: 39.393284
acronym: BF score: 34.593166
acronym: PIS score: 34.593166


  return es.search(index="startups", body={ "query": query })


Unnamed: 0,_index,_id,_score,_source
0,startups,VmHRloEBzbWb8Ney4FP8,55.850548,"{'acronym': 'PEA', 'gmt': -4, 'description': '..."
1,startups,U2HRloEBzbWb8Ney4FPl,54.982094,"{'acronym': 'HD', 'gmt': 0, 'description': 'Re..."
2,startups,WGHRloEBzbWb8Ney4VMH,48.646976,"{'acronym': 'VWMC', 'gmt': 9, 'description': '..."
3,startups,VGHRloEBzbWb8Ney4FPv,39.83787,"{'acronym': 'KCC', 'gmt': -4, 'description': '..."
4,startups,VWHRloEBzbWb8Ney4FPz,39.393284,"{'acronym': 'LGI', 'gmt': 9, 'description': 'T..."


## Vector Similarity

## Semantic search

Traditional keyword search does not consider lexical variants or conceptual matches to the user’s search phrase. If the precise wording used in the query cannot be found in the overall content, incorrect results would be provided to the user.

if we wanted to find similar documents based on something more abstract - like the meaning of a word or the style of writing? This is where Elasticsearch's dense vector field datatype, and script-score queries for vector fields come into play.

In [None]:
# FAISS - search for similar vectors, which are closest to eachother in terms of cosine similarity
# https://blog.accubits.com/vector-similarity-search-using-elasticsearch/
VECTOR_DIM = 20

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('test').getOrCreate()

ratings_from_es = spark.read.options(inferSchema=True, header=True).csv("ratings.csv")
ratings_from_es.show(5)

als = ALS(userCol="userId", itemCol="contentId", ratingCol="rating", regParam=0.02, rank=VECTOR_DIM, seed=54)
model = als.fit(ratings_from_es)
model.userFactors.show(5)
model.itemFactors.show(5)

In [None]:
from pyspark.sql.functions import lit, current_timestamp, unix_timestamp

ver = model.uid
ts = unix_timestamp(current_timestamp())
content_vectors = model.itemFactors.select("id",\
                                         col("features").alias("model_factor"),\
                                         lit(ver).alias("model_version"),\
                                         ts.alias("model_timestamp"))
content_vectors.show(5)
user_vectors = model.userFactors.select("id",\
                                        col("features").alias("model_factor"),\
                                        lit(ver).alias("model_version"),\
                                        ts.alias("model_timestamp"))
user_vectors.show(5)

In [13]:
df_rows = [row.asDict() for row in content_vectors.collect()]
# Make into bulk query
for row in df_rows:
    search_result = es.search(index='content', query={'match': {'contentId': row['id']}})
    hits = search_result['hits']['hits']
    search_id = hits[0]['_id']
    es.update(index='content', id=search_id, doc={'model_factor': row['model_factor']})

# content_vectors.write.format("csv")
# .option("es.write.operation", "index") \
# .save("users", mode="append")


In [None]:
user_df_rows = [row.asDict() for row in user_vectors.collect()]
for row in user_df_rows:
    search_result = es.search(index='users', query={'match': {'userId': row['id']}})
    hits = search_result['hits']['hits']
#     print(row, hits)
    search_id = hits[0]['_id']
    es.update(index='users', id=search_id, doc={'model_factor': row['model_factor']})

In [14]:
from IPython.display import Image, HTML, display
    
def vector_query(query_vec, vector_field, q="*", cosine=False):
    if cosine:
        score_fn = "doc['{v}'].size() == 0 ? 0 : cosineSimilarity(params.vector, '{v}') + 1.0"
    else:
        score_fn = "doc['{v}'].size() == 0 ? 0 : sigmoid(1, Math.E, -dotProduct(params.vector, '{v}'))"
       
    score_fn = score_fn.format(v=vector_field, fn=score_fn)
    
    return {
    "query": {
        "script_score": {
            "query" : { 
                "query_string": {
                    "query": q
                }
            },
            "script": {
                "source": score_fn,
                "params": {
                    "vector": query_vec
                }
            }
        }
    }
}


def get_content_recommendations(content_id, q="*", num=10, index="content", vector_field='model_factor'):
    response = es.get(index=index, id=content_id)
    src = response['_source']
    if vector_field in src:
        query_vec = src[vector_field]
        q = vector_query(query_vec, vector_field, q=q, cosine=True)
        results = es.search(index=index, body=q)
        hits = results['hits']['hits']
        return src, hits[1:num+1]

def get_collaborative_recommendations(user_id, q="*", num=10, users="users", content="content", vector_field='model_factor'):
    response = es.get(index=users, id=user_id)
    src = response['_source']
    if vector_field in src:
        query_vec = src[vector_field]
        q = vector_query(query_vec, vector_field, q=q, cosine=False)
        results = es.search(index=content, body=q)
        hits = results['hits']['hits']
        return src, hits[:num]

def get_content_for_user(the_id, num=10, ratings="ratings", content="content"):
    response = es.search(index=ratings, q="userId:{}".format(the_id), size=num, sort=["rating:desc"])
    # Get content to display from database


### Test recommender model on ratings

In [33]:
       
def display_user_recs(the_id, q="*", num=10, num_last=10, users="users", content="content", ratings="ratings"):
    user, recs = get_collaborative_recommendations(the_id, q, num, users, content)
    rec_html = "<div><h1>Collaborative filtering</h1></div><table><tr>"
    rec_html += '<tr><td><h5>%s</h5></td></tr><tr>'% (user['userId'])
    i = 0
    for rec in recs:
        r_score = rec['_score']
        r_title = rec['_source']['title']
        rec_html += "<td><h5>%s</h5></td><td><h5>%2.3f</h5></td>" % (r_title, r_score)
        rec_html += "<td><h5>%s</h5></td><td><h5>%s</h5></td>" % (rec['_id'], rec['_source']['contentId'])
        i += 1
        if i % 2 == 0:
            rec_html += "</tr><tr>"
    rec_html += "</tr></table>"
    display(HTML(rec_html))
    
def display_similar(the_id, q="*", num=10, content="content"):
    content, recs = get_content_recommendations(the_id, q, num, content)
    sim_html = "<div><h1>Collaborative filtering</h1></div><table><tr>"
    sim_html += '<tr><td><h5>%s</h5></td></tr><tr>'% (content['contentId'])
    i = 0
#     print(content, recs)
    while i < len(recs):
        rec = recs[i]
#         print('rec', rec)
        r_score = rec['_score']
        r_title = rec['_source']['title']
        sim_html += "<td><h5>%s</h5></td><td><h5>%2.3f</h5></td>" % (r_title, r_score)
        sim_html += "<td><h5>%s</h5></td><td><h5>%s</h5></td>" % (rec['_id'], rec['_source']['contentId'])
#         sim_html += "<td></td><td><h5>%s</h5></td>" % (rec['_source']['categories'])
        i += 1
        if i % 2 == 0:
            sim_html += "</tr><tr>"
    sim_html += "</tr></table>"
    
    sim_html += "<div>%s</div>" % (content)
    display(HTML(sim_html))

In [None]:
# Content-based recommendations
# display_similar("c64DHIEB7HVHvMB2kNDl")

In [34]:
# Collaborative recommendations
display_user_recs("VMpcHIEBpvDZv_h5-acx")

  results = es.search(index=content, body=q)


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
10,,,,,,,
"Addiction Anxiety, Anger or Stress Anxiety, Anger or Stress Debt or Looking",0.997,rMpSHIEBpvDZv_h5xYKg,1188.0,Victim Or Involved In Crime Relationship or Bereavement Relationship or Bereavement Addiction Addiction Suicide,0.997,YcpSHIEBpvDZv_h554jC,4232.0
"Relationship Or Bereavement Parenting Body image or Eating disorders Anxiety, Anger or Stress Relationship or",0.995,9spSHIEBpvDZv_h53Ybg,1021.0,Sleeping Problems Debt or Looking for Employment Victim or Involved in Crime Debt or Looking for Employment Victim or Involved in Crime Addiction,0.995,gcpSHIEBpvDZv_h5z4SZ,45668.0
"Sleeping Problems Low, Depressed or Lonely Low, Depressed or Lonely LGBTQ+ or Gender identity Victim or Involved",0.994,PspTHIEBpvDZv_h5Eo5v,4069.0,Victim Or Involved In Crime Body image,0.994,z8pTHIEBpvDZv_h5Tpje,2919.0
Parenting Childhood abuse,0.994,yMpTHIEBpvDZv_h5EI0R,6385.0,Lgbtq+ Or Gender Identity Debt or Looking for Employment,0.994,58pSHIEBpvDZv_h50oSW,71579.0
Addiction Victim or Involved in Crime Childhood abuse or Bullying Addiction Sleeping problems Suicide,0.993,x8pSHIEBpvDZv_h50YTZ,8869.0,Lgbtq+ Or Gender Identity Victim or Involved in Crime LGBTQ+,0.993,f8pSHIEBpvDZv_h5xIK5,222.0
,,,,,,,


In [None]:
# search_after
# res = es.search(index='content', query={"match_all": {}}, size=999)
# res["hits"]["hits"]

## Evaluation

In [None]:
# # Build the recommendation model using ALS on the training data
# # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# als = ALS(maxIter=2, regParam=0.01, 
#           userCol="user_id", itemCol="primary_video_id", ratingCol="watching_percentage",
#           coldStartStrategy="drop",
#           implicitPrefs=False) #changed param!
# model = als.fit(training)

# # Evaluate the model by computing the RMSE on the test data
# predictions = model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="watching_percentage",
#                                 predictionCol="prediction")

# rmse = evaluator.evaluate(predictions)
# print("Root-mean-square error = " + str(rmse))

In [None]:
# GET REFERENCE DATA
reference_data_mentors = read_csv(PATH + "/data/reference/reference_mentors.csv")
reference_data_mentors_2 = read_csv(PATH + "/data/reference/reference_mentors_2.csv")

reference_data_startups = read_csv(PATH + "/data/reference/reference_startups.csv")
reference_data_startups_2 = read_csv(PATH + "/data/reference/reference_startups_2.csv")