In [4]:
import requests
import logging
import json
import os
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from pprint import pprint
import pandas as pd

In [6]:
# Connection settings are read from the environment rather than hardcoded;
# a missing variable raises KeyError right here.
host = os.environ["AWS_ENDPOINT"]
# NOTE(review): `region` is not referenced by any other cell shown in this
# notebook -- confirm it is still needed.
region = os.environ["REGION"]

In [7]:
# Shared Elasticsearch client used by the query functions below.
# TLS is enabled and the server certificate is verified.
# NOTE(review): no credentials / http_auth are passed here, and boto3 is
# imported but unused in the cells shown -- confirm the endpoint accepts
# unsigned requests.
es = Elasticsearch(
    [host],
    connection_class=RequestsHttpConnection,
    verify_certs=True,
    use_ssl=True,
)

In [None]:
# What I am trying to do:
# Find a way to track how well our queries are doing
# Idea: plot the number of relevant results for each search

# First of all, relevancy is subjective. Relevancy for any specific
# user will vary depending on what is most important to them. The jobs
# that I find relevant will be different from, for example, the jobs that
# Robert finds relevant, because we might have different standards about
# location, different skills, etc. So even if the two of us, both
# Lambda School graduates, run the same search, we might get two
# very different counts of relevant positions.

In [8]:
def reformat(response_query):
    """
    Flatten an Elasticsearch search response into a list of job records.

    Args:
        response_query: response mapping with the hits under
            response_query["hits"]["hits"]; every hit must provide "_id"
            and a "_source" mapping holding the fields listed below.

    Returns:
        {"jobs": [...]} with one dict per hit, keeping only the id and the
        selected source fields (renamed to the output keys).

    Raises:
        KeyError: if a hit lacks "_id", "_source", or one of the source fields.
    """

    # (output key, key inside the hit's "_source") -- order fixes the key
    # order of each returned record.
    source_fields = (
        ("source_url", "post_url"),
        ("title", "title"),
        ("company", "company"),
        ("description", "description"),
        ("date_published", "publication_date"),
        ("location_city", "location_city"),
        ("location_state", "location_state"),
        ("geo_locat", "location_point"),
    )

    jobs = [
        {
            "id": hit["_id"],
            **{out_key: hit["_source"][src_key] for out_key, src_key in source_fields},
        }
        for hit in response_query["hits"]["hits"]
    ]

    logging.info(f"Reformatted {len(jobs)} returned responses")

    return {"jobs": jobs}

In [None]:
# OLD QUERY
def search_city_state(search, city, state):
    """
    Old location-aware query, kept as the baseline for the comparison.

    The search terms MUST match the description or title. City and state
    are only "should" clauses, so a posting does not HAVE to be in the
    requested location -- matching it just raises its score.

    NOTE: a later cell defines a function with this same name; whichever
    cell was run most recently is the definition in effect.

    Args:
        search: free-text search terms.
        city: city name to prefer.
        state: state name to prefer.

    Returns:
        The result of reformat() applied to the Elasticsearch response.
    """

    query = json.dumps(
        {
            "query": {
                "bool": {
                    "must": [
                        {
                            "multi_match": {
                                "query": search,
                                # Field names must be exact: the previous
                                # "description, " (stray comma and space)
                                # named no real field.
                                "fields": ["description", "title"],
                            }
                        }
                    ],
                    "should": [
                        {"match": {"location_city": city}},
                        {"match": {"location_state": state}},
                    ],
                }
            }
        }
    )
    # Use the shared client directly (same call as the newer query); the
    # `connect` helper previously called here is not defined in this notebook.
    response = es.search(index="jobs", body=query)
    reformatted = reformat(response)

    return reformatted

In [13]:
# Hand-entered labels for the old query: ten entries per city.
# `score` feeds the per-city relevance average computed later (1 counts as
# relevant); `reason` holds the label recorded next to each score
# ("n/a", "location" or "experience").
old_data = {
    "san_francisco": {
        "score": [1] + [0] * 9,
        "reason": ["n/a"] + ["location"] * 4 + ["experience"] * 5,
    },
    "seattle": {
        "score": [1] * 2 + [0] * 8,
        "reason": ["n/a"] * 2 + ["location"] * 3 + ["experience"] * 5,
    },
    "portland": {
        "score": [0] * 10,
        "reason": ["n/a"] * 10,
    },
}

In [14]:
# One frame per city (columns: score, reason) built from the old-query labels.
old_san, old_sea, old_por = (
    pd.DataFrame(old_data[city_key])
    for city_key in ("san_francisco", "seattle", "portland")
)

In [15]:
# Tag every row with a display name for its city so the frames can be
# stacked and grouped by location afterwards.
old_san = old_san.assign(location="San Francisco")
old_sea = old_sea.assign(location="Seattle")
old_por = old_por.assign(location="Portland")

In [16]:
# Stack the three per-city frames row-wise (each frame keeps its own index).
old_df = pd.concat(objs=(old_san, old_sea, old_por), axis=0)

In [15]:
# NEW QUERY

def search_city_state(search, city, state):
    """
    Location-restricted job search against the "jobs" index.

    A posting MUST match both the city and the state (each title-cased
    before matching). Among those postings, the score rises when the
    search terms appear in the description, title, or tags, and when the
    title does not contain "senior", "master", or "lead".

    Args:
        search: free-text search terms.
        city: city the posting must match.
        state: state the posting must match.

    Returns:
        The result of reformat() applied to the Elasticsearch response.
    """

    # Hard requirements: the posting has to be in the requested place.
    location_clauses = [
        {"match": {"location_city": city.title()}},
        {"match": {"location_state": state.title()}},
    ]

    # Optional clauses only influence ranking.
    keyword_clause = {
        "multi_match": {
            "query": search,
            "fields": ["description", "title", "tags"],
        }
    }
    seniority_clause = {
        "bool": {"must_not": {"match": {"title": "senior master lead"}}}
    }

    payload = {
        "query": {
            "bool": {
                "must": location_clauses,
                "should": [keyword_clause, seniority_clause],
            }
        }
    }

    return reformat(es.search(index="jobs", body=json.dumps(payload)))

In [27]:
# I want to focus only on searches in specific locations for now.
# Reasons to mark a result as NOT relevant:
    # the job doesn't have the main search terms in its description or title
    # the job is not in the specified location
    # the job requires 3+ years of experience in this specific field

In [None]:
# Parameters for one manual relevance check; the keys mirror the
# arguments of search_city_state so the dict can be unpacked into the call.
search1 = dict(search="Data engineer elasticsearch", city="Portland", state="Oregon")

pprint(search_city_state(**search1))

In [24]:
# Echo the search parameters currently held in search1.
print(*(search1[field] for field in ("search", "city", "state")))

Data engineer elasticsearch San Fransisco California


In [5]:
# Hand-entered labels for the new query: ten entries per city.
# `score` feeds the per-city relevance average computed below (1 counts as
# relevant); `reason` holds the label recorded next to each score
# ("n/a" or "experience").
new_data = {
    "san_francisco": {
        "score": [1] * 5 + [0] * 5,
        "reason": ["n/a"] * 5 + ["experience"] * 5,
    },
    "seattle": {
        "score": [0] * 2 + [1] * 8,
        "reason": ["experience"] * 2 + ["n/a"] * 8,
    },
    "portland": {
        "score": [1] * 10,
        "reason": ["n/a"] * 10,
    },
}

In [6]:
# One frame per city (columns: score, reason) built from the new-query labels.
new_san, new_sea, new_por = (
    pd.DataFrame(new_data[city_key])
    for city_key in ("san_francisco", "seattle", "portland")
)

In [7]:
# Tag every row with a display name for its city so the frames can be
# stacked and grouped by location afterwards.
new_san = new_san.assign(location="San Francisco")
new_sea = new_sea.assign(location="Seattle")
new_por = new_por.assign(location="Portland")

In [8]:
# Stack the three per-city frames row-wise (each frame keeps its own index).
new_df = pd.concat(objs=(new_san, new_sea, new_por), axis=0)

In [9]:
# Fraction (0-1) of the labelled results that are relevant, per city,
# for the generic search "Data engineer elasticsearch"

# numeric_only=True averages just `score`. Without it, pandas >= 2.0 raises
# a TypeError when it tries to average the string `reason` column (older
# versions silently dropped that column).
new_queries = new_df.groupby("location").mean(numeric_only=True)
new_queries

Unnamed: 0_level_0,score
location,Unnamed: 1_level_1
Portland,1.0
San Francisco,0.5
Seattle,0.8


In [11]:
# Unweighted mean of the per-city relevance fractions, as a rounded percentage.
new_accuracy_pct = (new_queries["score"].mean() * 100).round()
print(f"The new queries have an accuracy score of {new_accuracy_pct}%")

The new queries have an accuracy score of 77.0%


In [17]:
# Fraction (0-1) of the labelled results that are relevant, per city,
# for the old query.
# numeric_only=True averages just `score`. Without it, pandas >= 2.0 raises
# a TypeError when it tries to average the string `reason` column (older
# versions silently dropped that column).
old_queries = old_df.groupby("location").mean(numeric_only=True)
old_queries

Unnamed: 0_level_0,score
location,Unnamed: 1_level_1
Portland,0.0
San Francisco,0.1
Seattle,0.2


In [19]:
# Unweighted mean of the per-city relevance fractions, as a rounded percentage.
old_accuracy_pct = (old_queries["score"].mean() * 100).round()
print(f"The old queries have an accuracy score of {old_accuracy_pct}%")

The old queries have an accuracy score of 10.0%


In [20]:
# Label each summary with the query version it came from before combining:
# RC is 1 for the new query and 2 for the old one.
# NOTE(review): what "RC" abbreviates is not stated in this notebook -- confirm.
new_queries = new_queries.assign(RC=1)
old_queries = old_queries.assign(RC=2)

In [22]:
# Combined per-city summary: new-query rows first, then old-query rows.
total = pd.concat(objs=(new_queries, old_queries), axis=0)

In [46]:
# Show the combined per-city scores for both query versions.
total

Unnamed: 0_level_0,score,RC
location,Unnamed: 1_level_1,Unnamed: 2_level_1
Portland,1.0,1
San Francisco,0.5,1
Seattle,0.8,1
Portland,0.0,2
San Francisco,0.1,2
Seattle,0.2,2


In [None]:
# Toy three-bar chart built from literal data; it does not use the query
# results above.
# NOTE(review): looks like a plotting placeholder -- confirm whether this
# should plot `total` instead.
df = pd.DataFrame({"lab": list("ABC"), "val": [10, 30, 20]})

ax = df.plot(kind="bar", x="lab", y="val", rot=0)

In [41]:
# Re-display the combined per-city scores (same frame as shown earlier).
total

Unnamed: 0_level_0,score,RC
location,Unnamed: 1_level_1,Unnamed: 2_level_1
Portland,1.0,1
San Francisco,0.5,1
Seattle,0.8,1
Portland,0.0,2
San Francisco,0.1,2
Seattle,0.2,2
