In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from imdb import IMDb

In [2]:
# Get data frame with data from movies
df = pd.read_csv('movies.csv', sep=',')

In [3]:
# Save only western movies to new dataframe
western_movies = df.loc[df['genres'] == 'Western']

In [4]:
# Get movie IDs
western_titles = western_movies.as_matrix(columns=df.columns[1:2])

In [5]:
# Search movies with IMDBpy crawling tool
ia = IMDb()
movieIDs = []
for title in western_titles:
    movies = ia.search_movie(title.item(0))
    if(len(movies) > 0):
        movieIDs.append(movies[0].movieID)

In [6]:
es = Elasticsearch()

In [7]:
es.indices.create(index="movie-index", ignore=400)

{u'error': {u'index': u'movie-index',
  u'index_uuid': u'WkykToMrT9GaNj7sqywbRw',
  u'reason': u'index [movie-index/WkykToMrT9GaNj7sqywbRw] already exists',
  u'root_cause': [{u'index': u'movie-index',
    u'index_uuid': u'WkykToMrT9GaNj7sqywbRw',
    u'reason': u'index [movie-index/WkykToMrT9GaNj7sqywbRw] already exists',
    u'type': u'index_already_exists_exception'}],
  u'type': u'index_already_exists_exception'},
 u'status': 400}

In [8]:
# Index all the movies in ElasticSearch
ia = IMDb()
for i in range(1, len(movieIDs)):
    movie = ia.get_movie(movieIDs[i])
    es.index(index='movie-index', doc_type='movie', body={
        'id': i,
        'title': movie.get('title'),
        'plot': movie.get('plot outline')
    })
        

In [9]:
# Query test
es.search(index='movie-index', body={"query": {"match": {'title':'Dakota'}}})

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'AVymQpthE-HKshLZMdsI',
    u'_index': u'movie-index',
    u'_score': 5.6424317,
    u'_source': {u'id': 31,
     u'plot': u'In 1871 Dakota, two crooked businessmen oppose the local wheat farmers and the railroad development, in order to control the town of Fargo.',
     u'title': u'Dakota'},
    u'_type': u'title'},
   {u'_id': u'AVymY4vEE-HKshLZMdvd',
    u'_index': u'movie-index',
    u'_score': 5.6424317,
    u'_source': {u'id': 31,
     u'plot': u'In 1871 Dakota, two crooked businessmen oppose the local wheat farmers and the railroad development, in order to control the town of Fargo.',
     u'title': u'Dakota'},
    u'_type': u'movie'}],
  u'max_score': 5.6424317,
  u'total': 2},
 u'timed_out': False,
 u'took': 2}

In [10]:
# Show first three items with best score
es.search(index='movie-index', body={"query": {"fuzzy" : { "title" : "bil"}}}, size=3)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'AVymaX5CE-HKshLZMdx4',
    u'_index': u'movie-index',
    u'_score': 2.6259067,
    u'_source': {u'id': 186,
     u'plot': u'The story of William "Buffalo Bill" Cody, legendary westerner, from his days as an army scout to his later activities as owner of a Wild West show.',
     u'title': u'Buffalo Bill'},
    u'_type': u'movie'},
   {u'_id': u'AVymY_bKE-HKshLZMdvp',
    u'_index': u'movie-index',
    u'_score': 2.5822492,
    u'_source': {u'id': 43,
     u'plot': u"In 1909, when John Fain's gang kidnaps Big Jake McCandles' grandson and hold him for ransom, Big Jake sets out to rescue the boy.",
     u'title': u'Big Jake'},
    u'_type': u'movie'},
   {u'_id': u'AVymQyKmE-HKshLZMdsU',
    u'_index': u'movie-index',
    u'_score': 2.5733304,
    u'_source': {u'id': 43,
     u'plot': u"In 1909, when John Fain's gang kidnaps Big Jake McCandles' grandson and hold him for ransom, Big Jake sets out to