# POPULAR MOVIE MODEL

In [2]:
import os

# Make sure the IMDb dataset directory is present (no error if it already is)
imdb_dir = 'datasets/imdb'
os.makedirs(imdb_dir, exist_ok=True)

# Show what the directory currently contains
os.listdir(imdb_dir)

['title.ratings.tsv', 'title.ratings.tsv.gz']

`Download title.ratings.tsv if it does not exist; otherwise skip the download`

In [3]:
import urllib.request
import gzip
import shutil

file_path = 'datasets/imdb/title.ratings.tsv'

# Download and unpack the ratings file only when the unpacked copy is missing
if not os.path.isfile(file_path):
    url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
    archive_path = f'{file_path}.gz'
    urllib.request.urlretrieve(url, archive_path)

    # Stream the decompressed bytes into a scratch file and rename it at the end:
    # the payload is never held in memory as a whole, and an interrupted run
    # cannot leave a truncated file that the existence check above would accept.
    partial_path = f'{file_path}.part'
    with gzip.open(archive_path, 'rb') as compressed_file, open(partial_path, 'wb') as decompressed_file:
        shutil.copyfileobj(compressed_file, decompressed_file)
    os.replace(partial_path, file_path)

os.listdir('datasets/imdb')

['title.ratings.tsv', 'title.ratings.tsv.gz']

## `IMDb non-commercial datasets for personal use`
These datasets are available in gzipped tab-separated-values (TSV) format and can be accessed from [IMDb datasets](https://datasets.imdbws.com/).

1. **title.akas.tsv.gz**
2. **title.basics.tsv.gz**
3. **title.crew.tsv.gz**
4. **title.episode.tsv.gz**
5. **title.principals.tsv.gz**
6. **title.ratings.tsv.gz**
7. **name.basics.tsv.gz**

You can access and download these datasets from the provided link and use them for personal, non-commercial purposes in accordance with IMDb's terms and conditions. Make sure to verify your compliance with their non-commercial licensing and copyright/license terms.

# `Disclaimer of Warranties and Limitation of Liability`
THE IMDB SERVICES AND ALL INFORMATION, CONTENT, MATERIALS, PRODUCTS (INCLUDING SOFTWARE) AND OTHER SERVICES INCLUDED ON OR OTHERWISE MADE AVAILABLE TO YOU THROUGH THE IMDB SERVICES ARE PROVIDED BY IMDB ON AN "AS IS" AND "AS AVAILABLE" BASIS. IMDB MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, AS TO THE OPERATION OF THE IMDB SERVICES OR THE INFORMATION, CONTENT, MATERIALS, PRODUCTS (INCLUDING SOFTWARE) OR OTHER SERVICES INCLUDED ON OR OTHERWISE MADE AVAILABLE TO YOU THROUGH THE IMDB SERVICES. YOU EXPRESSLY AGREE THAT YOUR USE OF THE IMDB SERVICES IS AT YOUR SOLE RISK. IMDB RESERVES THE RIGHT TO WITHDRAW ANY IMDB SERVICE OR DELETE ANY INFORMATION FROM THE IMDB SERVICES AT ANY TIME IN ITS DISCRETION.

TO THE FULL EXTENT PERMISSIBLE BY APPLICABLE LAW, IMDB DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IMDB DOES NOT WARRANT THAT THE IMDB SERVICES, INFORMATION, CONTENT, MATERIALS, PRODUCTS (INCLUDING SOFTWARE) OR OTHER SERVICES INCLUDED ON OR OTHERWISE MADE AVAILABLE TO YOU THROUGH THE IMDB SERVICES, ITS SERVERS, OR ELECTRONIC COMMUNICATIONS SENT FROM IMDB ARE FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. IMDB WILL NOT BE LIABLE FOR ANY DAMAGES OF ANY KIND ARISING FROM THE USE OF ANY IMDB SERVICE, OR FROM ANY INFORMATION, CONTENT, MATERIALS, PRODUCTS (INCLUDING SOFTWARE) OR OTHER SERVICES INCLUDED ON OR OTHERWISE MADE AVAILABLE TO YOU THROUGH ANY IMDB SERVICE, INCLUDING, BUT NOT LIMITED TO DIRECT, INDIRECT, INCIDENTAL, PUNITIVE, AND CONSEQUENTIAL DAMAGES.

CERTAIN STATE LAWS DO NOT ALLOW LIMITATIONS ON IMPLIED WARRANTIES OR THE EXCLUSION OR LIMITATION OF CERTAIN DAMAGES. IF THESE LAWS APPLY TO YOU, SOME OR ALL OF THE ABOVE DISCLAIMERS, EXCLUSIONS, OR LIMITATIONS MAY NOT APPLY TO YOU, AND YOU MIGHT HAVE ADDITIONAL RIGHTS.

**IMDb Software Terms:** In addition to these Conditions of Use, the terms found here apply to any software (including any updates or upgrades to the software and any related documentation) that we make available to you from time to time for your use in connection with IMDb Services (“IMDb Software”). If we provide specific Terms for the IMDb Software and there is a conflict between the specific Terms for the IMDb Software and these Conditions of Use, the specific Terms for the IMDb Software will govern.

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the tab-separated ratings table and preview its first five rows
df = pd.read_csv('datasets/imdb/title.ratings.tsv', delimiter='\t')
df.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2004
1,tt0000002,5.8,269
2,tt0000003,6.5,1903
3,tt0000004,5.5,178
4,tt0000005,6.2,2685


In [6]:
# Print the number of missing values found in each column
for column, missing in df.isnull().sum().items():
    print(column, missing)

tconst 0
averageRating 0
numVotes 0


In [7]:
def buildChart(df, percentile=0.8):
    """Rank titles by the IMDb weighted-rating formula.

    The score of a title with ``v`` votes and mean rating ``R`` is
    ``v / (v + m) * R + m / (v + m) * C``, where ``C`` is the mean of
    ``averageRating`` over the whole input and ``m`` is the vote count at
    the requested quantile of ``numVotes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``averageRating`` and ``numVotes``.
        The frame itself is not modified.
    percentile : float, default 0.8
        Quantile of ``numVotes`` used as the minimum vote count ``m``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with at least ``m`` votes, with an added ``score``
        column, sorted by ``score`` in descending order.
    """
    # C: mean rating across all titles; m: minimum votes required to be listed
    C = df['averageRating'].mean()
    m = df['numVotes'].quantile(percentile)

    # Filter first, then copy, so only the qualifying rows are duplicated
    movies = df.loc[df['numVotes'] >= m].copy()

    # Same arithmetic as a per-row computation, evaluated column-wise
    votes = movies['numVotes']
    rating = movies['averageRating']
    movies['score'] = (votes / (votes + m) * rating) + (m / (m + votes) * C)

    # Best-scoring titles first
    return movies.sort_values('score', ascending=False)

**TOP 10,000 MOVIES**

In [8]:
# Score and rank the rated titles with the default vote cut-off, then preview the leaders
movies = buildChart(df)
movies.head(5)

Unnamed: 0,tconst,averageRating,numVotes,score
908608,tt2301451,10.0,207533,9.997803
1087249,tt4283088,9.9,220320,9.897999
1087251,tt4283094,9.9,156841,9.897189
908610,tt2301455,9.9,134676,9.896727
881790,tt2178784,9.9,114446,9.89615


In [9]:
# Keep the 10,000 best-scoring rows and collect their IMDb identifiers
movies = movies.iloc[:10000]
moviesID = movies['tconst'].to_list()

## [Cinemagoer](https://cinemagoer.github.io/)
`Cinemagoer` (previously known as IMDbPY) is a Python package for retrieving and managing data from the [IMDb](https://www.imdb.com/)
 movie database, including information about movies, people, and companies. 
 Please note that this project and its authors are not affiliated in any way with Internet Movie Database Inc. For details about data licenses, please refer to the [DISCLAIMER](https://raw.githubusercontent.com/cinemagoer/cinemagoer/master/DISCLAIMER.txt) and [DOCUMENTATION](https://readthedocs.org/projects/imdbpy/downloads/pdf/latest/).


In [10]:
try:
    !pip show Cinemagoer
except ImportError:
    !pip install git+https://github.com/cinemagoer/cinemagoer

Name: cinemagoer
Version: 2023.10.22
Summary: Python package to access the IMDb's database
Home-page: https://cinemagoer.github.io/
Author: Davide Alberani
Author-email: da@mimante.net
License: GPL
Location: C:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\site-packages
Requires: lxml, SQLAlchemy
Required-by: 


In [11]:
import logging
import os

from imdb import Cinemagoer

# Client object used for the IMDb lookups
imdbClient = Cinemagoer()

# Errors are written to a log file; create its directory first
os.makedirs('datasets/logs', exist_ok=True)
logging.basicConfig(
    filename='datasets/logs/movieScore.log',
    level=logging.ERROR,
)

In [12]:
# IDs already present in the output CSV
pid = set()

try:
    existing = pd.read_csv('datasets/movieScore.csv')
except FileNotFoundError:
    # No CSV yet: create it containing only the header row
    header = pd.DataFrame(columns=[
        'imdbID', 'Title', 'Genres', 'Plot', 'Directors', 'Writers',
        'Actors', 'Language', 'Country', 'Kind', 'Runtime',
    ])
    header.to_csv('datasets/movieScore.csv', index=False)
else:
    pid.update(existing['imdbID'])

`imdbID, Title, Genres, Plot, Directors, Writers, Actors, Language, Country, Kind, Runtime`

In [13]:
import threading

# Column order of datasets/movieScore.csv. Rows are appended without a header,
# so every appended row must be laid out in exactly this order.
_MOVIE_CSV_COLUMNS = ['imdbID', 'Title', 'Genres', 'Plot', 'Directors', 'Writers',
                      'Actors', 'Language', 'Country', 'Kind', 'Runtime']

# fetchData is submitted to a thread pool; appends are serialised so rows
# written by different worker threads cannot interleave in the file.
_csvWriteLock = threading.Lock()


def fetchData(imdbID):
    """Fetch one title from IMDb and append it as a row to datasets/movieScore.csv.

    Parameters
    ----------
    imdbID : str
        IMDb title identifier including its two-character prefix
        (e.g. 'tt0000001'); the prefix is stripped before the lookup.

    Returns
    -------
    None
        Nothing is returned. Any exception raised while fetching or writing
        is logged together with the ID and then swallowed, so one failing
        title does not stop the remaining ones.
    """
    try:
        movie = imdbClient.get_movie(imdbID[2:])
        if movie:
            # One-row frame; `columns=` pins the order to the CSV header so the
            # Writers and Actors values land in their own columns.
            movieData = pd.DataFrame([{
                'imdbID': imdbID,
                'Title': movie.get('title', 'N/A'),
                'Genres': ', '.join(movie.get('genres', [])),
                'Plot': ', '.join(movie.get('plot', [])),
                'Directors': ', '.join([director.get('name', '') for director in movie.get('directors', [])]),
                'Writers': ', '.join([writer.get('name', '') for writer in movie.get('writer', [])]),
                'Actors': ', '.join([actor.get('name', '') for actor in movie.get('cast', [])]),
                'Language': ', '.join(movie.get('language', [])),
                'Country': ', '.join(movie.get('country', [])),
                'Kind': movie.get('kind', 'N/A'),
                # Falls back to 'N/A' when the runtime is missing or an empty list
                'Runtime': (movie.get('runtime') or ['N/A'])[0]
            }], columns=_MOVIE_CSV_COLUMNS)

            # Append the movie data to the CSV file
            with _csvWriteLock:
                movieData.to_csv('datasets/movieScore.csv', mode='a', header=False, index=False)

    except Exception as e:
        logging.error(f'IMDB ID: {imdbID} Error: {e}')

In [14]:
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

# Titles that are not in the output CSV yet
pendingIDs = [imdbID for imdbID in moviesID if imdbID not in pid]

# Fetch the pending titles concurrently. Leaving the `with` block waits for
# every submitted task to finish, so no explicit shutdown() call is needed.
with ThreadPoolExecutor(max_workers=8) as executor:
    for imdbID in pendingIDs:
        executor.submit(fetchData, imdbID)

print('Data stored successfully !!!')

2023-11-09 11:35:22,590 CRITICAL [imdbpy] c:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt0702933/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': RemoteDisconnected('Remote end closed connection without response')},); kwds: {}
Traceback (most recent call last):
  File "c:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\site-packages\imdb\parser\http\__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
               ^^^^^^^^^^^^^^^^^
  File "c:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\urllib\request.py", line 525, in open
    response = meth(req, response)
               ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anujj\AppData\Local\Programs\Python\Python311\Lib\urllib\request.py", line 634, in http_response
    response = self.parent.error(
          

In [None]:
# Read back the collected movie details and preview the first five rows
moviesData = pd.read_csv('datasets/movieScore.csv')
moviesData.head(5)

In [None]:
# IMDb title ID for a one-off manual lookup
imdbID = 'tt28515477'

In [None]:
# Look up a single title by hand and unpack the fields of interest.
# Each value is assigned without a trailing comma (a trailing comma would wrap it
# in a 1-tuple), and imdbID is left untouched so the cell can be re-run.
movie = imdbClient.get_movie(imdbID[2:])
if movie:
    Title = movie.get('title', 'N/A')
    Genres = ', '.join(movie.get('genres', []))
    Plot = ', '.join(movie.get('plot', []))
    Directors = ', '.join([director.get('name', '') for director in movie.get('directors', [])])
    Actors = ', '.join([actor.get('name', '') for actor in movie.get('cast', [])])
    Writers = ', '.join([writer.get('name', '') for writer in movie.get('writer', [])])
    Language = ', '.join(movie.get('language', []))
    Country = ', '.join(movie.get('country', []))
    Kind = movie.get('kind', 'N/A')
    # Falls back to 'N/A' when the runtime is missing or an empty list
    Runtime = (movie.get('runtime') or ['N/A'])[0]