In [98]:
import os
import re
import warnings

import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

In [99]:
# Empty table carrying the output schema: one row per film with its title,
# the year of the list it was scraped from, and the id looked up for it.
df = pd.DataFrame(data=None, index=None, columns=['Title', 'Year', 'Id'])

In [100]:
def get_movies_by_year(year):
    """Return the film titles scraped from Wikipedia's "List of Hindi films of <year>" page.

    Two page layouts are handled, selected by ``year``:

    * ``year > 2016``: titles are read from every ``<td>`` whose inline style
      contains the exact rule ``text-align:center``. The first two
      ``wikitable`` tables are skipped, and cells holding a month abbreviation
      or only digits are ignored.
    * otherwise: the title is taken from the first ``<td>`` without a
      ``rowspan`` attribute in each row. The first ``wikitable`` table is
      skipped, and cells holding only digits are ignored.

    Footnote markers such as ``[1]`` are removed and blank titles are dropped.

    Parameters
    ----------
    year : int
        Year of the list; used both to build the page URL and to pick the layout.

    Returns
    -------
    list of str
        Titles in page order. An empty list (plus a ``RuntimeWarning``) when
        the page does not answer with HTTP 200.
    """
    url = "https://en.wikipedia.org/wiki/List_of_Hindi_films_of_" + str(year)

    # The context manager releases the pooled connections once the page has
    # been downloaded; the timeout keeps a stalled connection from hanging the cell.
    with urllib3.PoolManager() as http:
        response = http.request('GET', url, timeout=30.0)

    if response.status != 200:
        # Parsing an error page would only yield an empty/garbage list, so say
        # why nothing was scraped instead of failing silently.
        warnings.warn(
            "Wikipedia returned HTTP %s for %s; no titles scraped" % (response.status, url),
            RuntimeWarning,
            stacklevel=2,
        )
        return []

    soup = BeautifulSoup(response.data, 'html.parser')
    tables = soup.find_all('table', class_='wikitable')

    if year > 2016:
        # NOTE(review): the first two wikitables are assumed not to be release
        # lists on the newer pages - confirm against a live page.
        return _titles_from_centered_cells(tables[2:])
    # NOTE(review): the first wikitable is assumed not to be a release list on
    # the older pages - confirm against a live page.
    return _titles_from_first_cells(tables[1:])


def _is_center_aligned(style):
    """Return True when an inline style string contains the exact rule ``text-align:center``."""
    if not style:
        return False
    # NOTE(review): a rule written as 'text-align: center' (with a space) is
    # not matched - confirm that this is intended.
    return 'text-align:center' in [rule.strip() for rule in style.split(';')]


def _titles_from_centered_cells(tables):
    """Collect titles from centre-aligned ``<td>`` cells (layout used for years after 2016)."""
    months = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')
    titles = []
    for table in tables:
        for row in table.find_all('tr'):
            for cell in row.find_all('td'):
                if not _is_center_aligned(cell.get('style', '')):
                    continue
                text = cell.get_text(strip=True)
                # Month abbreviations and all-digit cells are not titles.
                if text in months or text.isdigit():
                    continue
                title = re.sub(r'\[.*?\]', '', text)
                if title:  # skip cells that are blank once footnotes are removed
                    titles.append(title)
    return titles


def _titles_from_first_cells(tables):
    """Collect titles from the first non-``rowspan`` ``<td>`` of each row (layout used up to 2016)."""
    titles = []
    for table in tables:
        for row in table.find_all('tr'):
            cell = row.find('td', attrs={'rowspan': None})
            if cell is None:  # header rows have no <td>
                continue
            text = cell.get_text(strip=True)
            if text.isdigit():
                continue
            title = re.sub(r'\[.*?\]', '', text).replace('\n', '')
            if title:  # skip cells that are blank once footnotes are removed
                titles.append(title)
    return titles

In [101]:
def get_movie_id(movie_name, api_key=None):
    """Search the imdb8 RapidAPI endpoint for a title and return the id of the first hit.

    Parameters
    ----------
    movie_name : str
        Title to search for. ``None`` or a blank title returns ``np.nan``
        without calling the API.
    api_key : str, optional
        RapidAPI key. When omitted it is read from the ``RAPIDAPI_KEY``
        environment variable, so the secret is never stored in the notebook.

    Returns
    -------
    object
        The ``id`` of the first search edge's entity as returned by the API,
        or ``np.nan`` when the request fails, the response is not the expected
        JSON shape, or the search has no results.

    Raises
    ------
    RuntimeError
        If no API key is passed and ``RAPIDAPI_KEY`` is not set.
    """
    # Nothing to search for: skip the network round trip entirely.
    if movie_name is None or not str(movie_name).strip():
        return np.nan

    if api_key is None:
        api_key = os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        # Fail loudly: silently returning NaN for every title would hide the
        # misconfiguration until the CSV is inspected.
        raise RuntimeError(
            "RapidAPI key missing: pass api_key=... or set the RAPIDAPI_KEY environment variable"
        )

    url = "https://imdb8.p.rapidapi.com/v2/search"
    querystring = {
        "searchTerm": movie_name,
        "type": "MOVIE",
        "first": "20",
        "country": "IN",
        "language": "hi-IN",
    }
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "imdb8.p.rapidapi.com",
    }

    try:
        # Without a timeout a stalled connection would block the whole scraping loop.
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        response.raise_for_status()
        edges = response.json()["data"]["mainSearch"]["edges"]
        if not edges:
            return np.nan
        return edges[0]["node"]["entity"]["id"]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # TypeError covers null nodes inside the JSON; ValueError covers a
        # non-JSON body on requests versions whose decode error is not a
        # RequestException.
        return np.nan

In [102]:
# Scrape every year in the inclusive range [year_start, year_end] and look up
# an id for each scraped title.
year_start = 2014
year_end = 2015

# Collect one frame per year and concatenate once at the end: concatenating
# inside the loop re-copies all accumulated rows on every iteration, and
# appending to the existing `df` made re-running this cell duplicate rows.
year_frames = []
for year in range(year_start, year_end + 1):
    all_movies_by_year = get_movies_by_year(year)
    all_ids_by_year = [get_movie_id(movie) for movie in all_movies_by_year]
    year_df = pd.DataFrame({'Title': all_movies_by_year, 'Year': year, 'Id': all_ids_by_year})
    year_frames.append(year_df)

if year_frames:
    df = pd.concat(year_frames, ignore_index=True)
else:
    # Empty year range: pd.concat raises on an empty list, so fall back to an
    # empty table with the same columns.
    df = pd.DataFrame(columns=['Title', 'Year', 'Id'])

In [105]:
# Write the collected rows to disk; index=False keeps the DataFrame's row index
# out of the CSV.
df.to_csv(path_or_buf='movies_by_year.csv', index=False)