## Fetch Hindi Movies From Wekipidia 

In [2]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import Timeout, RequestException, ConnectionError
import numpy as np
from functools import lru_cache
from tmdbv3api import TMDb, Movie
from concurrent.futures import ThreadPoolExecutor, as_completed

tmdb = TMDb()
tmdb.api_key = '6d8bfe0dbef34d25e64f64cabec93e20'
tmdb_movie = Movie()

# Using LRU cache to store results of previously fetched movie genres
@lru_cache(maxsize=1000)
def fetch_movie_genres(movie_id):
    try:
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb.api_key}')
        if response.status_code != 200:
            return np.nan
        
        data_json = response.json()
        if 'genres' in data_json and data_json['genres']:
            return " ".join([genre['name'] for genre in data_json['genres']])
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching genres for movie ID {movie_id}: {e}")
        return np.nan

def get_genres(x):
    try:
        result = tmdb_movie.search(x)
        if not result:
            return np.nan
        
        movie_id = result[0].id
        return fetch_movie_genres(movie_id)
    except Exception as e:
        print(f"Error fetching genres for {x}: {e}")
        return np.nan

# Setup retry strategy
retry_strategy = Retry(
    total=5,  # Total number of retries
    backoff_factor=2,  # Exponential backoff factor
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    method_whitelist=["HEAD", "GET", "OPTIONS"]  # Retry only on these methods
)

# Create an HTTP adapter with the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)

# Create a session and mount the adapter
session = requests.Session()
session.mount("https://", adapter)
session.mount("http://", adapter)

def get(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if actor_list[0]:
            return actor_list[0].strip()
    return None

def get2(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if len(actor_list) >= 2:
            return actor_list[1].strip()
        else:
            return np.nan
    return np.nan

def get3(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if len(actor_list) >= 3:
            return actor_list[2].strip()
        else:
            return np.nan
    return np.nan

def fetch_html(url):
    try:
        response = session.get(url, timeout=20)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.content
    except Timeout:
        print(f"Request timed out for URL {url}.")
        return None
    except ConnectionError as e:
        print(f"Connection error for URL {url}: {e}")
        return None
    except RequestException as e:
        print(f"Request error fetching data from URL {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error fetching data from URL {url}: {e}")
        return None

def create_df(url):
    # Fetch HTML content from the URL
    html_content = fetch_html(url)

    # Read HTML data using pandas if content is fetched successfully
    if html_content:
        try:
            # Reading multiple tables on the page
            tables = pd.read_html(html_content, header=0)
        
            # Extract specific tables
            df1 = tables[2]
            df2 = tables[3]
            df3 = tables[4]
            df4 = tables[5]
        
        except Exception as e:
            print(f"Error reading HTML content: {e}")
    else:
        print("Failed to fetch HTML content.")
    
    movies = pd.concat([df1, df2, df3, df4], axis=0).reset_index()

    movies = movies[['Title', 'Director', 'Cast']]
    movies['actor_1_name'] = movies['Cast'].apply(lambda x: get(x))
    movies['actor_2_name'] = movies['Cast'].apply(lambda x: get2(x))
    movies['actor_3_name'] = movies['Cast'].apply(lambda x: get3(x))
    movies = movies.rename(columns={'Title': 'movie_title', "Director": "director_name"})

    movies.drop('Cast', inplace=True, axis=1)
    movies['movie_title'] = movies['movie_title'].str.lower()

    movies['comb'] = movies['actor_1_name'] + " " + movies['actor_2_name'] + " " + movies['actor_3_name'] + " " + movies['director_name']
    
    # Parallelize genre fetching
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_title = {executor.submit(get_genres, title): title for title in movies['movie_title']}
        for future in as_completed(future_to_title):
            title = future_to_title[future]
            try:
                genres = future.result()
                movies.loc[movies['movie_title'] == title, 'genres'] = genres
            except Exception as e:
                print(f"Error fetching genres for {title}: {e}")
    
    return movies




  retry_strategy = Retry(


In [6]:
data_2018_movies = create_df('https://en.wikipedia.org/wiki/List_of_Hindi_films_of_2018')

  movies.loc[movies['movie_title'] == title, 'genres'] = genres


Error fetching genres for nan: quote_from_bytes() expected bytes


In [16]:
data_2018_movies.duplicated().sum()

0

In [17]:
data_2018_movies.drop("comb",axis=1,inplace=True)

In [18]:
data_2018_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,1921,Vikram Bhatt,Zareen Khan,Karan Kundra,Tobby Hinston,History Drama War
1,kaalakaandi,Akshat Verma,Saif Ali Khan,Isha Talwar,Shenaz Treasury,Comedy Thriller Drama
2,mukkabaaz,Anurag Kashyap,Vineet Kumar Singh,Zoya Hussain,Ravi Kishan,Drama
3,phir se...,Kunal Kohli,Jennifer Winget,Kunal Kohli,Rajit Kapur,Comedy Drama
4,my birthday song,Samir Soni,Sanjay Suri,Nora Fatehi,Pitobash,Thriller
...,...,...,...,...,...,...
99,rajma chawal,Leena Yadav,Rishi Kapoor,Anirudh Tanwar,Amyra Dastur,Comedy Drama Family
100,kedarnath,Abhishek Kapoor,Sushant Singh Rajput,Sara Ali Khan,,Romance
101,ascharyachakit!,Samit Kakkad,Priyanka Bose,Vaibhav Raj Gupta,Ankit Raaj,Thriller Drama
102,zero,Aanand L. Rai,Shah Rukh Khan,Katrina Kaif,Anushka Sharma,Animation Drama


In [19]:
data_2018_movies.isnull().sum()

movie_title       1
director_name     1
actor_1_name      2
actor_2_name      5
actor_3_name     21
genres            4
dtype: int64

In [20]:
len(data_2018_movies)

104

In [21]:
data_2018_movies.dropna(inplace=True)

In [23]:
data_2018_movies['comb'] = data_2018_movies['actor_1_name'] + " " + data_2018_movies['actor_2_name'] + " " + data_2018_movies['actor_3_name'] + " " + data_2018_movies['director_name']

In [24]:
data_2018_movies.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
comb             0
dtype: int64

In [25]:
data_2018_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,1921,Vikram Bhatt,Zareen Khan,Karan Kundra,Tobby Hinston,History Drama War,Zareen Khan Karan Kundra Tobby Hinston Vikram ...
1,kaalakaandi,Akshat Verma,Saif Ali Khan,Isha Talwar,Shenaz Treasury,Comedy Thriller Drama,Saif Ali Khan Isha Talwar Shenaz Treasury Aksh...
2,mukkabaaz,Anurag Kashyap,Vineet Kumar Singh,Zoya Hussain,Ravi Kishan,Drama,Vineet Kumar Singh Zoya Hussain Ravi Kishan An...
3,phir se...,Kunal Kohli,Jennifer Winget,Kunal Kohli,Rajit Kapur,Comedy Drama,Jennifer Winget Kunal Kohli Rajit Kapur Kunal ...
4,my birthday song,Samir Soni,Sanjay Suri,Nora Fatehi,Pitobash,Thriller,Sanjay Suri Nora Fatehi Pitobash Samir Soni
...,...,...,...,...,...,...,...
98,2.0,S Shankar,Rajnikanth,Amy Jackson,Akshay Kumar,Action Science Fiction Fantasy,Rajnikanth Amy Jackson Akshay Kumar S Shankar
99,rajma chawal,Leena Yadav,Rishi Kapoor,Anirudh Tanwar,Amyra Dastur,Comedy Drama Family,Rishi Kapoor Anirudh Tanwar Amyra Dastur Leena...
101,ascharyachakit!,Samit Kakkad,Priyanka Bose,Vaibhav Raj Gupta,Ankit Raaj,Thriller Drama,Priyanka Bose Vaibhav Raj Gupta Ankit Raaj Sam...
102,zero,Aanand L. Rai,Shah Rukh Khan,Katrina Kaif,Anushka Sharma,Animation Drama,Shah Rukh Khan Katrina Kaif Anushka Sharma Aan...


In [26]:
data_2018_movies.to_csv("data_2018_movies.csv",index=False)

In [11]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import Timeout, RequestException, ConnectionError
import numpy as np
from functools import lru_cache
from tmdbv3api import TMDb, Movie
from concurrent.futures import ThreadPoolExecutor, as_completed

tmdb = TMDb()
tmdb.api_key = '6d8bfe0dbef34d25e64f64cabec93e20'
tmdb_movie = Movie()

# Using LRU cache to store results of previously fetched movie genres
@lru_cache(maxsize=1000)
def fetch_movie_genres(movie_id):
    try:
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb.api_key}')
        if response.status_code != 200:
            return np.nan
        
        data_json = response.json()
        if 'genres' in data_json and data_json['genres']:
            return " ".join([genre['name'] for genre in data_json['genres']])
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching genres for movie ID {movie_id}: {e}")
        return np.nan

def get_genres(x):
    try:
        result = tmdb_movie.search(x)
        if not result:
            return np.nan
        
        movie_id = result[0].id
        return fetch_movie_genres(movie_id)
    except Exception as e:
        print(f"Error fetching genres for {x}: {e}")
        return np.nan

# Setup retry strategy
retry_strategy = Retry(
    total=5,  # Total number of retries
    backoff_factor=2,  # Exponential backoff factor
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    method_whitelist=["HEAD", "GET", "OPTIONS"]  # Retry only on these methods
)

# Create an HTTP adapter with the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)

# Create a session and mount the adapter
session = requests.Session()
session.mount("https://", adapter)
session.mount("http://", adapter)

def get(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if actor_list[0]:
            return actor_list[0].strip()
    return None

def get2(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if len(actor_list) >= 2:
            return actor_list[1].strip()
        else:
            return np.nan
    return np.nan

def get3(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if len(actor_list) >= 3:
            return actor_list[2].strip()
        else:
            return np.nan
    return np.nan

def fetch_html(url):
    try:
        response = session.get(url, timeout=20)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.content
    except Timeout:
        print(f"Request timed out for URL {url}.")
        return None
    except ConnectionError as e:
        print(f"Connection error for URL {url}: {e}")
        return None
    except RequestException as e:
        print(f"Request error fetching data from URL {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error fetching data from URL {url}: {e}")
        return None

def create_df(url):
    # Fetch HTML content from the URL
    html_content = fetch_html(url)

    # Read HTML data using pandas if content is fetched successfully
    if html_content:
        try:
            # Reading multiple tables on the page
            tables = pd.read_html(html_content, header=0)
        
            # Extract specific tables

            df1 = tables[3]
            df2 = tables[4]
            df3 = tables[5]
            df4 = tables[6]
        
        except Exception as e:
            print(f"Error reading HTML content: {e}")
    else:
        print("Failed to fetch HTML content.")
    
    return df1,df2,df3,df4




  retry_strategy = Retry(


In [35]:
d1,d2,d3,d4 = create_df('https://en.wikipedia.org/wiki/List_of_Hindi_films_of_2020')

In [36]:
data_2020_movies = pd.concat([d1,d2,d3,d4]).reset_index()

In [37]:
data_2020_movies = data_2020_movies[['Title','Director','Cast']]

In [38]:
data_2020_movies

Unnamed: 0,Title,Director,Cast
0,Bamfaad,Ranjan Chandel,Aditya RawalShalini PandeyVijay Varma
1,Ateet,Tanuj Bhramar,PriyamaniSanjay SuriRajeev Khandelwal
2,Mrs. Serial Killer,Shirish Kunder,Jacqueline FernandezManoj BajpayeeMohit RainaZ...
3,Court Martial,Sourabh Srivastava,Rajeev KhandelwalSaksham DaymaSwapnil Kotiwar
4,What Are the Odds,Megha Ramaswamy,Yashaswini DayamaKaranvir MalhotraAbhay Deol
...,...,...,...
61,Unpaused,Raj & D.KNitya MehraNikkhil AdvaniTannishtha C...,Gulshan DevaiahSumeet VyasSaiyami KherRicha Ch...
62,AK vs AK,Vikramaditya Motwane,Anil KapoorAnurag Kashyap
63,Coolie No. 1,David Dhawan,Varun DhawanSara Ali Khan
64,Shakeela,Indrajit Lankesh,Richa ChaddaPankaj TripathiRajeev Pillai


In [39]:
data_2020_movies['Cast'] = data_2020_movies['Cast'].fillna('')

import re

# Apply the split function
data_2020_movies['Cast'] = data_2020_movies['Cast'].apply(
    lambda x: re.split(r'(?<=[a-z])(?=[A-Z])', x)
)


In [40]:
data_2020_movies

Unnamed: 0,Title,Director,Cast
0,Bamfaad,Ranjan Chandel,"[Aditya Rawal, Shalini Pandey, Vijay Varma]"
1,Ateet,Tanuj Bhramar,"[Priyamani, Sanjay Suri, Rajeev Khandelwal]"
2,Mrs. Serial Killer,Shirish Kunder,"[Jacqueline Fernandez, Manoj Bajpayee, Mohit R..."
3,Court Martial,Sourabh Srivastava,"[Rajeev Khandelwal, Saksham Dayma, Swapnil Kot..."
4,What Are the Odds,Megha Ramaswamy,"[Yashaswini Dayama, Karanvir Malhotra, Abhay D..."
...,...,...,...
61,Unpaused,Raj & D.KNitya MehraNikkhil AdvaniTannishtha C...,"[Gulshan Devaiah, Sumeet Vyas, Saiyami Kher, R..."
62,AK vs AK,Vikramaditya Motwane,"[Anil Kapoor, Anurag Kashyap]"
63,Coolie No. 1,David Dhawan,"[Varun Dhawan, Sara Ali Khan]"
64,Shakeela,Indrajit Lankesh,"[Richa Chadda, Pankaj Tripathi, Rajeev Pillai]"


In [41]:
data_2020_movies['Cast'][2]

['Jacqueline Fernandez', 'Manoj Bajpayee', 'Mohit Raina', 'Zayn Marie']

In [6]:
def get_actor1(text):

    actor_list = []

    for i in text:

        if i!="" and i != np.nan:
            actor_list.append(i)
    if len(actor_list)>=1:
        return actor_list[0]
    else:
        return np.nan


In [42]:
data_2020_movies['actor_1_name'] = data_2020_movies['Cast'].apply(lambda x:get_actor1(x))

In [7]:
def get_actor2(text):

    actor_list = []

    for i in text:

        if i!="" and i != np.nan:
            actor_list.append(i)
    if len(actor_list)>=2:
        return actor_list[1]
    else:
        return np.nan


In [43]:
data_2020_movies['actor_2_name'] = data_2020_movies['Cast'].apply(lambda x:get_actor2(x))

In [8]:
def get_actor3(text):

    actor_list = []

    for i in text:

        if i!="" and i != np.nan:
            actor_list.append(i)
    if len(actor_list)>=3:
        return actor_list[2]
    else:
        return np.nan


In [45]:
data_2020_movies['actor_3_name'] = data_2020_movies['Cast'].apply(lambda x:get_actor3(x))

In [46]:
data_2020_movies

Unnamed: 0,Title,Director,Cast,actor_1_name,actor_2_name,actor_3_name
0,Bamfaad,Ranjan Chandel,"[Aditya Rawal, Shalini Pandey, Vijay Varma]",Aditya Rawal,Shalini Pandey,Vijay Varma
1,Ateet,Tanuj Bhramar,"[Priyamani, Sanjay Suri, Rajeev Khandelwal]",Priyamani,Sanjay Suri,Rajeev Khandelwal
2,Mrs. Serial Killer,Shirish Kunder,"[Jacqueline Fernandez, Manoj Bajpayee, Mohit R...",Jacqueline Fernandez,Manoj Bajpayee,Mohit Raina
3,Court Martial,Sourabh Srivastava,"[Rajeev Khandelwal, Saksham Dayma, Swapnil Kot...",Rajeev Khandelwal,Saksham Dayma,Swapnil Kotiwar
4,What Are the Odds,Megha Ramaswamy,"[Yashaswini Dayama, Karanvir Malhotra, Abhay D...",Yashaswini Dayama,Karanvir Malhotra,Abhay Deol
...,...,...,...,...,...,...
61,Unpaused,Raj & D.KNitya MehraNikkhil AdvaniTannishtha C...,"[Gulshan Devaiah, Sumeet Vyas, Saiyami Kher, R...",Gulshan Devaiah,Sumeet Vyas,Saiyami Kher
62,AK vs AK,Vikramaditya Motwane,"[Anil Kapoor, Anurag Kashyap]",Anil Kapoor,Anurag Kashyap,
63,Coolie No. 1,David Dhawan,"[Varun Dhawan, Sara Ali Khan]",Varun Dhawan,Sara Ali Khan,
64,Shakeela,Indrajit Lankesh,"[Richa Chadda, Pankaj Tripathi, Rajeev Pillai]",Richa Chadda,Pankaj Tripathi,Rajeev Pillai


In [47]:
data_2020_movies.drop('Cast',axis=1,inplace=True)

In [48]:
data_2020_movies

Unnamed: 0,Title,Director,actor_1_name,actor_2_name,actor_3_name
0,Bamfaad,Ranjan Chandel,Aditya Rawal,Shalini Pandey,Vijay Varma
1,Ateet,Tanuj Bhramar,Priyamani,Sanjay Suri,Rajeev Khandelwal
2,Mrs. Serial Killer,Shirish Kunder,Jacqueline Fernandez,Manoj Bajpayee,Mohit Raina
3,Court Martial,Sourabh Srivastava,Rajeev Khandelwal,Saksham Dayma,Swapnil Kotiwar
4,What Are the Odds,Megha Ramaswamy,Yashaswini Dayama,Karanvir Malhotra,Abhay Deol
...,...,...,...,...,...
61,Unpaused,Raj & D.KNitya MehraNikkhil AdvaniTannishtha C...,Gulshan Devaiah,Sumeet Vyas,Saiyami Kher
62,AK vs AK,Vikramaditya Motwane,Anil Kapoor,Anurag Kashyap,
63,Coolie No. 1,David Dhawan,Varun Dhawan,Sara Ali Khan,
64,Shakeela,Indrajit Lankesh,Richa Chadda,Pankaj Tripathi,Rajeev Pillai


In [49]:
data_2020_movies['Title'] = data_2020_movies['Title'].str.lower()

In [50]:
data_2020_movies = data_2020_movies.rename(columns={'Title':'movie_title','Director':'director_name'})

In [51]:
data_2020_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name
0,bamfaad,Ranjan Chandel,Aditya Rawal,Shalini Pandey,Vijay Varma
1,ateet,Tanuj Bhramar,Priyamani,Sanjay Suri,Rajeev Khandelwal
2,mrs. serial killer,Shirish Kunder,Jacqueline Fernandez,Manoj Bajpayee,Mohit Raina
3,court martial,Sourabh Srivastava,Rajeev Khandelwal,Saksham Dayma,Swapnil Kotiwar
4,what are the odds,Megha Ramaswamy,Yashaswini Dayama,Karanvir Malhotra,Abhay Deol
...,...,...,...,...,...
61,unpaused,Raj & D.KNitya MehraNikkhil AdvaniTannishtha C...,Gulshan Devaiah,Sumeet Vyas,Saiyami Kher
62,ak vs ak,Vikramaditya Motwane,Anil Kapoor,Anurag Kashyap,
63,coolie no. 1,David Dhawan,Varun Dhawan,Sara Ali Khan,
64,shakeela,Indrajit Lankesh,Richa Chadda,Pankaj Tripathi,Rajeev Pillai


In [52]:
data_2020_movies['genres'] = data_2020_movies['movie_title'].apply(lambda x:get_genres(x))

Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for bahut hua samman: attribute name must be string, not 'int'
Error fetching genres for nan: quote_from_bytes() expected bytes


In [55]:
data_2020_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,bamfaad,Ranjan Chandel,Aditya Rawal,Shalini Pandey,Vijay Varma,Romance Drama
1,ateet,Tanuj Bhramar,Priyamani,Sanjay Suri,Rajeev Khandelwal,Thriller Drama
2,mrs. serial killer,Shirish Kunder,Jacqueline Fernandez,Manoj Bajpayee,Mohit Raina,Thriller
3,court martial,Sourabh Srivastava,Rajeev Khandelwal,Saksham Dayma,Swapnil Kotiwar,Drama
4,what are the odds,Megha Ramaswamy,Yashaswini Dayama,Karanvir Malhotra,Abhay Deol,Drama Family
...,...,...,...,...,...,...
61,unpaused,Raj & D.KNitya MehraNikkhil AdvaniTannishtha C...,Gulshan Devaiah,Sumeet Vyas,Saiyami Kher,Comedy Drama
62,ak vs ak,Vikramaditya Motwane,Anil Kapoor,Anurag Kashyap,,Comedy Action Drama Crime
63,coolie no. 1,David Dhawan,Varun Dhawan,Sara Ali Khan,,Comedy Drama
64,shakeela,Indrajit Lankesh,Richa Chadda,Pankaj Tripathi,Rajeev Pillai,Drama Romance


In [56]:
data_2020_movies.isnull().sum()

movie_title       2
director_name     2
actor_1_name      2
actor_2_name      2
actor_3_name     15
genres            7
dtype: int64

In [57]:
len(data_2020_movies)

66

In [58]:
data_2020_movies.duplicated().sum()

1

In [59]:
data_2020_movies.drop_duplicates(subset='movie_title',inplace=True)

In [60]:
data_2020_movies.dropna(inplace=True)

In [61]:
len(data_2020_movies)

46

In [62]:
data_2020_movies.duplicated().sum()

0

In [63]:
data_2020_movies.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [64]:
data_2020_movies.to_csv('data_2020_movies.csv',index=False)

In [65]:
data_2020_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,bamfaad,Ranjan Chandel,Aditya Rawal,Shalini Pandey,Vijay Varma,Romance Drama
1,ateet,Tanuj Bhramar,Priyamani,Sanjay Suri,Rajeev Khandelwal,Thriller Drama
2,mrs. serial killer,Shirish Kunder,Jacqueline Fernandez,Manoj Bajpayee,Mohit Raina,Thriller
3,court martial,Sourabh Srivastava,Rajeev Khandelwal,Saksham Dayma,Swapnil Kotiwar,Drama
4,what are the odds,Megha Ramaswamy,Yashaswini Dayama,Karanvir Malhotra,Abhay Deol,Drama Family
5,ghoomketu,Pushpendra Nath Misra,Nawazuddin Siddiqui,Ragini Khanna,Richa Chadha,Comedy Drama
6,chippa,Safdar Rehman,Sunny Pawar,Joyraj Bhattacharya,Sumeet Thakur,Drama
7,chintu ka birthday,Devanshu KumarSatyanshu Singh,Vinay Pathak,Tillotama Shome,Nate Scholz,Drama
8,choked,Anurag Kashyap,Saiyami Kher,Roshan Mathew,Amruta Subhash,Drama Crime Thriller
9,axone,Nicholas Kharkongor,Dolly Ahluwalia,Sayani Gupta,Tenzin Dalha,Drama Comedy


In [66]:
d1,d2,d3,d4 = create_df('https://en.wikipedia.org/wiki/List_of_Hindi_films_of_2021')

In [67]:
data_2021_movies = pd.concat([d1,d2,d3,d4]).reset_index()

In [68]:
data_2021_movies = data_2021_movies[['Title','Director','Cast']]

In [69]:
data_2021_movies

Unnamed: 0,Title,Director,Cast
0,Flight,Suraj Joshi,Mohit ChaddaPavan MalhotraZakir HussainVivek V...
1,Koi Jaane Na,Amin Hajee,Kunal KapoorAmyra Dastur
2,The Big Bull,Kookie Gulati,Abhishek BachchanIleana D'CruzSohum ShahNikita...
3,Hello Charlie,Pankaj Saraswat,Aadar JainJackie ShroffElnaaz NorouziShlokka P...
4,99 Songs[b],Vishwesh Krishnamoorthy,Ehan BhatEdilsy VargasTenzin DalhaAditya SealL...
...,...,...,...
66,83,Kabir Khan,Ranveer SinghDeepika PadukoneTahir Raj BhasinJ...
67,Atrangi Re,Aanand L. Rai,DhanushSara Ali KhanAkshay Kumar
68,Murder at Teesri Manzil 302,Navneet Baj Saini,Irrfan KhanRanvir ShoreyDeepal ShawLucky Ali
69,Waah Zindagi,Dinesh S Yadav,Naveen KasturiaPlabita BorthakurVijay RaazSanj...


In [70]:
data_2021_movies['Cast'] = data_2021_movies['Cast'].fillna('')

import re

# Apply the split function
data_2021_movies['Cast'] = data_2021_movies['Cast'].apply(
    lambda x: re.split(r'(?<=[a-z])(?=[A-Z])', x)
)


In [71]:
data_2021_movies

Unnamed: 0,Title,Director,Cast
0,Flight,Suraj Joshi,"[Mohit Chadda, Pavan Malhotra, Zakir Hussain, ..."
1,Koi Jaane Na,Amin Hajee,"[Kunal Kapoor, Amyra Dastur]"
2,The Big Bull,Kookie Gulati,"[Abhishek Bachchan, Ileana D'Cruz, Sohum Shah,..."
3,Hello Charlie,Pankaj Saraswat,"[Aadar Jain, Jackie Shroff, Elnaaz Norouzi, Sh..."
4,99 Songs[b],Vishwesh Krishnamoorthy,"[Ehan Bhat, Edilsy Vargas, Tenzin Dalha, Adity..."
...,...,...,...
66,83,Kabir Khan,"[Ranveer Singh, Deepika Padukone, Tahir Raj Bh..."
67,Atrangi Re,Aanand L. Rai,"[Dhanush, Sara Ali Khan, Akshay Kumar]"
68,Murder at Teesri Manzil 302,Navneet Baj Saini,"[Irrfan Khan, Ranvir Shorey, Deepal Shaw, Luck..."
69,Waah Zindagi,Dinesh S Yadav,"[Naveen Kasturia, Plabita Borthakur, Vijay Raa..."


In [72]:
data_2021_movies['actor_1_name'] = data_2021_movies['Cast'].apply(lambda x:get_actor1(x))
data_2021_movies['actor_2_name'] = data_2021_movies['Cast'].apply(lambda x:get_actor2(x))
data_2021_movies['actor_3_name'] = data_2021_movies['Cast'].apply(lambda x:get_actor3(x))

In [73]:
data_2021_movies

Unnamed: 0,Title,Director,Cast,actor_1_name,actor_2_name,actor_3_name
0,Flight,Suraj Joshi,"[Mohit Chadda, Pavan Malhotra, Zakir Hussain, ...",Mohit Chadda,Pavan Malhotra,Zakir Hussain
1,Koi Jaane Na,Amin Hajee,"[Kunal Kapoor, Amyra Dastur]",Kunal Kapoor,Amyra Dastur,
2,The Big Bull,Kookie Gulati,"[Abhishek Bachchan, Ileana D'Cruz, Sohum Shah,...",Abhishek Bachchan,Ileana D'Cruz,Sohum Shah
3,Hello Charlie,Pankaj Saraswat,"[Aadar Jain, Jackie Shroff, Elnaaz Norouzi, Sh...",Aadar Jain,Jackie Shroff,Elnaaz Norouzi
4,99 Songs[b],Vishwesh Krishnamoorthy,"[Ehan Bhat, Edilsy Vargas, Tenzin Dalha, Adity...",Ehan Bhat,Edilsy Vargas,Tenzin Dalha
...,...,...,...,...,...,...
66,83,Kabir Khan,"[Ranveer Singh, Deepika Padukone, Tahir Raj Bh...",Ranveer Singh,Deepika Padukone,Tahir Raj Bhasin
67,Atrangi Re,Aanand L. Rai,"[Dhanush, Sara Ali Khan, Akshay Kumar]",Dhanush,Sara Ali Khan,Akshay Kumar
68,Murder at Teesri Manzil 302,Navneet Baj Saini,"[Irrfan Khan, Ranvir Shorey, Deepal Shaw, Luck...",Irrfan Khan,Ranvir Shorey,Deepal Shaw
69,Waah Zindagi,Dinesh S Yadav,"[Naveen Kasturia, Plabita Borthakur, Vijay Raa...",Naveen Kasturia,Plabita Borthakur,Vijay Raaz


In [74]:
data_2021_movies.drop('Cast',axis=1,inplace=True)

In [75]:
data_2021_movies['Title'] = data_2021_movies['Title'].str.lower()

In [77]:
data_2021_movies

Unnamed: 0,Title,Director,actor_1_name,actor_2_name,actor_3_name
0,flight,Suraj Joshi,Mohit Chadda,Pavan Malhotra,Zakir Hussain
1,koi jaane na,Amin Hajee,Kunal Kapoor,Amyra Dastur,
2,the big bull,Kookie Gulati,Abhishek Bachchan,Ileana D'Cruz,Sohum Shah
3,hello charlie,Pankaj Saraswat,Aadar Jain,Jackie Shroff,Elnaaz Norouzi
4,99 songs[b],Vishwesh Krishnamoorthy,Ehan Bhat,Edilsy Vargas,Tenzin Dalha
...,...,...,...,...,...
66,83,Kabir Khan,Ranveer Singh,Deepika Padukone,Tahir Raj Bhasin
67,atrangi re,Aanand L. Rai,Dhanush,Sara Ali Khan,Akshay Kumar
68,murder at teesri manzil 302,Navneet Baj Saini,Irrfan Khan,Ranvir Shorey,Deepal Shaw
69,waah zindagi,Dinesh S Yadav,Naveen Kasturia,Plabita Borthakur,Vijay Raaz


In [78]:
data_2021_movies['genres'] = data_2021_movies['Title'].apply(lambda x:get_genres(x))

Error fetching genres for 99 songs[b]: attribute name must be string, not 'int'
Error fetching genres for thalaivii[c]: attribute name must be string, not 'int'
Error fetching genres for haathi mere saathi[d]: attribute name must be string, not 'int'
Error fetching genres for nan: quote_from_bytes() expected bytes


In [79]:
data_2021_movies

Unnamed: 0,Title,Director,actor_1_name,actor_2_name,actor_3_name,genres
0,flight,Suraj Joshi,Mohit Chadda,Pavan Malhotra,Zakir Hussain,Drama
1,koi jaane na,Amin Hajee,Kunal Kapoor,Amyra Dastur,,Thriller Mystery
2,the big bull,Kookie Gulati,Abhishek Bachchan,Ileana D'Cruz,Sohum Shah,Crime Drama
3,hello charlie,Pankaj Saraswat,Aadar Jain,Jackie Shroff,Elnaaz Norouzi,Adventure Comedy
4,99 songs[b],Vishwesh Krishnamoorthy,Ehan Bhat,Edilsy Vargas,Tenzin Dalha,
...,...,...,...,...,...,...
66,83,Kabir Khan,Ranveer Singh,Deepika Padukone,Tahir Raj Bhasin,Drama History
67,atrangi re,Aanand L. Rai,Dhanush,Sara Ali Khan,Akshay Kumar,Comedy Drama Romance
68,murder at teesri manzil 302,Navneet Baj Saini,Irrfan Khan,Ranvir Shorey,Deepal Shaw,Thriller Drama
69,waah zindagi,Dinesh S Yadav,Naveen Kasturia,Plabita Borthakur,Vijay Raaz,Comedy Drama Romance


In [80]:
data_2021_movies = data_2021_movies.rename(columns={'Title':'movie_title',"Director":"director_name"})

In [81]:
data_2021_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,flight,Suraj Joshi,Mohit Chadda,Pavan Malhotra,Zakir Hussain,Drama
1,koi jaane na,Amin Hajee,Kunal Kapoor,Amyra Dastur,,Thriller Mystery
2,the big bull,Kookie Gulati,Abhishek Bachchan,Ileana D'Cruz,Sohum Shah,Crime Drama
3,hello charlie,Pankaj Saraswat,Aadar Jain,Jackie Shroff,Elnaaz Norouzi,Adventure Comedy
4,99 songs[b],Vishwesh Krishnamoorthy,Ehan Bhat,Edilsy Vargas,Tenzin Dalha,
...,...,...,...,...,...,...
66,83,Kabir Khan,Ranveer Singh,Deepika Padukone,Tahir Raj Bhasin,Drama History
67,atrangi re,Aanand L. Rai,Dhanush,Sara Ali Khan,Akshay Kumar,Comedy Drama Romance
68,murder at teesri manzil 302,Navneet Baj Saini,Irrfan Khan,Ranvir Shorey,Deepal Shaw,Thriller Drama
69,waah zindagi,Dinesh S Yadav,Naveen Kasturia,Plabita Borthakur,Vijay Raaz,Comedy Drama Romance


In [82]:
data_2021_movies.drop_duplicates(subset="movie_title",inplace=True)

In [83]:
data_2021_movies.isnull().sum()

movie_title       1
director_name     1
actor_1_name      1
actor_2_name      3
actor_3_name     16
genres           13
dtype: int64

In [84]:
data_2021_movies.dropna(inplace=True)

In [85]:
data_2021_movies.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [87]:
len(data_2021_movies)

46

In [88]:
data_2021_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,flight,Suraj Joshi,Mohit Chadda,Pavan Malhotra,Zakir Hussain,Drama
2,the big bull,Kookie Gulati,Abhishek Bachchan,Ileana D'Cruz,Sohum Shah,Crime Drama
3,hello charlie,Pankaj Saraswat,Aadar Jain,Jackie Shroff,Elnaaz Norouzi,Adventure Comedy
6,ajeeb daastaans,Shashank KhaitanRaj MehtaNeeraj GhaywanKayoze ...,Fatima Sana Shaikh,Jaideep Ahlawat,Nushrratt Bharuccha,Drama Romance
7,bansuri: the flute,Hari Viswanath,Anurag Kashyap,Rituparna Sengupta,Upendra Limaye,Drama
8,raat baaki hai,Avinash Das,Paoli Dam,Anup Soni,Rahul Dev,Crime Thriller Drama
12,radhe,Prabhu Deva,Salman Khan,Disha Patani,Jackie Shroff,Action
13,sardar ka grandson,Kaashvie Nair,Arjun Kapoor,Rakul Preet Singh,Neena Gupta,Comedy Drama Romance Family
14,shaadisthan,Raj Singh Chaudhary,Kirti Kulhari,Medha Shankar,Nivedita Bhattacharya,Adventure Drama Music
15,skater girl,Manjari Makijany,Rachel Sanchita Gupta,Shraddha Gaikwad,Amrit Maghera,Drama Family


In [89]:
data_2021_movies.to_csv('data_2021_movies.csv',index=False)

In [90]:
d1,d2,d3,d4 = create_df('https://en.wikipedia.org/wiki/List_of_Hindi_films_of_2022')

data_2022_movies = pd.concat([d1,d2,d3,d4]).reset_index()

data_2022_movies = data_2022_movies[['Title','Director','Cast']]

data_2022_movies['Cast'] = data_2022_movies['Cast'].fillna('')

import re

# Apply the split function
data_2022_movies['Cast'] = data_2022_movies['Cast'].apply(
    lambda x: re.split(r'(?<=[a-z])(?=[A-Z])', x)
)
data_2022_movies['actor_1_name'] = data_2022_movies['Cast'].apply(lambda x:get_actor1(x))
data_2022_movies['actor_2_name'] = data_2022_movies['Cast'].apply(lambda x:get_actor2(x))
data_2022_movies['actor_3_name'] = data_2022_movies['Cast'].apply(lambda x:get_actor3(x))

data_2022_movies.drop('Cast',axis=1,inplace=True)

data_2022_movies['Title'] = data_2022_movies['Title'].str.lower()

data_2022_movies['genres'] = data_2022_movies['Title'].apply(lambda x:get_genres(x))

data_2022_movies = data_2022_movies.rename(columns={'Title':'movie_title',"Director":"director_name"})

Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for rocketry: the nambi effect[c]: attribute name must be string, not 'int'
Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for dedh lakh ka dulha: attribute name must be string, not 'int'
Error fetching genres for nan: quote_from_bytes() expected bytes


In [91]:
data_2022_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,36 farmhouse,Ram Ramesh Sharm,Vijay Raaz,Sanjay Mishra,Amol Parashar,Comedy Drama Mystery
1,hai tujhe salaam india,Avanish Kumar,Aarya Babbar,Ajaz Khan,Smita Gondkar,
2,looop lapeta,Aakash Bhatia,Taapsee Pannu,Tahir Raj Bhasin,,Action Comedy Crime
3,gehraiyaan,Shakun Batra,Deepika Padukone,Siddhant Chaturvedi,Ananya Panday,Romance Drama
4,badhaai do,Harshavardhan Kulkarni,Rajkummar Rao,Bhumi Pednekar,,Comedy Drama
...,...,...,...,...,...,...
107,ajay wardhan,Pragati Agarwal,Ruslaan Mumtaz,Arjumman Mughal,,
108,trahimam,Dushyant Pratap Singh,Arshi Khan,Pankaj Berry,,
109,cirkus,Rohit Shetty,Ranveer Singh,Pooja Hegde,Jacqueline Fernandez,
110,dedh lakh ka dulha,Abhay Pratap Singh,Akhilendra Mishra,Ishtiyak Khan,Harshita Panwar,


In [92]:
data_2022_movies.dropna(inplace=True)

In [93]:
data_2022_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,36 farmhouse,Ram Ramesh Sharm,Vijay Raaz,Sanjay Mishra,Amol Parashar,Comedy Drama Mystery
3,gehraiyaan,Shakun Batra,Deepika Padukone,Siddhant Chaturvedi,Ananya Panday,Romance Drama
5,a thursday,Behzad Khambata,Yami Gautam,Neha Dhupia,Atul Kulkarni,Crime Drama
6,gangubai kathiawadi,Sanjay Leela Bhansali,Alia Bhatt,Ajay Devgn,Shantanu Maheshwari,Crime Drama History
7,love hostel,Shanker Raman,Sanya Malhotra,Vikrant Massey,Bobby Deol,Crime Romance Thriller
...,...,...,...,...,...,...
95,mister mummy,Shaad Ali,Riteish Deshmukh,Genelia Deshmukh,Mahesh Manjrekar,Comedy Drama
97,qala,Anvita Dutt,Tripti Dimri,Swastika Mukherjee,Babil Khan,Drama
100,india lockdown,Madhur Bhandarkar,Shweta Basu Prasad,Aahana Kumra,Prateik Babbar,Drama Adventure
103,maarrich,Dhruv Lather,Tusshar Kapoor,Naseeruddin Shah,Rahul Dev,Thriller Crime


In [94]:
data_2022_movies.drop_duplicates(subset='movie_title')

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,36 farmhouse,Ram Ramesh Sharm,Vijay Raaz,Sanjay Mishra,Amol Parashar,Comedy Drama Mystery
3,gehraiyaan,Shakun Batra,Deepika Padukone,Siddhant Chaturvedi,Ananya Panday,Romance Drama
5,a thursday,Behzad Khambata,Yami Gautam,Neha Dhupia,Atul Kulkarni,Crime Drama
6,gangubai kathiawadi,Sanjay Leela Bhansali,Alia Bhatt,Ajay Devgn,Shantanu Maheshwari,Crime Drama History
7,love hostel,Shanker Raman,Sanya Malhotra,Vikrant Massey,Bobby Deol,Crime Romance Thriller
...,...,...,...,...,...,...
95,mister mummy,Shaad Ali,Riteish Deshmukh,Genelia Deshmukh,Mahesh Manjrekar,Comedy Drama
97,qala,Anvita Dutt,Tripti Dimri,Swastika Mukherjee,Babil Khan,Drama
100,india lockdown,Madhur Bhandarkar,Shweta Basu Prasad,Aahana Kumra,Prateik Babbar,Drama Adventure
103,maarrich,Dhruv Lather,Tusshar Kapoor,Naseeruddin Shah,Rahul Dev,Thriller Crime


In [95]:
data_2022_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,36 farmhouse,Ram Ramesh Sharm,Vijay Raaz,Sanjay Mishra,Amol Parashar,Comedy Drama Mystery
3,gehraiyaan,Shakun Batra,Deepika Padukone,Siddhant Chaturvedi,Ananya Panday,Romance Drama
5,a thursday,Behzad Khambata,Yami Gautam,Neha Dhupia,Atul Kulkarni,Crime Drama
6,gangubai kathiawadi,Sanjay Leela Bhansali,Alia Bhatt,Ajay Devgn,Shantanu Maheshwari,Crime Drama History
7,love hostel,Shanker Raman,Sanya Malhotra,Vikrant Massey,Bobby Deol,Crime Romance Thriller
...,...,...,...,...,...,...
95,mister mummy,Shaad Ali,Riteish Deshmukh,Genelia Deshmukh,Mahesh Manjrekar,Comedy Drama
97,qala,Anvita Dutt,Tripti Dimri,Swastika Mukherjee,Babil Khan,Drama
100,india lockdown,Madhur Bhandarkar,Shweta Basu Prasad,Aahana Kumra,Prateik Babbar,Drama Adventure
103,maarrich,Dhruv Lather,Tusshar Kapoor,Naseeruddin Shah,Rahul Dev,Thriller Crime


In [96]:
data_2022_movies.to_csv('data_2022_movies.csv',index=False)

In [97]:
d1,d2,d3,d4 = create_df('https://en.wikipedia.org/wiki/List_of_Hindi_films_of_2023')

data_2023_movies = pd.concat([d1,d2,d3,d4]).reset_index()

data_2023_movies = data_2023_movies[['Title','Director','Cast']]

data_2023_movies['Cast'] = data_2023_movies['Cast'].fillna('')

import re

# Apply the split function
data_2023_movies['Cast'] = data_2023_movies['Cast'].apply(
    lambda x: re.split(r'(?<=[a-z])(?=[A-Z])', x)
)
data_2023_movies['actor_1_name'] = data_2023_movies['Cast'].apply(lambda x:get_actor1(x))
data_2023_movies['actor_2_name'] = data_2023_movies['Cast'].apply(lambda x:get_actor2(x))
data_2023_movies['actor_3_name'] = data_2023_movies['Cast'].apply(lambda x:get_actor3(x))

data_2023_movies.drop('Cast',axis=1,inplace=True)

data_2023_movies['Title'] = data_2023_movies['Title'].str.lower()

data_2023_movies['genres'] = data_2023_movies['Title'].apply(lambda x:get_genres(x))

data_2023_movies = data_2023_movies.rename(columns={'Title':'movie_title',"Director":"director_name"})

Error fetching genres for operation fryday[b]: attribute name must be string, not 'int'
Error fetching genres for the tenant[c]: attribute name must be string, not 'int'
Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for mystery of the tattoo: attribute name must be string, not 'int'
Error fetching genres for sab moh maaya hai: attribute name must be string, not 'int'


In [98]:
data_2023_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,kuttey,Aasmaan Bhardwaj,Arjun Kapoor,Tabu,Konkona Sen Sharma,Comedy Crime Thriller
1,lakadbaggha,Victor Mukherjee,Anshuman Jha,Riddhi Dogra,Milind Soman,Action Crime Thriller
2,mission majnu,Shantanu Bagchi,Sidharth Malhotra,Rashmika Mandanna,Kumud Mishra,Thriller Action Drama History
3,chhatriwali,Tejas Deoskar,Rakul Preet Singh,Sumeet Vyas,Satish Kaushik,Comedy Drama
4,pathaan,Siddharth Anand,Shah Rukh Khan,John Abraham,Deepika Padukone,Action Adventure Thriller Drama
...,...,...,...,...,...,...
131,kaisi ye dor,Ratna Neelam Pandey Sandeep S. Choudhary,Nikhil Pandey,Jashn Agnihotri,Ratna Neelam Pandey,Drama Family
132,dunki,Rajkumar Hirani,Shah Rukh Khan,Taapsee Pannu,Vicky Kaushal,War Action Drama
133,dry day,Saurabh Shukla,Jitendra Kumar,Annu Kapoor,Shriya Pilgaonkar,Comedy Drama
134,kho gaye hum kahan,Arjun Varain Singh,Adarsh Gourav,Siddhant Chaturvedi,Ananya Panday,Drama Comedy


In [99]:
data_2023_movies.duplicated().sum()

0

In [100]:
data_2023_movies.isnull().sum()

movie_title       1
director_name     1
actor_1_name      1
actor_2_name      5
actor_3_name     22
genres           14
dtype: int64

In [101]:
data_2023_movies.dropna(inplace=True)

In [102]:
len(data_2023_movies)

104

In [103]:
data_2023_movies.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [104]:
data_2023_movies.to_csv("data_2023_movies.csv",index=False)

In [17]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import Timeout, RequestException, ConnectionError
import numpy as np
from functools import lru_cache
from tmdbv3api import TMDb, Movie
from concurrent.futures import ThreadPoolExecutor, as_completed

tmdb = TMDb()
tmdb.api_key = '6d8bfe0dbef34d25e64f64cabec93e20'
tmdb_movie = Movie()

# Using LRU cache to store results of previously fetched movie genres
@lru_cache(maxsize=1000)
def fetch_movie_genres(movie_id):
    try:
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb.api_key}')
        if response.status_code != 200:
            return np.nan
        
        data_json = response.json()
        if 'genres' in data_json and data_json['genres']:
            return " ".join([genre['name'] for genre in data_json['genres']])
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching genres for movie ID {movie_id}: {e}")
        return np.nan

def get_genres(x):
    try:
        result = tmdb_movie.search(x)
        if not result:
            return np.nan
        
        movie_id = result[0].id
        return fetch_movie_genres(movie_id)
    except Exception as e:
        print(f"Error fetching genres for {x}: {e}")
        return np.nan

# Setup retry strategy
retry_strategy = Retry(
    total=5,  # Total number of retries
    backoff_factor=2,  # Exponential backoff factor
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    method_whitelist=["HEAD", "GET", "OPTIONS"]  # Retry only on these methods
)

# Create an HTTP adapter with the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)

# Create a session and mount the adapter
session = requests.Session()
session.mount("https://", adapter)
session.mount("http://", adapter)

def get(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if actor_list[0]:
            return actor_list[0].strip()
    return None

def get2(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if len(actor_list) >= 2:
            return actor_list[1].strip()
        else:
            return np.nan
    return np.nan

def get3(x):
    if isinstance(x, str):
        actor_list = x.split(",")
        if len(actor_list) >= 3:
            return actor_list[2].strip()
        else:
            return np.nan
    return np.nan

def fetch_html(url):
    try:
        response = session.get(url, timeout=20)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.content
    except Timeout:
        print(f"Request timed out for URL {url}.")
        return None
    except ConnectionError as e:
        print(f"Connection error for URL {url}: {e}")
        return None
    except RequestException as e:
        print(f"Request error fetching data from URL {url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error fetching data from URL {url}: {e}")
        return None

def create_df(url):
    # Fetch HTML content from the URL
    html_content = fetch_html(url)

    # Read HTML data using pandas if content is fetched successfully
    if html_content:
        try:
            # Reading multiple tables on the page
            tables = pd.read_html(html_content, header=0)
        
            # Extract specific tables

            df1 = tables[4]
            df2 = tables[5]
            df3 = tables[6]
            df4 = tables[7]
        
        except Exception as e:
            print(f"Error reading HTML content: {e}")
    else:
        print("Failed to fetch HTML content.")
    
    return df1,df2,df3,df4




  retry_strategy = Retry(


In [18]:
d1,d2,d3,d4 = create_df('https://en.wikipedia.org/wiki/List_of_Hindi_films_of_2024')

data_2024_movies = pd.concat([d1,d2,d3,d4]).reset_index()

data_2024_movies = data_2024_movies[['Title','Director','Cast']]

data_2024_movies['Cast'] = data_2024_movies['Cast'].fillna('')

import re

# Apply the split function
data_2024_movies['Cast'] = data_2024_movies['Cast'].apply(
    lambda x: re.split(r'(?<=[a-z])(?=[A-Z])', x)
)
data_2024_movies['actor_1_name'] = data_2024_movies['Cast'].apply(lambda x:get_actor1(x))
data_2024_movies['actor_2_name'] = data_2024_movies['Cast'].apply(lambda x:get_actor2(x))
data_2024_movies['actor_3_name'] = data_2024_movies['Cast'].apply(lambda x:get_actor3(x))

data_2024_movies.drop('Cast',axis=1,inplace=True)

data_2024_movies['Title'] = data_2024_movies['Title'].str.lower()

data_2024_movies['genres'] = data_2024_movies['Title'].apply(lambda x:get_genres(x))

data_2024_movies = data_2024_movies.rename(columns={'Title':'movie_title',"Director":"director_name"})

Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for operation valentine[b]: attribute name must be string, not 'int'
Error fetching genres for swatantrya veer savarkar: attribute name must be string, not 'int'
Error fetching genres for nan: quote_from_bytes() expected bytes
Error fetching genres for the legacy of jineshwar: attribute name must be string, not 'int'
Error fetching genres for tipppsy: attribute name must be string, not 'int'
Error fetching genres for hamare baarah: attribute name must be string, not 'int'
Error fetching genres for kalki 2898 ad[b]: attribute name must be string, not 'int'
Error fetching genres for chhaava: attribute name must be string, not 'int'


In [19]:
data_2024_movies

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres
0,tauba tera jalwa,Akashaditya Lama,Jatin Khurana,Ameesha Patel,Angela Krislinzki,Romance Thriller
1,merry christmas[a],Sriram Raghavan,Katrina Kaif,Vijay Sethupathi,,Comedy Family
2,main atal hoon,Ravi Jadhav,Pankaj Tripathi,,,Drama
3,fighter,Siddharth Anand,Hrithik Roshan,Deepika Padukone,Anil Kapoor,Action Thriller War
4,,,,,,
...,...,...,...,...,...,...
76,raid 2,Raj Kumar Gupta,Ajay Devgn,Riteish Deshmukh,Vaani Kapoor,Action Crime Thriller
77,metro... in dino,Anurag Basu,Aditya Roy Kapur,Sara Ali Khan,Anupam Kher,Drama Romance
78,chhaava,Laxman Utekar,Vicky Kaushal,Rashmika Mandanna,,
79,welcome to the jungle,Ahmed Khan,Akshay Kumar,Sanjay Dutt,Suniel Shetty,Comedy


In [20]:
data_2024_movies.duplicated().sum()

1

In [21]:
data_2024_movies.drop_duplicates(inplace=True)

In [22]:
data_2024_movies.isnull().sum()

movie_title       1
director_name     1
actor_1_name      1
actor_2_name      4
actor_3_name     14
genres           17
dtype: int64

In [23]:
len(data_2024_movies)

80

In [28]:
data_2024_movies.dropna(inplace=True)

In [29]:
data_2024_movies.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [30]:
len(data_2024_movies)

53

In [31]:
data_2024_movies.to_csv("data_2024_movies.csv",index=False)

In [71]:
data1 = pd.read_csv('D:\Movies\movies_upto_2017.csv')
data2 = pd.read_csv('D:\Movies\data_2018_movies.csv')
data3 = pd.read_csv('D:\Movies\data_2019_movies.csv')
data4 = pd.read_csv('D:\Movies\data_2020_movies.csv')
data5 = pd.read_csv('D:\Movies\data_2021_movies.csv')
data6 = pd.read_csv('D:\Movies\data_2022_movies.csv')
data7 = pd.read_csv("D:\Movies\data_2023_movies.csv")
data8 = pd.read_csv("D:\Movies\data_2024_movies.csv")

In [72]:
print(len(data1)+len(data2)+len(data3)+len(data4)+len(data5)+len(data6)+len(data7)+len(data8))

5745


In [73]:
data1.isnull().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [74]:
data2.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
comb             0
dtype: int64

In [75]:
data3.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [76]:
data4.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [77]:
data5.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [78]:
data6.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [79]:
data7.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [80]:
data8.isnull().sum()

movie_title      0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
dtype: int64

In [81]:
data1.head(1)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...


In [82]:
data2.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,1921,Vikram Bhatt,Zareen Khan,Karan Kundra,Tobby Hinston,History Drama War,Zareen Khan Karan Kundra Tobby Hinston Vikram ...


In [83]:
data3['comb'] = data3['actor_1_name']+" "+data3['actor_2_name']+" "+data3['actor_3_name']+" "+data3['director_name']
data4['comb'] = data4['actor_1_name']+" "+data4['actor_2_name']+" "+data4['actor_3_name']+" "+data4['director_name']
data5['comb'] = data5['actor_1_name']+" "+data5['actor_2_name']+" "+data5['actor_3_name']+" "+data5['director_name']
data6['comb'] = data6['actor_1_name']+" "+data6['actor_2_name']+" "+data6['actor_3_name']+" "+data6['director_name']
data7['comb'] = data7['actor_1_name']+" "+data7['actor_2_name']+" "+data7['actor_3_name']+" "+data7['director_name']
data8['comb'] = data8['actor_1_name']+" "+data8['actor_2_name']+" "+data8['actor_3_name']+" "+data8['director_name']

In [84]:
data3.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,salt bridge,Abhijit Deonath,Rajeev Khandelwal,Chelsie Preston Crayford,Usha Jadhav,Drama,Rajeev Khandelwal Chelsie Preston Crayford Ush...


In [85]:
data4.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,bamfaad,Ranjan Chandel,Aditya Rawal,Shalini Pandey,Vijay Varma,Romance Drama,Aditya Rawal Shalini Pandey Vijay Varma Ranjan...


In [86]:
data5.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,flight,Suraj Joshi,Mohit Chadda,Pavan Malhotra,Zakir Hussain,Drama,Mohit Chadda Pavan Malhotra Zakir Hussain Sura...


In [87]:
data6.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,36 farmhouse,Ram Ramesh Sharm,Vijay Raaz,Sanjay Mishra,Amol Parashar,Comedy Drama Mystery,Vijay Raaz Sanjay Mishra Amol Parashar Ram Ram...


In [88]:
data7.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,kuttey,Aasmaan Bhardwaj,Arjun Kapoor,Tabu,Konkona Sen Sharma,Comedy Crime Thriller,Arjun Kapoor Tabu Konkona Sen Sharma Aasmaan B...


In [89]:
data8.head(1)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,actor_3_name,genres,comb
0,tauba tera jalwa,Akashaditya Lama,Jatin Khurana,Ameesha Patel,Angela Krislinzki,Romance Thriller,Jatin Khurana Ameesha Patel Angela Krislinzki ...


In [90]:
final_movies_hindi_data = pd.concat([data1,data2,data3,data4,data5,data6,data7,data8],axis=0)

In [91]:
final_movies_hindi_data.isnull().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [92]:
print(len(final_movies_hindi_data))

5745


In [93]:
final_movies_hindi_data.to_csv('final_movies_hindi_data.csv',index=False)