In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Browser-like User-Agent sent with every request made by the scraper.
headers = {
    "User-Agent": (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50'
    )
}

# One accumulator list per output column; the scraping loop appends to these.
# Each name is bound to its own, independent empty list.
movie_name, certificate, year, time = [], [], [], []
genre, rating, metascore, votes = [], [], [], []
gross, image_links, descrides, casts = [], [], [], []

def _text_or(tag, default='N/A'):
    """Return ``tag.text``, or ``default`` when the lookup returned ``None``."""
    return tag.text if tag is not None else default


# Walk the paginated IMDb list; range(1, 50) requests pages 1 through 49.
for page in range(1, 50):  # Change the range to the desired number of pages
    url = f"https://www.imdb.com/list/ls063676660/?sort=list_order,asc&st_dt=&mode=detail&page={page}"

    # A timeout keeps one stalled connection from hanging the whole run, and a
    # failed page is reported and skipped rather than parsed as "no movies".
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Page {page}: request failed ({exc}); skipping")
        continue
    if not response.ok:
        print(f"Page {page}: HTTP {response.status_code}; skipping")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    movie_data = soup.find_all("div", attrs={"class": "lister-item mode-detail"})

    for store in movie_data:
        # Every accumulator list must receive exactly one value per movie;
        # otherwise the lists end up with different lengths and cannot be
        # combined into a single DataFrame afterwards.

        # Poster URL is stored in the 'loadlate' attribute of the lazy-loaded 'img' tag
        img_tag = store.find('img', class_='loadlate')
        image_links.append(img_tag.get('loadlate') if img_tag is not None else None)

        movie_name.append(store.h3.a.text)

        year_tag = store.h3.find('span', class_='lister-item-year text-muted unbold')
        year.append(_text_or(year_tag).strip('()'))

        # Certificate, runtime and genre live in the first 'p' tag of the entry;
        # any of them may be absent, in which case 'N/A' is recorded.
        certificate.append(_text_or(store.p.find('span', class_='certificate')))
        time.append(_text_or(store.p.find('span', class_='runtime')).replace(' min', ''))
        genre.append(_text_or(store.p.find('span', class_='genre')).strip())

        rating.append(_text_or(store.find('span', class_='ipl-rating-star__rating')))

        metascore.append(_text_or(store.find('span', class_='metascore'), '^^^^^^').strip())

        # Votes and gross share the same attributes, so both are picked out of
        # one result list by position: votes first, gross second (if present).
        value = store.find_all('span', attrs={'name': 'nv'})
        votes.append(value[0].text if value else 'N/A')
        gross.append(value[1].text if len(value) > 1 else '*****')

        # Description: the 'p' tag without a class; tidy whitespace around '|'
        desc_parts = _text_or(store.find("p", class_='')).replace('\n', '').split('|')
        descrides.append('|'.join(part.strip() for part in desc_parts))

        # Cast details: the second 'text-muted' paragraph (director(s) | stars)
        cast = store.find_all('p', class_='text-muted')
        casts.append(cast[1].text.replace('\n', '') if len(cast) > 1 else '*****')


# Assemble the per-column lists into one table. Column order follows the
# order of the pairs below; pandas requires all lists to have equal length.
_column_data = [
    ('Image Link', image_links),
    ('Name of movie', movie_name),
    ('Year of release', year),
    ('Certificate', certificate),
    ('Runtime (min)', time),
    ('Genre', genre),
    ('Rating', rating),
    ('MetaScore', metascore),
    ('Votes', votes),
    ('Grosses', gross),
    ('Description', descrides),
    ('Casts', casts),
]
movie_df = pd.DataFrame(dict(_column_data))


In [5]:
# Display the scraped table as the cell's output (long frames are shown truncated).
movie_df

Unnamed: 0,Image Link,Name of movie,Year of release,Certificate,Runtime (min),Genre,Rating,MetaScore,Votes,Grosses,Description,Casts
0,https://m.media-amazon.com/images/M/MV5BMTUyND...,The Stanford Prison Experiment,2015,R,122,"Biography, Drama, History",6.8,67,45122,$0.64M,"In 1971, twenty-four male students are selecte...",Director:Kyle Patrick Alvarez| Stars:E...
1,https://m.media-amazon.com/images/M/MV5BYzgyOD...,Capricorn One,1977,,123,"Action, Adventure, Drama",6.8,38,23964,*****,When the first manned flight to Mars is deemed...,Director:Peter Hyams| Stars:Elliott Go...
2,https://m.media-amazon.com/images/M/MV5BZmYwYW...,The Last Seduction,1994,R,110,"Crime, Drama, Romance",7,85,25025,$6.14M,A devious sexpot steals her husband's drug mon...,Director:John Dahl| Stars:Linda Fioren...
3,https://m.media-amazon.com/images/M/MV5BMmM4Yz...,House on Haunted Hill,1959,,75,"Crime, Horror, Mystery",6.8,^^^^^^,29460,*****,"A millionaire offers $10,000 to five people wh...",Director:William Castle| Stars:Vincent...
4,https://m.media-amazon.com/images/M/MV5BNjAxOG...,Local Hero,1983,,111,"Comedy, Drama",7.3,84,25853,$5.90M,An American oil company has plans for a new re...,Director:Bill Forsyth| Stars:Burt Lanc...
...,...,...,...,...,...,...,...,...,...,...,...,...
4895,https://m.media-amazon.com/images/M/MV5BOGI3Ym...,The Sixth Man,1997,PG-13,107,"Comedy, Drama, Fantasy",5.6,^^^^^^,6180,$14.71M,A college athlete returns from the dead to hel...,Director:Randall Miller| Stars:Marlon ...
4896,https://m.media-amazon.com/images/M/MV5BNjFmMT...,Vincere,2009,,128,"Biography, Drama, Romance",6.8,85,5798,$0.62M,"The story of Mussolini's secret lover, Ida Dal...",Director:Marco Bellocchio| Stars:Giova...
4897,https://m.media-amazon.com/images/M/MV5BMTQ5OT...,Die Feuerzangenbowle,1944,,97,Comedy,7.7,^^^^^^,6415,*****,"A successful writer, home-schooled in his yout...",Director:Helmut Weiss| Stars:Heinz Rüh...
4898,https://m.media-amazon.com/images/M/MV5BZjYyZD...,Gloria,1999,A,108,"Crime, Drama, Thriller",5.1,26,6090,$4.16M,After serving three years in prison covering f...,Director:Sidney Lumet| Stars:Sharon St...


In [6]:
# Split the "Casts" column into "Director" and "Stars" columns.
# The text before the first '|' is the director part, the rest is the stars part.
# reindex() guarantees both columns exist even when no row contains a '|'
# (e.g. an empty frame), so the two-column assignment below cannot fail.
cast_parts = (
    movie_df['Casts']
    .str.split('|', n=1, expand=True)
    .reindex(columns=[0, 1])
    .astype(object)
)

# Remove the leading label from each part. The patterns also accept the
# "Directors:" / "Star:" variants, which a literal 'Director:' / 'Stars:'
# replacement would leave in place.
movie_df['Director'] = cast_parts[0].str.replace(r'^\s*Directors?:', '', regex=True).str.strip()
movie_df['Stars'] = cast_parts[1].str.replace(r'^\s*Stars?:', '', regex=True).str.strip()

# Drop the original "Casts" column (rebinding instead of mutating in place)
movie_df = movie_df.drop(columns=['Casts'])

In [7]:
# Write the table to an Excel workbook. The path is held in one variable so the
# confirmation message always names the file that was actually written.
output_path = 'movie_data_part2.xlsx'
movie_df.to_excel(output_path, index=False)

print(f"DataFrame saved to '{output_path}'")

DataFrame saved to 'movie_data.xlsx'
