In [2]:
# Setup
import pandas as pd

df = pd.read_csv("./Data/imdb_top_1000.csv")
df.dtypes


Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [3]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Question 1 - Get rid of useless columns

# For our purposes, we don't need the link to an image of the poster

df.drop('Poster_Link', axis=1, inplace = True)   

df.head()




Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [5]:
#Question 2

min_value_row = df.loc[df['Released_Year'].idxmin()]

print("The oldest movie is " + min_value_row['Series_Title'] +" (" + str(int(min_value_row['Released_Year'])) + ")") 

#Question 3 

# We need to clean up the data. 
# Convert the released data to numbers and convert anything invalid into NaN.

df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')


max_value_row = df.loc[df['Released_Year'].idxmax()]

print("The newest movie is " + max_value_row['Series_Title'] +" (" + str(int(max_value_row['Released_Year'])) + ")") 



The oldest movie is Das Cabinet des Dr. Caligari (1920)
The newest movie is Hamilton (2020)


In [6]:
#Question 4  Top 10 movies by IMDB rating
top_10_movies = df.sort_values(by='IMDB_Rating', ascending=False).head(10)
print(top_10_movies[['Series_Title', 'IMDB_Rating']])

                                     Series_Title  IMDB_Rating
0                        The Shawshank Redemption          9.3
1                                   The Godfather          9.2
4                                    12 Angry Men          9.0
2                                 The Dark Knight          9.0
3                          The Godfather: Part II          9.0
5   The Lord of the Rings: The Return of the King          8.9
7                                Schindler's List          8.9
6                                    Pulp Fiction          8.9
8                                       Inception          8.8
12                Il buono, il brutto, il cattivo          8.8


In [7]:
#Question 5 - Top movie from each genre
df_copy = df.copy()
df_copy['Genre'] = df_copy['Genre'].astype(str)
df_copy['Genre'] = df_copy['Genre'].str.split(',')    # Some movies have more than one genre
df_copy = df_copy.explode('Genre')
df_copy['Genre'] = df_copy['Genre'].apply(lambda x: x.strip())  #Formatting of genres is inconsistent, some have extra whitespace


#Create a helper function that returns the max of a group. Then use groupby to apply that helper function to each group
def get_max_score(group):
    return group.loc[group['IMDB_Rating'].idxmax()]

top_movies_by_genre = df_copy.groupby('Genre').apply(get_max_score, include_groups=False)
print(top_movies_by_genre[['Series_Title', 'IMDB_Rating']])


                                             Series_Title  IMDB_Rating
Genre                                                                 
Action                                    The Dark Knight          9.0
Adventure   The Lord of the Rings: The Return of the King          8.9
Animation                   Sen to Chihiro no kamikakushi          8.6
Biography                                Schindler's List          8.9
Comedy                                       Gisaengchung          8.6
Crime                                       The Godfather          9.2
Drama                            The Shawshank Redemption          9.3
Family                      Sen to Chihiro no kamikakushi          8.6
Fantasy    Star Wars: Episode V - The Empire Strikes Back          8.7
Film-Noir                                    Sunset Blvd.          8.4
History                                  Schindler's List          8.9
Horror                                             Psycho          8.5
Music 

In [8]:
#Question 6, Director with the most movies

df['Director'].value_counts().idxmax()



'Alfred Hitchcock'

In [9]:
#Question 7, Star with the most movies. Different because a star can be Star1, Star2, Star3, OR Star4 and they all must be aggregated.
df[['Star1', 'Star2', 'Star3', 'Star4']].stack().value_counts().idxmax()


'Robert De Niro'

In [10]:
#Question 8, Highest grossing move for each genre
df["Gross"] = [float(str(i).replace(",", "")) for i in df["Gross"]]   #function that converts comma'd numbers stored as strings into floats

df_copy = df.copy()
df_copy['Genre'] = df_copy['Genre'].astype(str)
df_copy['Genre'] = df_copy['Genre'].str.split(',')    # Some movies have more than one genre
df_copy = df_copy.explode('Genre')
df_copy['Genre'] = df_copy['Genre'].apply(lambda x: x.strip())  #Formatting of genres is inconsistent, some have extra whitespace


#Create a helper function that returns the max of a group. Then use groupby to apply that helper function to each group
def get_max_score(group):
    return group.loc[group['Gross'].idxmax()]

top_movies_by_genre = df_copy.groupby('Genre').apply(get_max_score, include_groups=False)
print(top_movies_by_genre[['Series_Title', 'Gross']])

                                         Series_Title        Gross
Genre                                                             
Action     Star Wars: Episode VII - The Force Awakens  936662225.0
Adventure  Star Wars: Episode VII - The Force Awakens  936662225.0
Animation                               Incredibles 2  608581744.0
Biography                              The Blind Side  255959475.0
Comedy                                    Toy Story 4  434038008.0
Crime                                 The Dark Knight  534858444.0
Drama                               Avengers: Endgame  858373000.0
Family                     E.T. the Extra-Terrestrial  435110554.0
Fantasy                                        Avatar  760507625.0
Film-Noir                                   Notorious   10464000.0
History                            Gone with the Wind  198676459.0
Horror                                   The Exorcist  232906145.0
Music                               Bohemian Rhapsody  2164280

In [11]:
#question 9 save as parquet file
df.to_parquet('imdb_top_1000.parquet')   #have to install pyarrow as a dependency

In [13]:
#Question 10
print(df['Gross'].sum())

56536877976.0


In [16]:
# Exercise 2
import pydicom
from PIL import Image
import numpy as np
import imageio
import os

dicom_dir = './Data/input_images'
output_gif_path = './Data/output_gif.gif'

# Load all DICOM files from the directory
dicom_files = [os.path.join(dicom_dir, f) for f in os.listdir(dicom_dir) if f.endswith('.dcm')]

frames = []

for dicom_file in dicom_files:

    dicom_image = pydicom.dcmread(dicom_file)
    
    # Extract the pixel data
    pixel_array = dicom_image.pixel_array
    
    pixel_array = (pixel_array - np.min(pixel_array)) / (np.max(pixel_array) - np.min(pixel_array)) * 255
    pixel_array = pixel_array.astype(np.uint8)
    
    image = Image.fromarray(pixel_array)
    frames.append(image)

# Save frames as an animated GIF
imageio.mimsave(output_gif_path, frames, format='GIF', duration=5) 