In [1]:
import pandas as pd
import os

In [9]:
for dirname, _, filenames in os.walk('../datasets/movies-kaggle-df'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../datasets/movies-kaggle-df\mymoviedb.csv


In [10]:
# read file
path_file = '../datasets/movies-kaggle-df/mymoviedb.csv'
movies = pd.read_csv(path_file, engine='python', on_bad_lines='skip', quotechar='"')

#### **Exercise 1: Understanding the Anatomy of a DataFrame**
**Objective:** Identify the structure of a DataFrame.

##### **Task:**
1. Print the first and last 5 rows of the dataset.

In [17]:
print(movies.head().to_numpy()) # first 5 titles
print("=============================================")
print(movies.tail().to_numpy()) # last 5 titles

[['2021-12-15' 'Spider-Man: No Way Home'
  'Peter Parker is unmasked and no longer able to separate his normal life from the high-stakes of being a super-hero. When he asks for help from Doctor Strange the stakes become even more dangerous, forcing him to discover what it truly means to be Spider-Man.'
  5083.954 '8940' '8.3' 'en' 'Action, Adventure, Science Fiction'
  'https://image.tmdb.org/t/p/original/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg']
 ['2022-03-01' 'The Batman'
  'In his second year of fighting crime, Batman uncovers corruption in Gotham City that connects to his own family while facing a serial killer known as the Riddler.'
  3827.658 '1151' '8.1' 'en' 'Crime, Mystery, Thriller'
  'https://image.tmdb.org/t/p/original/74xTEgt7R36Fpooo50r9T25onhq.jpg']
 ['2022-02-25' 'No Exit'
  'Stranded at a rest stop in the mountains during a blizzard, a recovering addict discovers a kidnapped child hidden in a car belonging to one of the people inside the building which sets her on a terrifying

2. Display the number of rows and columns.

In [15]:
movies_shape = movies.shape
print(f'Number of rows: {movies_shape[0]}\nNumber of columns: {movies_shape[1]}')

Number of rows: 9837
Number of columns: 9


3. Retrieve a summary of the dataset.

In [16]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837 entries, 0 to 9836
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9837 non-null   object 
 1   Title              9828 non-null   object 
 2   Overview           9828 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   object 
 5   Vote_Average       9827 non-null   object 
 6   Original_Language  9827 non-null   object 
 7   Genre              9826 non-null   object 
 8   Poster_Url         9826 non-null   object 
dtypes: float64(1), object(8)
memory usage: 691.8+ KB


#### **Exercise 2: Exploring DataFrame Attributes**
**Objective:** Understand and access key DataFrame attributes.

##### **Task:**
1. Print the column names of the dataset.

In [18]:
movies.columns.to_list()

['Release_Date',
 'Title',
 'Overview',
 'Popularity',
 'Vote_Count',
 'Vote_Average',
 'Original_Language',
 'Genre',
 'Poster_Url']

2. Retrieve a list of column names that contain numerical values.

In [28]:
movies.select_dtypes(include='number').columns.to_list()

['Popularity']

3. Display the index of the dataset.

In [30]:
movies.index

RangeIndex(start=0, stop=9837, step=1)

4. Check the dimensions of the dataset.

In [31]:
movies.shape

(9837, 9)

5. Identify the total memory usage of the dataset.

In [46]:
# Identify the total memory usage of the dataset.
print(f"Memory Used: {movies.memory_usage(deep=True).sum()}")
movies.info()

Memory Used: 7914000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9837 entries, 0 to 9836
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Release_Date       9827 non-null   datetime64[ns]
 1   Title              9828 non-null   object        
 2   Overview           9828 non-null   object        
 3   Popularity         9827 non-null   float64       
 4   Vote_Count         9827 non-null   object        
 5   Vote_Average       9827 non-null   object        
 6   Original_Language  9827 non-null   object        
 7   Genre              9826 non-null   object        
 8   Poster_Url         9826 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 691.8+ KB


#### **Exercise 3: Checking Data Types**
**Objective:** Identify the data types of each column.

##### **Task:**
1. Check the data types of all columns.

In [39]:
movies.dtypes.value_counts()

object     8
float64    1
Name: count, dtype: int64

3. Identify which columns contain categorical data.

In [41]:
movies.select_dtypes(include='object').columns.to_list()

['Release_Date',
 'Title',
 'Overview',
 'Vote_Count',
 'Vote_Average',
 'Original_Language',
 'Genre',
 'Poster_Url']

4. Convert the `Release_Date` column to a proper datetime format.

In [42]:
print(movies.Release_Date.dtype)
movies['Release_Date'] = pd.to_datetime(movies['Release_Date'], errors='coerce')
print(movies.Release_Date.dtype)

dtype('O')

#### **Exercise 4: Selecting a Column**
**Objective:** Extract specific columns from a DataFrame.

##### **Task:**
1. Select and print the `Title` column.

In [48]:
movies['Title'].head()

0    Spider-Man: No Way Home
1                 The Batman
2                    No Exit
3                    Encanto
4             The King's Man
Name: Title, dtype: object

2. Select the `Genre` and `Vote_Average` columns.

In [49]:
movies[['Genre', 'Vote_Average']].head()

Unnamed: 0,Genre,Vote_Average
0,"Action, Adventure, Science Fiction",8.3
1,"Crime, Mystery, Thriller",8.1
2,Thriller,6.3
3,"Animation, Comedy, Family, Fantasy",7.7
4,"Action, Adventure, Thriller, War",7.0


3. Extract the first 10 movie titles.

In [51]:
first_ten_movies = movies.loc[:9, 'Title']
first_ten_movies

0    Spider-Man: No Way Home
1                 The Batman
2                    No Exit
3                    Encanto
4             The King's Man
5               The Commando
6                     Scream
7                       Kimi
8       Fistful of Vengeance
9                   Eternals
Name: Title, dtype: object

4. Select all movies released after 2010.

In [52]:
movies_released_after_2010 = movies[movies['Release_Date'] > '2010-01-01']
movies_released_after_2010

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...
...,...,...,...,...,...,...,...,...,...
9826,2015-01-20,The Atticus Institute,"In the early 1970s, Dr. Henry West creates an ...",13.364,255,5.7,en,Horror,https://image.tmdb.org/t/p/original/43dvapnp9D...
9828,2013-08-23,All Is Lost,"During a solo voyage in the Indian Ocean, a ve...",13.359,1319,6.6,en,"Action, Adventure, Drama",https://image.tmdb.org/t/p/original/9cVA4oX2xH...
9833,2020-10-01,Violent Delights,A female vampire falls in love with a man she ...,13.356,8,3.5,es,Horror,https://image.tmdb.org/t/p/original/4b6HY7rud6...
9834,2016-05-06,The Offering,When young and successful reporter Jamie finds...,13.355,94,5.0,en,"Mystery, Thriller, Horror",https://image.tmdb.org/t/p/original/h4uMM1wOhz...


5. Retrieve movies that belong to the "Action" genre and have a popularity score above 50.

In [54]:
action_movies = movies[(movies['Genre'] == 'Action') & (movies['Popularity'] > 50)]
action_movies

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
37,2021-11-05,One Shot,"An elite squad of Navy SEALs, on a covert miss...",622.24,382,6.8,en,Action,https://image.tmdb.org/t/p/original/3OXiTjU30g...
136,2021-05-06,Crazy Fist,"After an opponent dies mid-match, a prominent ...",230.547,44,7.3,zh,Action,https://image.tmdb.org/t/p/original/jVAEVDNdUP...
264,2021-08-19,The Protégé,Rescued as a child by the legendary assassin M...,160.697,531,6.6,en,Action,https://image.tmdb.org/t/p/original/o9FY8N5c8C...
288,2009-09-29,Ninja Assassin,"Ninja Assassin follows Raizo, one of the deadl...",152.29,1115,6.5,en,Action,https://image.tmdb.org/t/p/original/x9UdRFI4o0...
297,2021-11-15,Never Back Down: Revolt,An amateur fighter is lured by a trafficking s...,149.856,45,6.6,en,Action,https://image.tmdb.org/t/p/original/icAG01wZyy...
302,2021-08-18,Sweet Girl,A man vows to bring justice to those responsib...,148.93,850,6.8,en,Action,https://image.tmdb.org/t/p/original/cP7odDzzFB...
309,2021-03-06,American Badger,A seemingly cold-blooded hitman is assigned to...,147.77,26,5.9,en,Action,https://image.tmdb.org/t/p/original/xoNWf2GnEF...
344,2021-01-08,Redemption Day,When his wife is kidnapped by terrorists for r...,137.585,194,6.3,en,Action,https://image.tmdb.org/t/p/original/y1Tk1LUwS3...
464,2021-09-10,Kate,A ruthless criminal operative has less than 24...,110.146,969,6.7,en,Action,https://image.tmdb.org/t/p/original/uJgdT1boTS...
1453,2014-09-26,Outcast,A mysterious warrior teams up with the daughte...,51.919,370,5.0,en,Action,https://image.tmdb.org/t/p/original/efsgCTlofz...


#### **Exercise 5: Renaming Columns**
**Objective:** Modify column names for better readability.

##### **Task:**
1. Convert all column names to lowercase.

In [56]:
print(movies.columns.to_numpy())
movies.columns = movies.columns.str.lower().str.replace(' ', '_')
print(movies.columns.to_numpy())

['Release_Date' 'Title' 'Overview' 'Popularity' 'Vote_Count'
 'Vote_Average' 'Original_Language' 'Genre' 'Poster_Url']
['release_date' 'title' 'overview' 'popularity' 'vote_count'
 'vote_average' 'original_language' 'genre' 'poster_url']


2. Rename:
    - `Vote_Average` → `Avg_Rating`
    - `Vote_Count` → `Total_Votes`
    - `Original_Language` → `Language`


In [60]:
new_columns = {'vote_average': 'avg_rating', 'vote_count': 'total_votes', 'original_language': 'language'}
movies.rename(columns=new_columns, inplace=True)

In [61]:
movies.columns.to_list()

['release_date',
 'title',
 'overview',
 'popularity',
 'total_votes',
 'avg_rating',
 'language',
 'genre',
 'poster_url']

3. Create a function that takes a DataFrame and automatically cleans column names by:
    - Converting them to lowercase.
    - Removing special characters.
    - Replacing spaces with underscores.

In [74]:
def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = df.columns.str.lower().str.replace('[^a-z0-9]', '', regex=True).str.replace(' ', '_')
    return df

In [78]:
movies.rename(columns={'genre': 'genre$'}, inplace=True)

In [79]:
print(movies.columns.to_list())

['releasedate', 'title', 'overview', 'popularity', 'totalvotes', 'avgrating', 'language', 'genre$', 'posterurl']


In [80]:
movies = clean_columns(movies)
print(movies.columns.to_list())

['releasedate', 'title', 'overview', 'popularity', 'totalvotes', 'avgrating', 'language', 'genre', 'posterurl']
