In [26]:
import pandas as pd
import os

In [27]:
for dirname, _, filenames in os.walk('../datasets/movies-kaggle-df'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../datasets/movies-kaggle-df\mymoviedb.csv


In [28]:
# read file
path_file = '../datasets/movies-kaggle-df/mymoviedb.csv'
movies = pd.read_csv(path_file, engine='python', on_bad_lines='skip', quotechar='"')

In [29]:
movies.columns = movies.columns.str.replace(' ', '_').str.lower()

### **Exercise 1: Calling Series Methods**  
**Objective**: Learn how to call basic Series methods to analyze data.

**Task**:
1. Select the "Popularity" column from the dataset.
2. Use the `.min()` method to find the lowest popularity value in the dataset.
3. Use the `.max()` method to find the highest popularity value in the dataset.
4. Find the mean of the popularity column using the `.mean()` method.
5. Find the median popularity using the `.median()` method.


In [30]:
popularity = movies['popularity'] # get popularity column
print('Data type popularity:', popularity.dtype)

Data type popularity: float64


In [31]:
max_popularity = popularity.max() # get max value of popularity
min_popularity = popularity.min() # get min value of popularity
mean_popularity = popularity.mean() # get mean value of popularity
median_popularity = popularity.median() # get median value of popularity
print('Max popularity:', max_popularity)
print('Min popularity:', min_popularity)
print('Mean popularity:', mean_popularity)
print('Median popularity:', median_popularity)

Max popularity: 5083.954
Min popularity: 7.1
Mean popularity: 40.32056996031343
Median popularity: 21.191


### **Exercise 2: Series Operations**  
**Objective**: Perform basic operations on a Series.

**Task**:
1. Create a new Series by adding 10 to the "Popularity" column.
2. Create a new Series by dividing the "Popularity" column by the "Vote_Count" column.
3. Check if any value in the "Popularity" column is greater than 50.

In [32]:
print(popularity.add(10)) # add 10 to each value of popularity
vote_count = pd.to_numeric(movies['vote_count'], errors='coerce') # convert vote_count to numeric
print("====================================")
print(popularity.div(vote_count)) # divide popularity by vote_count


0       5093.954
1       3837.658
2       2628.087
3       2412.201
4       1905.511
          ...   
9832      23.357
9833      23.356
9834      23.355
9835      23.354
9836      23.354
Name: popularity, Length: 9837, dtype: float64
0        0.568675
1        3.325507
2       21.459730
3        0.473247
4        1.057173
          ...    
9832     0.014907
9833     1.669500
9834     0.142074
9835     0.087855
9836     0.071796
Length: 9837, dtype: float64


In [33]:
popularity_mask = popularity > 50 # create mask for popularity > 50
popularity_filtered = popularity[popularity_mask] # filter popularity > 50
print(popularity_filtered)

0       5083.954
1       3827.658
2       2618.087
3       2402.201
4       1895.511
          ...   
1520      50.149
1521      50.144
1522      50.118
1523      50.063
1524      50.026
Name: popularity, Length: 1514, dtype: float64


### **Exercise 3: Chaining Series Methods**  
**Objective**: Chain multiple Series methods together.

**Task**:
1. First, select the "Popularity" column.
2. Apply the `.dropna()` method to remove any missing values.
3. Apply the `.apply()` method to increase each remaining value by 20%.
4. Calculate the mean of the updated "Popularity" values using the `.mean()` method.

In [34]:
popularity.dropna(inplace=True) # Apply the `.dropna()` method to remove any missing values.
popularity.count()

np.int64(9827)

In [35]:
def increase(value):
    return value * 1.2

In [36]:
def debugger_serie(ser:pd.Series)->pd.Series:
    print("===BEFORE===")
    print(ser)
    print("===AFTER===")
    print(ser)
    return ser

In [42]:
popularity_increase = (
    popularity \
    .apply(increase) \
    .pipe(debugger_serie) \
        
)

===BEFORE===
0       6100.7448
1       4593.1896
2       3141.7044
3       2882.6412
4       2274.6132
          ...    
9832      16.0284
9833      16.0272
9834      16.0260
9835      16.0248
9836      16.0248
Name: popularity, Length: 9827, dtype: float64
===AFTER===
0       6100.7448
1       4593.1896
2       3141.7044
3       2882.6412
4       2274.6132
          ...    
9832      16.0284
9833      16.0272
9834      16.0260
9835      16.0248
9836      16.0248
Name: popularity, Length: 9827, dtype: float64


In [43]:
popularity_increase.head()

0    6100.7448
1    4593.1896
2    3141.7044
3    2882.6412
4    2274.6132
Name: popularity, dtype: float64

### **Exercise 4: Handling Missing Values**  
**Objective**: Practice handling missing values in a Series.

**Task**:
1. Identify the missing values in the "Vote_Average" column.
2. Use the `.fillna()` method to replace missing values in "Vote_Average" with the mean of the column.
3. Use the `.dropna()` method to remove rows with missing values in the "Genre" column.

In [49]:
movies.vote_average.dtype

dtype('O')

In [50]:
miss_values_vote_avg = pd.to_numeric(movies['vote_average'],errors='coerce')
print(f"Missing Values in vote_average: {miss_values_vote_avg.isna().sum()}")

Missing Values in vote_average: 11


In [52]:
miss_values_vote_avg.fillna(miss_values_vote_avg.mean(), inplace=True)
print(f"Missing Values in vote_average: {miss_values_vote_avg.isna().sum()}")

Missing Values in vote_average: 0


In [57]:
miss_values_genre = movies['genre']
print(f"Missing Values in genre: {miss_values_genre.isna().sum()}")
mask_miss_values_genre = miss_values_genre.isna()
dropna_genre = miss_values_genre[~mask_miss_values_genre]
print(f"Missing Values in genre: {dropna_genre.isna().sum()}")

Missing Values in genre: 11
Missing Values in genre: 0


In [59]:
dropna_genre.head()

0    Action, Adventure, Science Fiction
1              Crime, Mystery, Thriller
2                              Thriller
3    Animation, Comedy, Family, Fantasy
4      Action, Adventure, Thriller, War
Name: genre, dtype: object