In [None]:
# Import necessary libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# 1. Read the data from .csv file

In [None]:
"""
- 1st way: data = pd.read_csv('path/to/your/file.csv')
- 2nd way: data_indexed = pd.read_csv('path/to/your/file.csv', index_col='column_name')
  The value in index_col must unique 
"""
dataset_path = 'IMDB-Movie-Data.csv'
data = pd.read_csv(dataset_path) # 1st way
data_indexed = pd.read_csv(dataset_path, index_col='Title') # 2nd way


# 2. View the data

In [None]:
"""
- data.head(): show the first 5 rows of the data
"""
data.head()

# 3. Understand some basic information about the data

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

# 4. Data selection - Indexing and Slicing data

In [None]:
"""
From the data, we can extract any column to become a Series or Dataframe, depend on the way we extract it
"""

data['Genre'] # extract the Genre column to become a Series

In [None]:
data[['Genre']] # double brackets (a list of column names): the Genre column as a DataFrame

In [None]:
"""
We can alse choose and extract multiple columns to become a Dataframe
"""

some_cols = data[['Title', 'Genre', 'Actors', 'Director', 'Rating']] # choose the column manually
print(some_cols)

In [None]:
data[['Title', 'Genre', 'Actors', 'Director', 'Rating']].iloc[10:15] # pick the columns by name, then take rows 10-14 by position using iloc

# 5. Data selection - Based on Conditional filtering

In [None]:
"""
Take movies from 2010 to 2015 with rating < 6.0 & revenue > 95% of the data
"""

data[(data['Year'] >= 2010) & (data['Year'] <= 2015) & (data['Rating'] < 6.0) & (data['Revenue (Millions)'] > data['Revenue (Millions)'].quantile(0.95))]

# 6. Groupby Operations

In [None]:
"""
We can group the data based on one or many columns
"""

data.groupby('Director')[['Rating']].mean().head() # group the data by Director and calculate the mean of Rating

# 7. Sorting Operations

In [None]:
"""
We can sort the data decending/ascending order based on one or many columns
"""

data.groupby('Director')[['Rating']].mean().sort_values(['Rating'], ascending=False).head() # sort the data by Rating

# 8. Missing value

In [None]:
"""
In the data, missing value is a common problem, we must handle it before processing the data
"""

data.isnull().sum() # check the missing value in the data in each columns

In [None]:
"""
1. Delete the column
- Delete entired column if it have too much missing value
- Add the paprameter inplace=True to apply the change to the original data
- axis=1: delete the column
- data.dropna(): delete the row
"""

data.drop('Metascore', axis=1).head()

In [None]:
"""
2. Filling the missing value
- We can fill the missing value by using mean, median, mode
"""

revenue_mean = data_indexed['Revenue (Millions)'].mean()
data_indexed['Revenue (Millions)'].fillna(revenue_mean, inplace=True)

# 9. apply() functions

In [None]:
"""
- We can apply our own function to the data's rows
"""

def rating_group(rating):
    if rating > 7.5:
        return 'Good'
    elif rating >= 6.0:
        return 'Average'
    else:
        return 'Bad'
    
# Call rating_group on every value of the Rating column and store the labels in a new column
data['Rating Category'] = data['Rating'].apply(rating_group)
data.head()