## 0. Imports

In [125]:
import pandas as pd

## 1. Data preparation and cleaning

In [126]:
# Read the movie dataset from the Excel workbook into the working DataFrame.
workbook_path = 'movies.xlsx'
df = pd.read_excel(workbook_path)

### Dataset info

In [127]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1402 entries, 0 to 1401
Data columns (total 31 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Film                                     1402 non-null   object 
 1   Year                                     1402 non-null   int64  
 2   Script Type                              1402 non-null   object 
 3   Rotten Tomatoes  critics                 1401 non-null   object 
 4   Metacritic  critics                      1402 non-null   object 
 5   Average critics                          1402 non-null   object 
 6   Rotten Tomatoes Audience                 1401 non-null   float64
 7   Metacritic Audience                      1402 non-null   object 
 8   Rotten Tomatoes vs Metacritic  deviance  1402 non-null   object 
 9   Average audience                         1402 non-null   object 
 10  Audience vs Critics deviance             1402 no

In [128]:
# Summary statistics from describe(), transposed so each column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,1402.0,2012.140514,3.190291,2007.0,2009.0,2012.0,2015.0,2017.0
Rotten Tomatoes Audience,1401.0,62.122769,17.435737,17.0,49.0,62.0,76.0,98.0
Opening weekend ($million),1402.0,23.200357,28.547541,0.0,6.6025,14.2,28.5975,247.97
Domestic gross ($million),1402.0,74.058024,87.389737,0.0,21.09,44.045,92.59,936.66
Worldwide Gross ($million),1402.0,173.008302,233.859025,0.0,38.0,88.69,208.75,2781.5
Distributor,0.0,,,,,,,
IMDb Rating,0.0,,,,,,,
IMDB vs RT disparity,0.0,,,,,,,


In [129]:
# Count / unique / top / freq for the object-dtype columns, one row per column.
df.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
Film,1402,1396,Big Miracle,2
Script Type,1402,17,original screenplay,546
Rotten Tomatoes critics,1401,102,93,29
Metacritic critics,1402,90,51,39
Average critics,1402,196,64,25
Metacritic Audience,1402,78,66,50
Rotten Tomatoes vs Metacritic deviance,1402,82,-1,58
Average audience,1402,116,58,45
Audience vs Critics deviance,1402,78,-5,54
Primary Genre,19,11,comedy,5


### Handle missing values

In [130]:
# Number of null cells in every column.
missing_data = df.isna().sum()
# Percentage of rows that are null, kept only for columns with at least one null.
missing_percentage = missing_data.loc[missing_data.gt(0)].div(len(df)).mul(100)

In [131]:
# Print a header line followed by the per-column null counts.
print('Missing data\n--------------', missing_data, sep='\n')


Missing data
--------------
Film                                          0
Year                                          0
Script Type                                   0
Rotten Tomatoes  critics                      1
Metacritic  critics                           0
Average critics                               0
Rotten Tomatoes Audience                      1
Metacritic Audience                           0
Rotten Tomatoes vs Metacritic  deviance       0
Average audience                              0
Audience vs Critics deviance                  0
Primary Genre                              1383
Genre                                         1
Opening Weekend                               0
Opening weekend ($million)                    0
Domestic Gross                                0
Domestic gross ($million)                     0
Foreign Gross ($million)                      0
Foreign Gross                                 0
Worldwide Gross                               0
Worldwide Gr

In [132]:
# Print a header line followed by the null percentages of the affected columns.
print('Missing data percentages\n--------------', missing_percentage, sep='\n')

Missing data percentages
--------------
Rotten Tomatoes  critics       0.071327
Rotten Tomatoes Audience       0.071327
Primary Genre                 98.644793
Genre                          0.071327
Distributor                  100.000000
IMDb Rating                  100.000000
IMDB vs RT disparity         100.000000
Oscar Winners                 96.005706
Oscar Detail                  96.005706
dtype: float64


In [133]:
# Drop every row that has a null in at least one of these columns.
# The names are copied verbatim, irregular spacing included
# (presumably they mirror the workbook headers -- verify before normalising them).
required_columns = ['Genre', 'Rotten Tomatoes  critics', 'Rotten Tomatoes Audience ']
df = df.dropna(subset=required_columns, how='any')

In [134]:
# Recount the nulls per column after the row drop and display the result.
missing_data = df.isna().sum()
missing_data

Film                                          0
Year                                          0
Script Type                                   0
Rotten Tomatoes  critics                      0
Metacritic  critics                           0
Average critics                               0
Rotten Tomatoes Audience                      0
Metacritic Audience                           0
Rotten Tomatoes vs Metacritic  deviance       0
Average audience                              0
Audience vs Critics deviance                  0
Primary Genre                              1381
Genre                                         0
Opening Weekend                               0
Opening weekend ($million)                    0
Domestic Gross                                0
Domestic gross ($million)                     0
Foreign Gross ($million)                      0
Foreign Gross                                 0
Worldwide Gross                               0
Worldwide Gross ($million)              

### Check for duplicates

In [135]:
# Flag every row that is an exact copy of another row (all occurrences, since
# keep=False), then count the flagged rows.
duplicate_rows = df.duplicated(keep=False)
duplicate_rows.sum()

0

### Fix `Primary Genre` and `Genre` columns

In [136]:
# Pull out the `Genre` column and display it.
genres = df.loc[:, 'Genre']
genres

0          period, action
1                 western
2                  horror
3                 musical
4          sci-fi, horror
              ...        
1397               horror
1398                drama
1399    action, adventure
1400    action, adventure
1401            animation
Name: Genre, Length: 1400, dtype: object

#### Use one of the genres from the `Genre` column to fill the `Primary Genre` column

In [137]:
# Split each ', '-separated `Genre` string once, column-wise, instead of
# looping over rows with iterrows() and writing cell by cell.
genre_parts = df['Genre'].str.split(', ')
first_genre = genre_parts.str[0]
# .str[1] is NaN for rows that list a single genre.
second_genre = genre_parts.str[1]

# `Primary Genre` takes the second listed genre when there is one, otherwise
# the only genre; `Genre` keeps the first listed genre.
# NOTE(review): this overwrites rows that already had a `Primary Genre` value,
# exactly as the row-wise loop did -- confirm that is intended.
# NOTE: once this cell has run, `Genre` holds a single value per row, so
# running it again sets `Primary Genre` equal to `Genre`.
df = df.assign(**{
    'Primary Genre': second_genre.fillna(first_genre),
    'Genre': first_genre,
})