In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#load dataset
df1 = pd.read_csv("/Users/mac/Documents/moringa/phase-2/VulcanVariance/Original_Data/rt.movie_info.tsv.gz", sep='\t')
df1.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [4]:
#check the columns with the missing values
df1.isna().sum()

id                 0
synopsis          62
rating             3
genre              8
director         199
writer           449
theater_date     359
dvd_date         359
currency        1220
box_office      1220
runtime           30
studio          1066
dtype: int64

In [5]:
duplicates = df1.duplicated().sum()
duplicates


np.int64(0)

In [6]:
# make a copy of the dataset
df = df1.copy(deep=True)

## 


# DATA CLEANING


In [7]:
# drop columns with miising values and use other files that we have
df.drop(['studio','currency','box_office'], axis=1, inplace=True)
df.shape

(1560, 9)

In [8]:
# google and fill in missing rows in rating and genre
df[df.rating.isna()] #find missing rows
df.drop([10,131,1543], inplace=True, axis=0) #drop missing rows
df.rating.isna().sum() #confirm they are dropped

np.int64(0)

In [9]:
# genre missing rows
df[df.genre.isna()]
df.drop([222,250,658,1082,1342], inplace=True, axis=0)
df.genre.isna().sum()

np.int64(0)

In [10]:
#convert theatre and dvd dtype to date
df['theater_data'] = pd.to_datetime(df['theater_date'], errors='coerce')
df['dvd_date'] = pd.to_datetime(df['dvd_date'], errors='coerce')

In [11]:
# convert runtime to float
df['runtime'] = df['runtime'].str.extract(r'(\d+)').astype(float)

In [12]:
#fill all missing values 
def missing_object_value (df):
    object_col = df.select_dtypes(include='O').columns
    df[object_col] = df[object_col].fillna('unknown')

    if 'runtime' in df.columns:
        median_runtime = df['runtime'].median()
        df['runtime'] = df['runtime'].fillna(median_runtime)

    datetime_cols = df.select_dtypes(include='datetime').columns
    for col in datetime_cols:
        df[col] = df[col].astype(object).fillna('unknown')

    return df
    
missing_object_value(df)
df.head()


Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,runtime,theater_data
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971",2001-09-25 00:00:00,104.0,1971-10-09 00:00:00
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012",2013-01-01 00:00:00,108.0,2012-08-17 00:00:00
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996",2000-04-18 00:00:00,116.0,1996-09-13 00:00:00
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994",1997-08-27 00:00:00,128.0,1994-12-09 00:00:00
4,7,unknown,NR,Drama|Romance,Rodney Bennett,Giles Cooper,unknown,unknown,200.0,unknown


In [13]:
#confirm missing values are no more
df.isna().sum()

id              0
synopsis        0
rating          0
genre           0
director        0
writer          0
theater_date    0
dvd_date        0
runtime         0
theater_data    0
dtype: int64

In [14]:
# cleaned dataset
df.to_csv('../Cleaned_Data/cleaned_rt.movie_info.csv', index=False)

## Summary
- Three columns were dropped i.e studio, rating and genre
- changed dvd_date and theater_date dtypes to datetime
- changed runtime dtype to float and dropped the "min"
- filled missing values for object and datetime to unknown
- filled missing value for float with median
 

In [15]:
df

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,runtime,theater_data
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971",2001-09-25 00:00:00,104.0,1971-10-09 00:00:00
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012",2013-01-01 00:00:00,108.0,2012-08-17 00:00:00
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996",2000-04-18 00:00:00,116.0,1996-09-13 00:00:00
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994",1997-08-27 00:00:00,128.0,1994-12-09 00:00:00
4,7,unknown,NR,Drama|Romance,Rodney Bennett,Giles Cooper,unknown,unknown,200.0,unknown
...,...,...,...,...,...,...,...,...,...,...
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,unknown,unknown,"Aug 18, 2006",2007-01-02 00:00:00,106.0,2006-08-18 00:00:00
1556,1997,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,"Jul 23, 1993",2001-04-17 00:00:00,88.0,1993-07-23 00:00:00
1557,1998,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,unknown,"Jan 1, 1962",2004-05-11 00:00:00,111.0,1962-01-01 00:00:00
1558,1999,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,"Apr 1, 1993",2002-01-29 00:00:00,101.0,1993-04-01 00:00:00
