In [2]:
# Import the dependencies
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
import seaborn as sns
import random
%matplotlib inline

In [3]:
# Load the Rotten Tomatoes movie-info table (gzip-compressed, tab-separated)
movie_info_path = '../zippedData/rt.movie_info.tsv.gz'
df = pd.read_csv(movie_info_path, sep='\t')
df

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,
...,...,...,...,...,...,...,...,...,...,...,...,...
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,"Aug 18, 2006","Jan 2, 2007",$,33886034,106 minutes,New Line Cinema
1556,1997,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,"Jul 23, 1993","Apr 17, 2001",,,88 minutes,Paramount Vantage
1557,1998,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,,"Jan 1, 1962","May 11, 2004",,,111 minutes,
1558,1999,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,"Apr 1, 1993","Jan 29, 2002",,,101 minutes,


In [4]:
# Summarise each column's dtype and non-null count for the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1560 non-null   int64 
 1   synopsis      1498 non-null   object
 2   rating        1557 non-null   object
 3   genre         1552 non-null   object
 4   director      1361 non-null   object
 5   writer        1111 non-null   object
 6   theater_date  1201 non-null   object
 7   dvd_date      1201 non-null   object
 8   currency      340 non-null    object
 9   box_office    340 non-null    object
 10  runtime       1530 non-null   object
 11  studio        494 non-null    object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


In [5]:
# Convert the 'theater_date' and 'dvd_date' columns to datetime type
# (source strings look like "Oct 9, 1971", hence the '%b %d, %Y' format)
date_columns = ['theater_date', 'dvd_date']
df[date_columns] = df[date_columns].apply(pd.to_datetime, format='%b %d, %Y')

# Strip the thousands separators from the 'box_office' column and convert it
# to a numeric dtype. The dtype guard lets this cell be re-run: once the
# column is numeric the `.str` accessor would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['box_office']):
    # regex=False: the comma is a literal, and the default for `regex`
    # differs between pandas versions
    box_office_text = df['box_office'].str.replace(',', '', regex=False)
    df['box_office'] = pd.to_numeric(box_office_text)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            1560 non-null   int64         
 1   synopsis      1498 non-null   object        
 2   rating        1557 non-null   object        
 3   genre         1552 non-null   object        
 4   director      1361 non-null   object        
 5   writer        1111 non-null   object        
 6   theater_date  1201 non-null   datetime64[ns]
 7   dvd_date      1201 non-null   datetime64[ns]
 8   currency      340 non-null    object        
 9   box_office    340 non-null    float64       
 10  runtime       1530 non-null   object        
 11  studio        494 non-null    object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(8)
memory usage: 146.4+ KB


In [7]:
# Show the rows that have no genre before removing them. An explicit display()
# is required here: a notebook only renders a cell's *last* expression, so a
# bare `df[...]` on this line would be evaluated and silently discarded.
display(df[df['genre'].isnull()])
# We should remove these rows.
df.dropna(subset=['genre'], inplace=True)

In [8]:
# Break each pipe-delimited genre string into a list, one list per movie
df['genre_list'] = df['genre'].str.split('|')

# Merge every per-movie list into a single set of distinct genre names
unique_genres = set().union(*df['genre_list'])

# Report the distinct genres and how many of them there are
print("Unique Genre Types:", unique_genres)
print("Number of Unique Genre Types:", len(unique_genres))

Unique Genre Types: {'Western', 'Sports and Fitness', 'Classics', 'Cult Movies', 'Kids and Family', 'Art House and International', 'Television', 'Anime and Manga', 'Documentary', 'Gay and Lesbian', 'Faith and Spirituality', 'Special Interest', 'Comedy', 'Science Fiction and Fantasy', 'Romance', 'Drama', 'Animation', 'Action and Adventure', 'Musical and Performing Arts', 'Horror', 'Mystery and Suspense'}
Number of Unique Genre Types: 21


In [9]:
# Display the frame to check the newly added 'genre_list' column
df

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio,genre_list
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,1971-10-09,2001-09-25,,,104 minutes,,"[Action and Adventure, Classics, Drama]"
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,2013-01-01,$,600000.0,108 minutes,Entertainment One,"[Drama, Science Fiction and Fantasy]"
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,1996-09-13,2000-04-18,,,116 minutes,,"[Drama, Musical and Performing Arts]"
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,1994-12-09,1997-08-27,,,128 minutes,,"[Drama, Mystery and Suspense]"
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,NaT,NaT,,,200 minutes,,"[Drama, Romance]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,1996,Forget terrorists or hijackers -- there's a ha...,R,Action and Adventure|Horror|Mystery and Suspense,,,2006-08-18,2007-01-02,$,33886034.0,106 minutes,New Line Cinema,"[Action and Adventure, Horror, Mystery and Sus..."
1556,1997,The popular Saturday Night Live sketch was exp...,PG,Comedy|Science Fiction and Fantasy,Steve Barron,Terry Turner|Tom Davis|Dan Aykroyd|Bonnie Turner,1993-07-23,2001-04-17,,,88 minutes,Paramount Vantage,"[Comedy, Science Fiction and Fantasy]"
1557,1998,"Based on a novel by Richard Powell, when the l...",G,Classics|Comedy|Drama|Musical and Performing Arts,Gordon Douglas,,1962-01-01,2004-05-11,,,111 minutes,,"[Classics, Comedy, Drama, Musical and Performi..."
1558,1999,The Sandlot is a coming-of-age story about a g...,PG,Comedy|Drama|Kids and Family|Sports and Fitness,David Mickey Evans,David Mickey Evans|Robert Gunter,1993-04-01,2002-01-29,,,101 minutes,,"[Comedy, Drama, Kids and Family, Sports and Fi..."
