In [1176]:
#importing the required libraries
import pandas as pd
import matplotlib.pyplot as plt
import re

In [1177]:
# Load the raw movies dataset from CSV and preview the first 10 rows
df = pd.read_csv('movies_data.csv')
df.head(10)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
5,Outer Banks,(2020– ),"\nAction, Crime, Drama",7.6,\nA group of teenagers from the wrong side of ...,"\n \n Stars:\nChase Stokes, \nMa...",25858.0,50.0,
6,The Last Letter from Your Lover,(2021),"\nDrama, Romance",6.8,\nA pair of interwoven stories set in the past...,\n Director:\nAugustine Frizzell\n| \n S...,5283.0,110.0,
7,Dexter,(2006–2013),"\nCrime, Drama, Mystery",8.6,"\nBy day, mild-mannered Dexter is a blood-spat...","\n \n Stars:\nMichael C. Hall, \...",665387.0,53.0,
8,Never Have I Ever,(2020– ),\nComedy,7.9,\nThe complicated life of a modern-day first g...,\n \n Stars:\nMaitreyi Ramakrish...,34530.0,30.0,
9,Virgin River,(2019– ),"\nDrama, Romance",7.4,"\nSeeking a fresh start, nurse practitioner Me...",\n \n Stars:\nAlexandra Breckenr...,27279.0,44.0,


## Making a Copy of the Dataset

In [1178]:
# Work on a copy so the raw frame `df` is left as loaded
df_movies = df.copy()

In [1179]:
# Show each column's dtype and non-null count
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


In [1180]:
# Summary statistics of the data (count, mean, std, min, quartiles, max)
df_movies.describe()

Unnamed: 0,RATING,RunTime
count,8179.0,7041.0
mean,6.921176,68.688539
std,1.220232,47.258056
min,1.1,1.0
25%,6.2,36.0
50%,7.1,60.0
75%,7.8,95.0
max,9.9,853.0


In [1181]:
# Flag, per column, whether it contains at least one null value
df_movies.isnull().any()

MOVIES      False
YEAR         True
GENRE        True
RATING       True
ONE-LINE    False
STARS       False
VOTES        True
RunTime      True
Gross        True
dtype: bool

In [1182]:
# Check for duplicated column labels (this looks at column names only, not at duplicate rows)
df_movies.columns.duplicated()

array([False, False, False, False, False, False, False, False, False])

## Data Wrangling
>  This process involves tidying, cleaning, and ensuring the data is of good quality to generate insights.

In [1183]:
# Re-display dtypes and non-null counts as the starting point for the cleaning steps
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


## Issue 1
#### Renaming column names to achieve uniformity

In [1184]:
# Give every column a lowercase snake_case name, then show one random row as a spot check
df_movies = df_movies.rename(
    columns={
        'MOVIES': 'movies',
        'YEAR': 'year',
        'GENRE': 'genre',
        'RATING': 'rating',
        'ONE-LINE': 'one_line',
        'STARS': 'stars',
        'VOTES': 'votes',
        'RunTime': 'run_time',
        'Gross': 'gross',
    }
)
df_movies.sample()

Unnamed: 0,movies,year,genre,rating,one_line,stars,votes,run_time,gross
998,Spinning Out,(2020),"\nDrama, Sport",7.6,\nA figure skating Olympic hopeful's struggle ...,"\n \n Stars:\nKaya Scodelario, \...",11776,48.0,


## Issue 2
#### Genre: Removing the extra white space and newlines from the `genre` strings using string methods

In [1138]:
# Cast genre to str so string methods can be applied to every row.
# NOTE(review): on pandas versions where astype(str) stringifies NaN, rows with a
# missing genre end up holding the text 'nan' — confirm this is acceptable.
df_movies['genre'] = df_movies['genre'].astype(str)

In [1139]:
# Trim the surrounding whitespace/newlines, remove the spaces between genres,
# and turn any remaining newline into a comma
df_movies['genre'] = df_movies['genre'].map(
    lambda raw: raw.strip().replace(" ", "").replace("\n", ",")
)
df_movies['genre']

0           Action,Horror,Thriller
1       Animation,Action,Adventure
2            Drama,Horror,Thriller
3       Animation,Adventure,Comedy
4              Action,Crime,Horror
                   ...            
9994       Adventure,Drama,Fantasy
9995    Animation,Action,Adventure
9996             Documentary,Sport
9997       Adventure,Drama,Fantasy
9998       Adventure,Drama,Fantasy
Name: genre, Length: 9999, dtype: object

## Issue 3
#### One_line: Stripping the surrounding white space from the `one_line` column so only the description text remains

In [1140]:
# Trim leading/trailing whitespace (including the leading newline) from each description
df_movies['one_line'] = df_movies['one_line'].map(str.strip)

## Issue 4
#### Director: Creating a new column called `director` by extracting the directors' details from the `stars` column

In [1141]:
# Keep only the text before the first '|' in `stars` (the director segment when one
# is present) and collapse every run of 2+ whitespace characters into a single space.
# The pattern is a raw string: "\s" in a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
df_movies['director'] = df_movies['stars'].apply(
    lambda x: re.sub(r"\s\s+", " ", x.strip().split('|')[0].strip())
)

In [1142]:
def director_search(x):
    """Return the director name(s) held in a cleaned director segment.

    Parameters
    ----------
    x : str
        Text made of a label line containing 'Director' followed by a
        line with the name(s).

    Returns
    -------
    str
        The second line of ``x`` when ``x`` mentions 'Director'. The string
        'None' is returned when 'Director' is absent, or when there is no
        second line to read.
    """
    if 'Director' not in x:
        return 'None'
    lines = x.split('\n')
    # A label with no name line after it would otherwise raise IndexError;
    # treat it the same as "no director listed".
    if len(lines) < 2:
        return 'None'
    return lines[1]

In [1143]:
# Reduce each director segment to the name(s) only; cells that do not mention
# 'Director' become the string 'None'
df_movies['director'] = df_movies['director'].apply(director_search)

In [1144]:
# Count how often each director value occurs to confirm only names (or 'None') remain
df_movies['director'].value_counts()

None                                        3646
Rajiv Chilaka, Krishna Mohan Chintapatla      49
Adam Heydt                                    28
Baran bo Odar                                 23
Oliver Driver                                 21
                                            ... 
Anthony Stacchi                                1
Jing Wong, Wai-Lun Lam                         1
David Marconi                                  1
Shantrelle P. Lewis                            1
Ian Daniel, Elliot Page                        1
Name: director, Length: 3709, dtype: int64

## Issue 5
#### Stars: Extracting the exact `stars` for each movie

In [1145]:
def stars_search(x):
    """Extract the star names from a raw ``stars`` cell as one comma-separated string.

    Parameters
    ----------
    x : str
        Raw cell text. When it mentions 'Director', the stars are expected in
        the segment after the first '|'; otherwise the whole cell is treated
        as the stars segment.

    Returns
    -------
    str
        Names joined by ',' with no blank entries, or the string 'None' when
        the cell lists no stars (director-only cells and cells that yield no
        names alike).
    """
    x = x.strip()
    if 'Director' in x:
        segments = x.split('|')
        if len(segments) < 2:
            # Director listed, but nothing after it.
            return 'None'
        star = segments[1]
    else:
        star = x

    # The first line of the segment is the label line; the names follow it.
    name_lines = star.strip().split('\n')[1:]
    # Raw string: "\s" is an invalid escape sequence in a normal string literal.
    star = re.sub(r"\s\s+", " ", ",".join(name_lines))
    # Drop the empty / whitespace-only pieces produced by the ", \n" separators.
    names = [name for name in star.split(',') if name.strip()]
    # Use the same 'None' sentinel as the director-only case when nothing is left,
    # so "no stars" is represented by a single value.
    return ",".join(names) if names else 'None'

In [1146]:
# Create a new `movies_star` column holding the cleaned star names from `stars`
df_movies['movies_star'] = df_movies['stars'].apply(stars_search)

In [1147]:
# Inspect the rows whose `movies_star` is the string 'None' and preview two of them
df_movies1 = df_movies.loc[df_movies['movies_star']== 'None']
df_movies1.head(2)

Unnamed: 0,movies,year,genre,rating,one_line,stars,votes,run_time,gross,director,movies_star
1367,El caso Wanninkhof-Carabantes,(2021),"Documentary,Crime,History",6.5,"In 1999, teen Rocío Wanninkhof is murdered. He...",\n Director:\nTània Balló\n,1357,88.0,,Tània Balló,
1469,Pray Away,(2021),Documentary,6.2,"Former leaders of the ""pray the gay away"" move...",\n Director:\nKristine Stolakis\n,159,101.0,,Kristine Stolakis,


## Issue 6
#### Year: Cleaning the `year` column provided in the dataset to determine which category each title falls into: a completed movie, a completed series, or an ongoing series

In [1148]:
# Cast year to str so string methods can be applied to every row.
# NOTE(review): on pandas versions where astype(str) stringifies NaN, rows with a
# missing year end up holding the text 'nan' — confirm how those should be categorised.
df_movies['year']= df_movies['year'].astype(str)

In [1149]:
# Build `years` by removing the opening parenthesis characters from the start
# and the closing ones from the end of each `year` value
df_movies['years'] = df_movies['year'].map(
    lambda raw: raw.lstrip("(").rstrip(")")
)

In [1150]:
def dates_confirmn(x):
    """Classify a title from its cleaned year text.

    Parameters
    ----------
    x : str
        Year text without the surrounding parentheses, e.g. "2021",
        "2013– " or "2010–2022".

    Returns
    -------
    str
        'completed_movie' when the text does not split into exactly two
        parts on the en dash, 'ongoing_series' when the part after the dash
        is empty or only whitespace, and 'completed_series' otherwise.
    """
    dates = x.split("–")
    if len(dates) != 2:
        # NOTE(review): anything without exactly one en dash lands here,
        # including values that are not a plain year — confirm that is intended.
        return 'completed_movie'
    # Accept any amount of trailing whitespace (including none) after the dash;
    # comparing against exactly one space mislabels "2021–" as a completed series.
    if not dates[1].strip():
        return 'ongoing_series'
    return 'completed_series'

In [1151]:
# Derive `movie_category` by classifying each cleaned `years` value
df_movies['movie_category'] = df_movies['years'].apply(dates_confirmn)

## Issue 7
#### Rating: Filling the null values in `rating` with the column mean. This is because comparatively few ratings are missing: 1,820 null values compared to 8,179 filled values, which is less than 23% of the filled count (about 18% of all 9,999 rows)

In [1152]:
# Count how many rows have a null rating
df_movies['rating'].isnull().sum()

1820

In [1153]:
# Preview the rating column before filling the null values
df_movies['rating']

0       6.1
1       5.0
2       8.2
3       9.2
4       NaN
       ... 
9994    NaN
9995    NaN
9996    NaN
9997    NaN
9998    NaN
Name: rating, Length: 9999, dtype: float64

In [1154]:
# Fill the null ratings with the column mean, then round every rating to one decimal place
df_movies['rating'] = df_movies['rating'].fillna(df_movies['rating'].mean()).round(1)


In [1155]:
# Confirm that no null ratings remain
df_movies['rating'].isnull().sum()

0

## Issue 8
> Gross: The data cleaning process here is to fill the null values in the `gross` column with zero, as there are far more empty rows than filled rows.
- Then remove the $ sign, as it is not required in this data, before converting the values with the M suffix (which represents millions) to their full numeric values

In [1156]:
# Count how many rows have a null gross value
df_movies['gross'].isnull().sum()

9539

In [1157]:
# Fill the null gross values with 0, i.e. treat a missing gross as no recorded gross
df_movies['gross'] = df_movies['gross'].fillna(0)

In [1158]:
# Confirm that no null gross values remain
df_movies['gross'].isnull().sum()

0

In [1159]:
# Work with strings so the currency symbol can be removed
df_movies['gross'] = df_movies['gross'].astype(str)

# Remove every literal "$" (regex=False, since "$" is a regex anchor otherwise)
df_movies['gross'] = df_movies['gross'].str.replace("$", "", regex=False)

In [1160]:
def to_millions(x):
    """Expand a gross value carrying an "M" (millions) marker into a whole number.

    Parameters
    ----------
    x : str
        Gross text, e.g. "75.47M" or "0".

    Returns
    -------
    int or str
        ``float(text) * 1000000`` rounded to the nearest integer when ``x``
        contains "M"; otherwise ``x`` is returned unchanged.

    Raises
    ------
    ValueError
        If the text left after removing "M" cannot be parsed as a float.
    """
    if 'M' not in x:
        return x
    amount = float(x.replace("M", ""))
    # round() rather than int(): binary floating point can leave the product
    # slightly below the intended value (e.g. 2.01 * 1000000), and int() would
    # truncate that to 2009999 instead of 2010000.
    return int(round(amount * 1000000))
        

In [1161]:
# Apply the conversion so values marked with "M" become full numbers
df_movies['gross'] = df_movies['gross'].apply(to_millions)

In [1162]:
# Cast gross to int and print the resulting dtype as a sanity check
df_movies['gross'] = df_movies['gross'].astype(int)
print(df_movies['gross'].dtype)

int64


## Issue 9
#### Votes: Filling the `votes` null values with zero, removing the commas and then converting to the required datatype

In [1163]:
# Fill the null votes with 0
df_movies['votes'] = df_movies['votes'].fillna(0)

In [1164]:
# Work with strings so the thousands separators can be removed
df_movies['votes'] = df_movies['votes'].astype(str)

# Remove every comma so the text can later be parsed as a number
df_movies['votes'] = df_movies['votes'].str.replace(",", "", regex=False)

In [1165]:
# Cast the comma-free vote strings to int
df_movies['votes'] = df_movies['votes'].astype(int)

## Issue 10
#### Run_time: Converting the `run_time` null values to zero and casting the column to an integer type

In [1166]:
# Replace the null run times with 0, then store the column as integers
df_movies['run_time'] = df_movies['run_time'].fillna(0).astype(int)

In [1167]:
# Review the dtype of every column after the cleaning steps
df_movies.dtypes

movies             object
year               object
genre              object
rating            float64
one_line           object
stars              object
votes               int64
run_time            int64
gross               int64
director           object
movies_star        object
years              object
movie_category     object
dtype: object

### Dropping Columns
> Unnecessary columns `year`, `stars` would be dropped off. New columns have been derived from these variables and hence, they are no longer required for further analysis

In [1168]:
# Remove the raw `year` and `stars` columns; the frame is reassigned rather than modified in place
df_movies = df_movies.drop(columns=['year', 'stars'])

In [1169]:
# Preview the first 50 rows after dropping the raw columns
df_movies.head(50)

Unnamed: 0,movies,genre,rating,one_line,votes,run_time,gross,director,movies_star,years,movie_category
0,Blood Red Sky,"Action,Horror,Thriller",6.1,A woman with a mysterious illness is forced in...,21062,121,0,Peter Thorwarth,"Peri Baumeister,Carl Anton Koch,Alexander Sche...",2021,completed_movie
1,Masters of the Universe: Revelation,"Animation,Action,Adventure",5.0,The war for Eternia begins again in what may b...,17870,25,0,,"Chris Wood,Sarah Michelle Gellar,Lena Headey,M...",2021–,ongoing_series
2,The Walking Dead,"Drama,Horror,Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,885805,44,0,,"Andrew Lincoln,Norman Reedus,Melissa McBride,L...",2010–2022,completed_series
3,Rick and Morty,"Animation,Adventure,Comedy",9.2,An animated series that follows the exploits o...,414849,23,0,,"Justin Roiland,Chris Parnell,Spencer Grammer,S...",2013–,ongoing_series
4,Army of Thieves,"Action,Crime,Horror",6.9,"A prequel, set before the events of Army of th...",0,0,0,Matthias Schweighöfer,"Matthias Schweighöfer,Nathalie Emmanuel,Ruby O...",2021,completed_movie
5,Outer Banks,"Action,Crime,Drama",7.6,A group of teenagers from the wrong side of th...,25858,50,0,,"Chase Stokes,Madelyn Cline,Madison Bailey,Jona...",2020–,ongoing_series
6,The Last Letter from Your Lover,"Drama,Romance",6.8,A pair of interwoven stories set in the past a...,5283,110,0,Augustine Frizzell,"Shailene Woodley,Joe Alwyn,Wendy Nottingham,Fe...",2021,completed_movie
7,Dexter,"Crime,Drama,Mystery",8.6,"By day, mild-mannered Dexter is a blood-spatte...",665387,53,0,,"Michael C. Hall,Jennifer Carpenter,David Zayas...",2006–2013,completed_series
8,Never Have I Ever,Comedy,7.9,The complicated life of a modern-day first gen...,34530,30,0,,"Maitreyi Ramakrishnan,Poorna Jagannathan,Darre...",2020–,ongoing_series
9,Virgin River,"Drama,Romance",7.4,"Seeking a fresh start, nurse practitioner Meli...",27279,44,0,,"Alexandra Breckenridge,Martin Henderson,Colin ...",2019–,ongoing_series


## Drop Duplicates
#### There are currently 431 rows that are exact duplicates of another row, so I will drop the duplicates to know the exact number of movies we have.

In [1170]:
# Count rows that are exact duplicates of an earlier row (True) versus the rest (False)
df_movies.duplicated().value_counts()

False    9568
True      431
dtype: int64

In [1171]:
# Collect the rows flagged as duplicates and preview the first five
duplicated_df_movies = df_movies[df_movies.duplicated()]
duplicated_df_movies.head(5)

Unnamed: 0,movies,genre,rating,one_line,votes,run_time,gross,director,movies_star,years,movie_category
6833,Mighty Little Bheem,"Animation,Short,Adventure",6.9,Add a Plot,0,0,0,"Rajiv Chilaka, Krishna Mohan Chintapatla",,2019–,ongoing_series
6835,Mighty Little Bheem,"Animation,Short,Adventure",9.0,Add a Plot,6,0,0,"Rajiv Chilaka, Krishna Mohan Chintapatla",,2019–,ongoing_series
6836,Mighty Little Bheem,"Animation,Short,Adventure",9.0,Add a Plot,6,0,0,"Rajiv Chilaka, Krishna Mohan Chintapatla",,2019–,ongoing_series
6837,Mighty Little Bheem,"Animation,Short,Adventure",6.9,Add a Plot,0,0,0,"Rajiv Chilaka, Krishna Mohan Chintapatla",,2019–,ongoing_series
6838,Mighty Little Bheem,"Animation,Short,Adventure",6.9,Add a Plot,0,0,0,"Rajiv Chilaka, Krishna Mohan Chintapatla",,2019–,ongoing_series


In [1172]:
# Delete duplicate rows based on the `movies` column, keeping the first occurrence
# of each title. keep=False would discard every row whose title appears more than
# once, removing those titles from the dataset altogether instead of de-duplicating them.
# NOTE(review): de-duplicating on `movies` alone is broader than the full-row
# duplicates counted above — confirm that title-level de-duplication is intended.
df_movies = df_movies.drop_duplicates(subset=['movies'], keep='first')

In [1174]:
# Preview the first five rows of the cleaned, de-duplicated dataset
df_movies.head(5)

Unnamed: 0,movies,genre,rating,one_line,votes,run_time,gross,director,movies_star,years,movie_category
0,Blood Red Sky,"Action,Horror,Thriller",6.1,A woman with a mysterious illness is forced in...,21062,121,0,Peter Thorwarth,"Peri Baumeister,Carl Anton Koch,Alexander Sche...",2021,completed_movie
1,Masters of the Universe: Revelation,"Animation,Action,Adventure",5.0,The war for Eternia begins again in what may b...,17870,25,0,,"Chris Wood,Sarah Michelle Gellar,Lena Headey,M...",2021–,ongoing_series
2,The Walking Dead,"Drama,Horror,Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,885805,44,0,,"Andrew Lincoln,Norman Reedus,Melissa McBride,L...",2010–2022,completed_series
3,Rick and Morty,"Animation,Adventure,Comedy",9.2,An animated series that follows the exploits o...,414849,23,0,,"Justin Roiland,Chris Parnell,Spencer Grammer,S...",2013–,ongoing_series
4,Army of Thieves,"Action,Crime,Horror",6.9,"A prequel, set before the events of Army of th...",0,0,0,Matthias Schweighöfer,"Matthias Schweighöfer,Nathalie Emmanuel,Ruby O...",2021,completed_movie
