# <h1 align="center">Data Science applied to Movies Ratings</h1>
# <h2 align="center">**By: Juan Carlos Abril Ramírez**</h2>
# <h3 align="center">*Date: August xx of 2022*</h3>

---



*First, we import the necessary libraries for data cleansing and ETL*

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

*Now, we proceed to call the CSV file, stored in my Repository. Then, we visualize the first 5 rows of the newly created table*

In [None]:
# Load the raw IMDB top-1000 table from the CSV that sits next to the notebook.
imdb_movies = pd.read_csv("imdb_top_1000.csv")

# Preview the first five rows (head() defaults to n=5).
imdb_movies.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


*We then check the number of rows (should be 1000, since it's a top 1000!) and columns (should be the same as displayed in the output above)*

In [None]:
# (rows, columns) of the frame exactly as loaded from the CSV.
imdb_movies.shape

(1000, 16)

*We then check the full info of the dataframe, to decide what to remove and what to use. Notice that there are many missing (null) values in some columns*

In [None]:
# Per-column non-null counts and dtypes; columns with fewer than 1000 non-null
# entries contain missing values.
imdb_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


*Now we can visualize the statistical description of every column*

In [None]:
# include='all' summarises the object (text) columns as well as the numeric ones,
# so count/unique/top/freq and mean/std/quantiles appear in the same table.
imdb_movies.describe(include='all')

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
count,1000,1000,1000.0,899,1000,1000,1000.0,1000,843.0,1000,1000,1000,1000,1000,1000.0,831.0
unique,1000,999,100.0,16,140,202,,1000,,548,660,841,891,939,,823.0
top,https://m.media-amazon.com/images/M/MV5BMDFkYT...,Drishyam,2014.0,U,100 min,Drama,,Two imprisoned men bond over a number of years...,,Alfred Hitchcock,Tom Hanks,Emma Watson,Rupert Grint,Michael Caine,,4360000.0
freq,1,2,32.0,234,23,85,,1,,14,12,7,5,4,,5.0
mean,,,,,,,7.9493,,77.97153,,,,,,273692.9,
std,,,,,,,0.275491,,12.376099,,,,,,327372.7,
min,,,,,,,7.6,,28.0,,,,,,25088.0,
25%,,,,,,,7.7,,70.0,,,,,,55526.25,
50%,,,,,,,7.9,,79.0,,,,,,138548.5,
75%,,,,,,,8.1,,87.0,,,,,,374161.2,


*We could use the Genre column values as possibly one of the predictors, so we first count how often each genre combination appears before clustering it into groups*

In [None]:
# Frequency of each distinct Genre string. At this point a value is still the
# whole comma-separated combination (e.g. "Drama, Romance"), not a single genre.
imdb_movies["Genre"].value_counts().to_frame()

Unnamed: 0,Genre
Drama,85
"Drama, Romance",37
"Comedy, Drama",35
"Comedy, Drama, Romance",31
"Action, Crime, Drama",30
...,...
"Adventure, Thriller",1
"Animation, Action, Sci-Fi",1
"Action, Crime, Comedy",1
"Animation, Crime, Mystery",1


In [None]:
# Turn each comma-separated Genre string into a list of genres so the column can
# later be exploded into one row per (movie, genre).
# The isinstance guard keeps this cell safe to re-run: calling `.str.split` on
# values that are already lists would replace every genre with NaN.
imdb_movies['Genre'] = imdb_movies['Genre'].apply(
    lambda genres: genres.split(', ') if isinstance(genres, str) else genres
)

# For every individual genre: how many titles carry it and their mean IMDB rating.
Average_Score_per_Genre = (imdb_movies.explode('Genre')
        .groupby('Genre')[['Series_Title', 'IMDB_Rating']]
        .agg({'Series_Title':'size', 'IMDB_Rating':'mean'})
      )
print(Average_Score_per_Genre)

           Series_Title  IMDB_Rating
Genre                               
Action              189     7.948677
Adventure           196     7.952041
Animation            82     7.930488
Biography           109     7.935780
Comedy              233     7.903433
Crime               209     7.954545
Drama               724     7.959392
Family               56     7.912500
Fantasy              66     7.931818
Film-Noir            19     7.989474
History              56     7.953571
Horror               32     7.887500
Music                35     7.914286
Musical              17     7.947059
Mystery              99     7.967677
Romance             125     7.925600
Sci-Fi               67     7.977612
Sport                19     7.926316
Thriller            137     7.909489
War                  51     8.013725
Western              20     8.000000


In [None]:
# Relies on the previous cell having turned Genre into lists: explode() then
# produces one row per (movie, genre) and repeats the original index label on
# every copy of a movie.
imdb_movies_1=imdb_movies.explode('Genre')
imdb_movies_1

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Crime,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Drama,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Action,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Crime,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,Drama,7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,War,7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,
999,https://m.media-amazon.com/images/M/MV5BMTY5OD...,The 39 Steps,1935,,86 min,Crime,7.6,A man in London tries to help a counter-espion...,93.0,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853,
999,https://m.media-amazon.com/images/M/MV5BMTY5OD...,The 39 Steps,1935,,86 min,Mystery,7.6,A man in London tries to help a counter-espion...,93.0,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853,


*We now change one of the column titles to make it more concrete, from Series Title to Movie Title*

In [None]:
# Rename by label instead of by position. Writing into `columns.values` edits the
# Index's backing array in place; an Index is meant to be immutable, and the same
# Index object may be shared with other frames, so that edit is not safe.
# rename() builds new labels and is a no-op if the column was already renamed.
imdb_movies_1 = imdb_movies_1.rename(columns={"Series_Title": "Movie Title"})
imdb_movies_1

Unnamed: 0,Poster_Link,Movie Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Crime,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,Drama,9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Action,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,Crime,9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,Drama,7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97 min,War,7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,
999,https://m.media-amazon.com/images/M/MV5BMTY5OD...,The 39 Steps,1935,,86 min,Crime,7.6,A man in London tries to help a counter-espion...,93.0,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853,
999,https://m.media-amazon.com/images/M/MV5BMTY5OD...,The 39 Steps,1935,,86 min,Mystery,7.6,A man in London tries to help a counter-espion...,93.0,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853,


*Now we clean all of the column names and remove the unnecessary columns that are not relevant to our goal*

In [None]:
# Replace underscores with spaces in every column label (this relabels
# imdb_movies_1 itself, not a copy).
imdb_movies_1.columns = [label.replace("_", " ") for label in imdb_movies_1.columns]

# Columns that are not used in the rest of the analysis.
dropped_columns = ["Poster Link", "Certificate", "Overview", "Meta score", "Gross"]
imdb_movies_2 = imdb_movies_1.drop(columns=dropped_columns)
imdb_movies_2

Unnamed: 0,Movie Title,Released Year,Runtime,Genre,IMDB Rating,Director,Star1,Star2,Star3,Star4,No of Votes
0,The Shawshank Redemption,1994,142 min,Drama,9.3,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110
1,The Godfather,1972,175 min,Crime,9.2,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367
1,The Godfather,1972,175 min,Drama,9.2,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367
2,The Dark Knight,2008,152 min,Action,9.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232
2,The Dark Knight,2008,152 min,Crime,9.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232
...,...,...,...,...,...,...,...,...,...,...,...
998,Lifeboat,1944,97 min,Drama,7.6,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471
998,Lifeboat,1944,97 min,War,7.6,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471
999,The 39 Steps,1935,86 min,Crime,7.6,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853
999,The 39 Steps,1935,86 min,Mystery,7.6,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853


*Now we reuse value_counts on the now split Genre column, to determine the filtering point*

In [None]:
# Genre now holds a single genre per row, so this counts rows per individual genre.
imdb_movies_2["Genre"].value_counts().to_frame()

Unnamed: 0,Genre
Drama,724
Comedy,233
Crime,209
Adventure,196
Action,189
Thriller,137
Romance,125
Biography,109
Mystery,99
Animation,82


*We now filter the movies by their genre count. To focus on the most common genres, we remove genres with counts of 200 or fewer: we build a boolean mask over the counts and use it to collect the remaining genre names into a new list*

In [None]:
# Rows per individual genre, most frequent first.
Genre_count = imdb_movies_2["Genre"].value_counts()

# Boolean mask: True for genres that appear on more than 200 rows.
new_Genre_count = Genre_count.gt(200)

# Names of the genres that pass the threshold, as a plain Python list.
filtered_genres = Genre_count[new_Genre_count].index.tolist()

In [None]:
# Keep only the rows whose genre passed the frequency threshold.
genre_mask = imdb_movies_2['Genre'].isin(filtered_genres)
imdb_movies_3 = imdb_movies_2[genre_mask]

rows_before = len(imdb_movies_2)  # rows before the genre filter
rows_after = len(imdb_movies_3)   # rows that survived the genre filter

print("Number of rows of original dataframe is {}.".format(rows_before))
print("Number of rows of processed dataframe is {}.".format(rows_after))
print("{} rows removed!".format(rows_before - rows_after))

Number of rows of original dataframe is 2541.
Number of rows of processed dataframe is 1166.
1375 rows removed!


*Another filter we can use, to test the model, is that we only take movies that have IMDB Ratings higher than 8.0*

In [None]:
# Strictly greater than 8.0, so movies rated exactly 8.0 are excluded.
high_rating_mask = imdb_movies_3['IMDB Rating'].gt(8.0)
imdb_movies_4 = imdb_movies_3.loc[high_rating_mask]

In [None]:
# Row counts before and after the rating filter.
rows_before, rows_after = len(imdb_movies_3), len(imdb_movies_4)

report_lines = (
    ("Number of rows of original dataframe is {}.", rows_before),
    ("Number of rows of processed dataframe is {}.", rows_after),
    ("{} rows removed!", rows_before - rows_after),
)
for template, value in report_lines:
    print(template.format(value))

Number of rows of original dataframe is 1166.
Number of rows of processed dataframe is 380.
786 rows removed!


*We now merge the rows of the new data set to remove duplicates (I did that by grouping on every column except Genre and joining each movie's genres back into one string), and now we have a very selective list of movies, with the most frequent genres and a rating higher than average. Then, we sort the dataframe by descending score, just for fun!*

In [None]:
# Every column except Genre identifies a movie; grouping on all of them collapses
# the one-row-per-genre layout back to one row per movie.
merge_keys = [
    'Movie Title', 'Released Year', 'Runtime', 'IMDB Rating', 'Director',
    'Star1', 'Star2', 'Star3', 'Star4', 'No of Votes',
]
imdb_movies_5 = (
    imdb_movies_4
    .groupby(merge_keys)['Genre']
    .apply(', '.join)   # re-join the surviving genres of each movie
    .reset_index()
)

# Displayed only: the sorted view is not stored back into imdb_movies_5.
imdb_movies_5.sort_values('IMDB Rating', ascending=False)


Unnamed: 0,Movie Title,Released Year,Runtime,IMDB Rating,Director,Star1,Star2,Star3,Star4,No of Votes,Genre
245,The Shawshank Redemption,1994,142 min,9.3,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,Drama
226,The Godfather,1972,175 min,9.2,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,"Crime, Drama"
0,12 Angry Men,1957,96 min,9.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,"Crime, Drama"
221,The Dark Knight,2008,152 min,9.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,"Crime, Drama"
227,The Godfather: Part II,1974,202 min,9.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,"Crime, Drama"
...,...,...,...,...,...,...,...,...,...,...,...
162,PK,2014,153 min,8.1,Rajkumar Hirani,Aamir Khan,Anushka Sharma,Sanjay Dutt,Boman Irani,163061,"Comedy, Drama"
165,Paper Moon,1973,102 min,8.1,Peter Bogdanovich,Ryan O'Neal,Tatum O'Neal,Madeline Kahn,John Hillerman,42285,"Comedy, Crime, Drama"
166,"Paris, Texas",1984,145 min,8.1,Wim Wenders,Harry Dean Stanton,Nastassja Kinski,Dean Stockwell,Aurore Clément,91188,Drama
56,Dilwale Dulhania Le Jayenge,1995,189 min,8.1,Aditya Chopra,Shah Rukh Khan,Kajol,Amrish Puri,Farida Jalal,63516,Drama


In [None]:
# Combine the four star columns into one "Stars" string per movie, separated by
# ", ". A missing value in any of the four yields a missing result, exactly as
# chained `+` on the columns does.
imdb_movies_6 = imdb_movies_5.assign(
    Stars=lambda frame: frame['Star1'].str.cat(
        frame[['Star2', 'Star3', 'Star4']], sep=', '
    )
)

In [None]:
# The individual star columns are now covered by "Stars"; the vote count is
# dropped along with them.
redundant_columns = ['Star1', 'Star2', 'Star3', 'Star4', 'No of Votes']
imdb_movies_7 = imdb_movies_6.drop(columns=redundant_columns)

# Displayed only: the sorted view is not stored back into imdb_movies_7.
imdb_movies_7.sort_values(by='IMDB Rating', ascending=False)

Unnamed: 0,Movie Title,Released Year,Runtime,IMDB Rating,Director,Genre,Stars
245,The Shawshank Redemption,1994,142 min,9.3,Frank Darabont,Drama,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
226,The Godfather,1972,175 min,9.2,Francis Ford Coppola,"Crime, Drama","Marlon Brando, Al Pacino, James Caan, Diane Ke..."
0,12 Angry Men,1957,96 min,9.0,Sidney Lumet,"Crime, Drama","Henry Fonda, Lee J. Cobb, Martin Balsam, John ..."
221,The Dark Knight,2008,152 min,9.0,Christopher Nolan,"Crime, Drama","Christian Bale, Heath Ledger, Aaron Eckhart, M..."
227,The Godfather: Part II,1974,202 min,9.0,Francis Ford Coppola,"Crime, Drama","Al Pacino, Robert De Niro, Robert Duvall, Dian..."
...,...,...,...,...,...,...,...
162,PK,2014,153 min,8.1,Rajkumar Hirani,"Comedy, Drama","Aamir Khan, Anushka Sharma, Sanjay Dutt, Boman..."
165,Paper Moon,1973,102 min,8.1,Peter Bogdanovich,"Comedy, Crime, Drama","Ryan O'Neal, Tatum O'Neal, Madeline Kahn, John..."
166,"Paris, Texas",1984,145 min,8.1,Wim Wenders,Drama,"Harry Dean Stanton, Nastassja Kinski, Dean Sto..."
56,Dilwale Dulhania Le Jayenge,1995,189 min,8.1,Aditya Chopra,Drama,"Shah Rukh Khan, Kajol, Amrish Puri, Farida Jalal"


In [None]:
# Summary of the final frame; include='all' covers the text columns as well as
# the numeric IMDB Rating.
imdb_movies_7.describe(include='all')

Unnamed: 0,Movie Title,Released Year,Runtime,IMDB Rating,Director,Genre,Stars
count,282,282.0,282,282.0,282,282,282
unique,281,83.0,110,,192,7,282
top,Drishyam,2014.0,130 min,,Akira Kurosawa,Drama,"Henry Fonda, Lee J. Cobb, Martin Balsam, John ..."
freq,2,10.0,10,,9,164,1
mean,,,,8.258156,,,
std,,,,0.210796,,,
min,,,,8.1,,,
25%,,,,8.1,,,
50%,,,,8.2,,,
75%,,,,8.3,,,


In [None]:
# Persist the filtered frame next to the notebook. to_csv defaults to index=True,
# so the row index is written as an unnamed first column.
imdb_movies_7.to_csv('imdb_filtered_dataframe.csv')