# Import everything

In [35]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [27]:
def get_html_from_link(page_link):
    '''
        Get HTML from web page and parse it.

        :param page_link: link of the webpage we want to scrap
        :type page_link: string
        :return: BeautifulSoup object (HTML parsed)
        :rtype: bs4.BeautifulSoup
    '''
   
    response = requests.get(page_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [28]:
def get_links_to_movies(root_html):
    '''
        Extract book links from URL_BOOK_LISTE

        :param root_html: BeautifulSoup Element that contains all books links
        :type book_html: bs4.BeautifulSoup
        :return: List of all book links in the page
        :rtype: list(str)
    '''
    movie_links = []

    for element in root_html.find_all('a'):
        try:
            ref = element['href']
        except KeyError:
            print(f"Error with tag a : \n{element} \nit might not contain any href")
        if '/film/' in ref and 'critiques' not in ref and 'seances' not in ref:
            movie_links.append(ref)

    return movie_links

In [29]:
def extract_movie_info(movie_html):
    '''
        Extract book infos from URL BOOK HTML

        :param book_html: BeautifulSoup Element that contains book infos
        :type book_html: bs4.element.Tag
        :return:
            - book_title : title of the book
            - book_image_link: link to the image of the book
        :rtype: tuple(string, string, string)
    '''

    movie_title = movie_html.find('h1', {"itemprop": "name"}).text.strip() #strip enlève les tabulations et les espaces
    
    movie_release_date = movie_html.find('small', {"class": "pvi-product-year"}).text.strip()

    
    if movie_html.find('span', {'class': 'pvi-scrating-value'}) is None:
        movie_rating = 'Non indiqué'
    else:
        movie_rating = movie_html.find('span', {'class': 'pvi-scrating-value'}).text.strip()
        
        
    movie_director = movie_html.find('span', {"itemprop": "director"}).text.strip()
    
    
    if movie_html.find('span', {"itemprop": "genre"}) is None:
        movie_gender = 'Non indiqué'
    else:
        movie_gender = movie_html.find('span', {"itemprop": "genre"}).text.strip()
    
 
    if movie_html.find('span', {'class': "d-offset ecot-contact-label", 'itemprop': "name"}) is None:
        movie_actors = 'Non indiqué'
    else:
        movie_actors = movie_html.find('span', {'class': "d-offset ecot-contact-label", 'itemprop': "name"}).text.strip()
        
    
    return movie_title, movie_release_date, movie_rating, movie_director, movie_gender, movie_actors

In [30]:
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/films/tops/top111/?p={page_number}'

page_number = 1
page_link = f'https://www.senscritique.com/films/tops/top111/?p={page_number}'
print(get_html_from_link(page_link).prettify())

<!DOCTYPE html>
<!--[if IE 9]><html class="lt-ie9" lang="fr"><![endif]-->
<!--[if !IE]><!-->
<html lang="fr">
 <!--<![endif]-->
 <head>
  <title>
   Le Top 111 films - SensCritique
  </title>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).loader_config={licenseKey:"a2ca6cf22e",applicationID:"44844768"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var i=n[t]={exports:{}};e[t][0].call(i.exports,function(n){var i=e[t][1][n];return r(i||n)},i,i.exports)}return n[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<t.length;i++)r(t[i]);return r}({1:[function(e,n,t){function r(){}function i(e,n,t){return function(){return o(e,[u.now()].concat(f(arguments)),n?null:this,t),n?void 0:this}}var o=e("handle"),a=e(4),f=e(5),c=e("ee").get("tracer"),u=e("loader"),s=NREUM;"undefined"==typeof window.newrelic&&(newrelic=s);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addT

In [31]:
html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] #url du site de base + url du premier lien d'un film
print(link)
html = get_html_from_link(link)
print("Exemple sur un premier film: {} \n".format(extract_movie_info(html)))

films = []

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)
    print(result)

Error with tag a : 
<a class="lafo-footer-anchor action-popup-support">Nous contacter</a> 
it might not contain any href
Nombre de films dans cet échantillon: 111

5 premiers liens de films: ['/film/12_hommes_en_colere/370894', '/film/Harakiri/402373', '/film/Barberousse/368097', '/film/Le_Bon_la_Brute_et_le_Truand/368376', '/film/Les_Sept_Samourais/451324']

https://www.senscritique.com//film/12_hommes_en_colere/370894
Exemple sur un premier film: ('12 hommes en colère', '(1957)', '8.7', 'Sidney Lumet', 'Policier', 'Henry Fonda') 

('12 hommes en colère', '(1957)', '8.7', 'Sidney Lumet', 'Policier', 'Henry Fonda')
('Harakiri', '(1962)', '8.6', 'Masaki Kobayashi', 'Action', 'Tatsuya Nakadai')
('Barberousse', '(1965)', '8.6', 'Akira Kurosawa', 'Arts martiaux', 'Toshirō Mifune')
('Le Bon, la Brute et le Truand', '(1966)', '8.5', 'Sergio Leone', 'Western', 'Clint Eastwood')
('Les Sept Samouraïs', '(1954)', '8.5', 'Akira Kurosawa', 'Arts martiaux', 'Toshirō Mifune')
('Il était une fois en 

('Whiplash', '(2014)', '8.1', 'Damien Chazelle', 'Drame', 'Miles Teller')
('La Liste de Schindler', '(1993)', '8.1', 'Steven Spielberg', 'Biopic', 'Liam Neeson')
("L'Homme qui tua Liberty Valance", '(1962)', '8.1', 'John Ford', 'Western', 'John Wayne')
("L'Opérateur", '(1928)', '8.1', 'Buster Keaton', 'Comédie', 'Buster Keaton')
('Elephant Man', '(1980)', '8.0', 'David Lynch', 'Biopic', 'Anthony Hopkins')
('Freaks - La Monstrueuse Parade', '(1932)', '8.0', 'Tod Browning', 'Drame', 'Wallace Ford')
('Citizen Kane', '(1941)', '8.0', 'Orson Welles', 'Drame', 'Joseph Cotten')
('Paris, Texas', '(1984)', '8.0', 'Wim Wenders', 'Drame', 'Harry Dean Stanton')
('Voyage à Tokyo', '(1953)', '8.0', 'Yasujirō Ozu', 'Drame', 'Chishû Ryû')
("L'Aventure de Mme Muir", '(1947)', '8.0', 'Joseph L. Mankiewicz', 'Drame', 'Gene Tierney')
('La Garçonnière', '(1960)', '8.0', 'Billy Wilder', 'Comédie', 'Jack Lemmon')
('Baraka', '(1992)', '8.0', 'Ron Fricke', 'Musique', 'Non indiqué')
('Mr. Smith au Sénat', '(193

In [32]:
#Deuxième ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2019/2301802/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2019/2301802/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0]

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #161

#Troisième ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2020/2582670/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2020/2582670/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #211

#Quatrième ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2010/748463/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2010/748463/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)
print(len(films)) #261

#5e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2009/748526/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2009/748526/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #311

#6e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_des_annees_1980/558507/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_des_annees_1980/558507/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #361

#7e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_films_aux_meilleures_bandes_originales/511084/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_films_aux_meilleures_bandes_originales/511084/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #411

#8e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_tires_d_une_histoire_vraie/575015/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_tires_d_une_histoire_vraie/575015/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #496

#9e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_1995/748732/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_1995/748732/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #546

#10e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2005/748645/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2005/748645/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #596

#11e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2001/748669/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2001/748669/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #646

#12e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_1993/748746/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_1993/748746/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #696

#13e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2017/1522840/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2017/1522840/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #746

#14e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_de_2011/748438/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_de_2011/748438/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #796

#15e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_des_annees_1960/558502/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_des_annees_1960/558502/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #846

#16e ajout de données
URL_MOVIE_BASE = 'https://www.senscritique.com/'
URL_MOVIE_LISTE = '/top/resultats/Les_meilleurs_films_des_annees_1980/558507/?p={page_number}'
page_number = 1
page_link = f'https://www.senscritique.com/top/resultats/Les_meilleurs_films_des_annees_1980/558507/?p={page_number}'

html = get_html_from_link(page_link)
movie_links = get_links_to_movies(html)
print("Nombre de films dans cet échantillon: {}\n".format(len(movie_links)))
print("5 premiers liens de films: {}\n".format(movie_links[0:5]))

link = URL_MOVIE_BASE + movie_links[0] 

for link in movie_links:
    new_link = URL_MOVIE_BASE + link
    html = get_html_from_link(new_link)
    result = extract_movie_info(html)
    films.append(result)

print(len(films)) #896

Error with tag a : 
<a class="lafo-footer-anchor action-popup-support">Nous contacter</a> 
it might not contain any href
Nombre de films dans cet échantillon: 50

5 premiers liens de films: ['/film/Parasite/25357970', '/film/Joker/27059297', '/film/Once_Upon_a_Time_in_Hollywood/26144468', '/film/Green_Book_Sur_les_routes_du_sud/28822859', '/film/Les_Miserables/36300047']

161
Error with tag a : 
<a class="lafo-footer-anchor action-popup-support">Nous contacter</a> 
it might not contain any href
Nombre de films dans cet échantillon: 50

5 premiers liens de films: ['/film/1917/35286461', '/film/Jojo_Rabbit/31516630', '/film/The_Gentlemen/32912175', '/film/Uncut_Gems/26183649', '/film/Le_Cas_Richard_Jewell/10870556']

211
Error with tag a : 
<a class="lafo-footer-anchor action-popup-support">Nous contacter</a> 
it might not contain any href
Nombre de films dans cet échantillon: 50

5 premiers liens de films: ['/film/Inception/471143', '/film/Shutter_Island/405140', '/film/The_Social_Netwo

# Création de la dataframe

In [36]:
movies = pd.DataFrame(data=films, columns=['Title', 'Release date', 'Rating', 'Director', 'Gender', 'Principal actors'])

In [None]:
movies.to_excel("ProjetMovies.xlsx") 

In [37]:
movies

Unnamed: 0,Title,Release date,Rating,Director,Gender,Principal actors
0,12 hommes en colère,(1957),8.7,Sidney Lumet,Policier,Henry Fonda
1,Harakiri,(1962),8.6,Masaki Kobayashi,Action,Tatsuya Nakadai
2,Barberousse,(1965),8.6,Akira Kurosawa,Arts martiaux,Toshirō Mifune
3,"Le Bon, la Brute et le Truand",(1966),8.5,Sergio Leone,Western,Clint Eastwood
4,Les Sept Samouraïs,(1954),8.5,Akira Kurosawa,Arts martiaux,Toshirō Mifune
...,...,...,...,...,...,...
891,Blow Out,(1981),7.6,Brian De Palma,Thriller,John Travolta
892,Pink Floyd : The Wall,(1982),7.8,Alan Parker,Drame,Bob Geldof
893,Predator,(1987),7.1,John McTiernan,Action,Arnold Schwarzenegger
894,S.O.S. Fantômes,(1984),7.1,Ivan Reitman,Comédie,Bill Murray


# CountVectorizer

In [40]:
movies_shuffled = shuffle(movies)

In [41]:
df_train, df_test = train_test_split(movies_shuffled, test_size=0.2)

In [42]:
df_train.shape

(716, 6)

In [43]:
df_test.shape

(180, 6)

In [44]:
cv = CountVectorizer(ngram_range=(2, 2))

In [45]:
new_df = cv.fit_transform(df_train['Principal actors'])

In [46]:
new_df = pd.DataFrame(new_df.toarray(), columns=cv.get_feature_names())

In [47]:
new_df

Unnamed: 0,adam butcher,adam sandler,adrien brody,ahn seo,al kateab,al pacino,alain delon,aleksey kravchenko,alex hibbert,alexandre rodrigues,...,william holden,woody allen,yul brynner,yun jung,yves montand,yôko honna,zach galligan,zhang yu,zoé héran,éric elmosnino
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
712,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
cv.fit(df_train['Principal actors'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(2, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [49]:
train_vectorized = cv.transform(df_train['Principal actors'])
test_vectorized = cv.transform(df_test['Principal actors'])

In [50]:
train_vectorized

<716x481 sparse matrix of type '<class 'numpy.int64'>'
	with 775 stored elements in Compressed Sparse Row format>

In [51]:
test_vectorized

<180x481 sparse matrix of type '<class 'numpy.int64'>'
	with 108 stored elements in Compressed Sparse Row format>

In [52]:
X_train = train_vectorized
X_test  = test_vectorized
y_train = df_train["Rating"]
y_test = df_test["Rating"]

In [53]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=1)

# Fit logreg to the training set
logreg.fit(X_train, y_train)

# Predict test set labels
y_pred=logreg.predict(X_train)

In [54]:
logreg.score(X_train, y_train)

0.6773743016759777

In [55]:
logreg.score(X_test, y_test)

0.21666666666666667

# TF-IDF

In [56]:
tf_idf = TfidfVectorizer(ngram_range=(2, 2))

In [57]:
tf_idf.fit(df_train["Principal actors"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(2, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [58]:
train_vectorized = tf_idf.transform(df_train["Principal actors"])
test_vectorized = tf_idf.transform(df_test["Principal actors"])

In [59]:
train_vectorized

<716x481 sparse matrix of type '<class 'numpy.float64'>'
	with 775 stored elements in Compressed Sparse Row format>

In [60]:
test_vectorized

<180x481 sparse matrix of type '<class 'numpy.float64'>'
	with 108 stored elements in Compressed Sparse Row format>

In [61]:
X_train_tf = train_vectorized
X_test_tf  = test_vectorized
y_train_tf = df_train["Rating"]
y_test_tf = df_test["Rating"]

In [62]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=1)

# Fit logreg to the training set
logreg.fit(X_train_tf, y_train_tf)

# Predict test set labels
y_pred=logreg.predict(X_train_tf)

In [63]:
logreg.score(X_train_tf, y_train_tf)

0.6745810055865922

In [64]:
logreg.score(X_test_tf, y_test_tf)

0.21666666666666667

# Dummies

In [65]:
df = pd.DataFrame(data=films, columns=['Title', 'Release date', 'Rating', 'Director', 'Gender', 'Principal actor'])

In [66]:
df = shuffle(df)

In [67]:
df1 = df.drop('Rating', axis=1).drop('Release date', axis=1).drop('Title', axis=1)

In [68]:
df1

Unnamed: 0,Director,Gender,Principal actor
79,Frédéric Rossif,Historique,Philippe Meyer
127,Adam McKay,Biopic,Christian Bale
585,Joe Wright,Drame,Keira Knightley
63,Henri-Georges Clouzot,Drame,Brigitte Bardot
64,Sidney Lumet,Drame,Sean Connery
...,...,...,...
366,Nicolas Winding Refn,Drame,Ryan Gosling
394,Sofia Coppola,Drame,James Woods
561,Shane Black,Action,Robert Downey Jr.
559,Nick Park,Animation,Jean-loup Horwitz


In [69]:
notes = df['Rating']

In [70]:
notes

79     8.1
127    7.1
585    6.7
63     8.1
64     8.1
      ... 
366    7.5
394    7.2
561    7.1
559    7.0
407    8.0
Name: Rating, Length: 896, dtype: object

In [71]:
df1_dummies = pd.get_dummies(df1)

In [72]:
df1_dummies

Unnamed: 0,Director_Abel Lanzac (Antonin Baudry),Director_Adam Elliot,Director_Adam McKay,Director_Adil El Arbi,Director_Akira Kurosawa,Director_Alain Resnais,Director_Alan J. Pakula,Director_Alan Parker,Director_Albert Dupontel,Director_Albert Hughes,...,Principal actor_Woody Allen,Principal actor_Yul Brynner,Principal actor_Yun Jung-Hee,Principal actor_Yuri Solomin,Principal actor_Yves Montand,Principal actor_Yôko Honna,Principal actor_Zach Galligan,Principal actor_Zhang Yu (7),Principal actor_Zoé Héran,Principal actor_Éric Elmosnino
79,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
127,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
394,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
X_train, X_test = train_test_split(df1_dummies, test_size=0.2)
y_train, y_test = train_test_split(notes, test_size=0.2)

In [82]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=1)

# Fit logreg to the training set
logreg.fit(X_train, y_train)

# Predict test set labels
y_pred=logreg.predict(X_train)

In [83]:
logreg.score(X_train, y_train)

0.7667597765363129

In [84]:
logreg.score(X_test, y_test)

0.06111111111111111