In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('scrapperv8.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,actors,date,movie,press_rating,producer,spectators_rating
0,0,Patrick Wilson ; Rose Byrne ; Ty Simpkins ;,15 juin 2011,Insidious,3.2,James Wan,3.5
1,1,Jean-Paul Rouve ; Isabelle Nanty ; Claire Nade...,1 juillet 2011,Les Tuche,2.3,Olivier Baroux,2.6
2,2,Leonardo DiCaprio ; Marion Cotillard ; Ellen P...,21 juillet 2010,Inception,4.1,Christopher Nolan,4.4
3,3,Michael Douglas ; Shia LaBeouf ; Josh Brolin ;,29 septembre 2010,Wall Street : l'argent ne dort jamais,2.3,Oliver Stone,2.6
4,4,Sarah Jessica Parker ; Kim Cattrall ; Kristin ...,2 juin 2010,Sex and the City 2,2.5,Michael Patrick King,2.4


In [4]:
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,actors,date,movie,press_rating,producer,spectators_rating
0,Patrick Wilson ; Rose Byrne ; Ty Simpkins ;,15 juin 2011,Insidious,3.2,James Wan,3.5
1,Jean-Paul Rouve ; Isabelle Nanty ; Claire Nade...,1 juillet 2011,Les Tuche,2.3,Olivier Baroux,2.6
2,Leonardo DiCaprio ; Marion Cotillard ; Ellen P...,21 juillet 2010,Inception,4.1,Christopher Nolan,4.4
3,Michael Douglas ; Shia LaBeouf ; Josh Brolin ;,29 septembre 2010,Wall Street : l'argent ne dort jamais,2.3,Oliver Stone,2.6
4,Sarah Jessica Parker ; Kim Cattrall ; Kristin ...,2 juin 2010,Sex and the City 2,2.5,Michael Patrick King,2.4


### Notes

Columns analysis:
- movie (= name) not relevant
- date ?
- output is press_rating or spectators_rating

To do:
- Missing values
- Actors and Producer columns transformation (one column by actor/producer with 0/1)

In [5]:
df.describe()

Unnamed: 0,press_rating,spectators_rating
count,384.0,410.0
mean,3.23151,3.57878
std,0.643726,0.63146
min,1.4,1.2
25%,2.8,3.3
50%,3.2,3.7
75%,3.7,4.1
max,4.7,4.6


In [6]:
def get_features(df, column):
    features = []
    for i in range(len(df)):
        line_features = str(df[column][i]).split(' ; ')
        for feature in line_features:
            if feature not in features:
                features.append(feature)
    if '' in features:
        features.remove('')
    return features

In [7]:
actors = get_features(df, 'actors') # 827 actors
producers = get_features(df, 'producer') # 334 producers

In [8]:
def create_binary_columns(df, l, column): # l is the list of features
    for i in range(len(df)):
        try:
            for elt in l:
                if elt in df.loc[i][column]:
                    df.loc[i, elt] = 1
                else:
                    df.loc[i, elt] = 0
        except TypeError:
            df.loc[i, elt] = np.nan
        if i%200 == 0:
            print(df.loc[i])
    return df

In [9]:
create_binary_columns(df, actors, 'actors')

actors                       Patrick Wilson ; Rose Byrne ; Ty Simpkins ; 
date                                                         15 juin 2011
movie                                                           Insidious
press_rating                                                          3.2
producer                                                        James Wan
spectators_rating                                                     3.5
Patrick Wilson                                                          1
Rose Byrne                                                              1
Ty Simpkins                                                             1
Jean-Paul Rouve                                                         0
Isabelle Nanty                                                          0
Claire Nadeau                                                           0
Leonardo DiCaprio                                                       0
Marion Cotillard                      

actors                       Jessica Chastain ; Mark Strong ; Sam Waterston ; 
date                                                               8 mars 2017
movie                                                              Miss Sloane
press_rating                                                               3.2
producer                                                           John Madden
spectators_rating                                                          4.1
Patrick Wilson                                                               0
Rose Byrne                                                                   0
Ty Simpkins                                                                  0
Jean-Paul Rouve                                                              0
Isabelle Nanty                                                               0
Claire Nadeau                                                                0
Leonardo DiCaprio                                   

Unnamed: 0,actors,date,movie,press_rating,producer,spectators_rating,Patrick Wilson,Rose Byrne,Ty Simpkins,Jean-Paul Rouve,...,Soo-an Kim,Yumi Jung,Nicolás Durán,Alejandro Goic,Gastón Salgado,Bárbara Lennie,Ana Wagener,Ross O'Hennessy,Ben Loyd-Holmes,Silvio Simac
0,Patrick Wilson ; Rose Byrne ; Ty Simpkins ;,15 juin 2011,Insidious,3.2,James Wan,3.5,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Jean-Paul Rouve ; Isabelle Nanty ; Claire Nade...,1 juillet 2011,Les Tuche,2.3,Olivier Baroux,2.6,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Leonardo DiCaprio ; Marion Cotillard ; Ellen P...,21 juillet 2010,Inception,4.1,Christopher Nolan,4.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Michael Douglas ; Shia LaBeouf ; Josh Brolin ;,29 septembre 2010,Wall Street : l'argent ne dort jamais,2.3,Oliver Stone,2.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Sarah Jessica Parker ; Kim Cattrall ; Kristin ...,2 juin 2010,Sex and the City 2,2.5,Michael Patrick King,2.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Rémy Girard ; Lubna Azabal ; Mélissa Désormeau...,12 janvier 2011,Incendies,3.8,Denis Villeneuve,4.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Sami Bouajila ; Denis Podalydès ; Maurice Béni...,22 juin 2011,Omar m'a tuer,3.3,Roschdy Zem,3.8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Natalie Portman ; Mila Kunis ; Vincent Cassel ;,9 février 2011,Black Swan,4.1,Darren Aronofsky,4.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,François Cluzet ; Marion Cotillard ; Benoît Ma...,20 octobre 2010,Les petits mouchoirs,2.9,Guillaume Canet,3.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Leonardo DiCaprio ; Mark Ruffalo ; Ben Kingsle...,24 février 2010,Shutter Island,3.8,Martin Scorsese,4.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
create_binary_columns(df, producers, 'producer')

actors                       Patrick Wilson ; Rose Byrne ; Ty Simpkins ; 
date                                                         15 juin 2011
movie                                                           Insidious
press_rating                                                          3.2
producer                                                        James Wan
spectators_rating                                                     3.5
Patrick Wilson                                                          1
Rose Byrne                                                              1
Ty Simpkins                                                             1
Jean-Paul Rouve                                                         0
Isabelle Nanty                                                          0
Claire Nadeau                                                           0
Leonardo DiCaprio                                                       0
Marion Cotillard                      

actors                       Jessica Chastain ; Mark Strong ; Sam Waterston ; 
date                                                               8 mars 2017
movie                                                              Miss Sloane
press_rating                                                               3.2
producer                                                           John Madden
spectators_rating                                                          4.1
Patrick Wilson                                                               0
Rose Byrne                                                                   0
Ty Simpkins                                                                  0
Jean-Paul Rouve                                                              0
Isabelle Nanty                                                               0
Claire Nadeau                                                                0
Leonardo DiCaprio                                   

Unnamed: 0,actors,date,movie,press_rating,producer,spectators_rating,Patrick Wilson,Rose Byrne,Ty Simpkins,Jean-Paul Rouve,...,Stéphane De Freitas,Matthias Hoene,John Madden,Daniel Calparsoro,Blandine Lenoir,Sang-Ho Yeon,Bruno Chiche,Fernando Guzzoni,Oriol Paulo,Simon Wells
0,Patrick Wilson ; Rose Byrne ; Ty Simpkins ;,15 juin 2011,Insidious,3.2,James Wan,3.5,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Jean-Paul Rouve ; Isabelle Nanty ; Claire Nade...,1 juillet 2011,Les Tuche,2.3,Olivier Baroux,2.6,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Leonardo DiCaprio ; Marion Cotillard ; Ellen P...,21 juillet 2010,Inception,4.1,Christopher Nolan,4.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Michael Douglas ; Shia LaBeouf ; Josh Brolin ;,29 septembre 2010,Wall Street : l'argent ne dort jamais,2.3,Oliver Stone,2.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Sarah Jessica Parker ; Kim Cattrall ; Kristin ...,2 juin 2010,Sex and the City 2,2.5,Michael Patrick King,2.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Rémy Girard ; Lubna Azabal ; Mélissa Désormeau...,12 janvier 2011,Incendies,3.8,Denis Villeneuve,4.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Sami Bouajila ; Denis Podalydès ; Maurice Béni...,22 juin 2011,Omar m'a tuer,3.3,Roschdy Zem,3.8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Natalie Portman ; Mila Kunis ; Vincent Cassel ;,9 février 2011,Black Swan,4.1,Darren Aronofsky,4.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,François Cluzet ; Marion Cotillard ; Benoît Ma...,20 octobre 2010,Les petits mouchoirs,2.9,Guillaume Canet,3.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Leonardo DiCaprio ; Mark Ruffalo ; Ben Kingsle...,24 février 2010,Shutter Island,3.8,Martin Scorsese,4.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.head()

Unnamed: 0,actors,date,movie,press_rating,producer,spectators_rating,Patrick Wilson,Rose Byrne,Ty Simpkins,Jean-Paul Rouve,...,Stéphane De Freitas,Matthias Hoene,John Madden,Daniel Calparsoro,Blandine Lenoir,Sang-Ho Yeon,Bruno Chiche,Fernando Guzzoni,Oriol Paulo,Simon Wells
0,Patrick Wilson ; Rose Byrne ; Ty Simpkins ;,15 juin 2011,Insidious,3.2,James Wan,3.5,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Jean-Paul Rouve ; Isabelle Nanty ; Claire Nade...,1 juillet 2011,Les Tuche,2.3,Olivier Baroux,2.6,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Leonardo DiCaprio ; Marion Cotillard ; Ellen P...,21 juillet 2010,Inception,4.1,Christopher Nolan,4.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Michael Douglas ; Shia LaBeouf ; Josh Brolin ;,29 septembre 2010,Wall Street : l'argent ne dort jamais,2.3,Oliver Stone,2.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Sarah Jessica Parker ; Kim Cattrall ; Kristin ...,2 juin 2010,Sex and the City 2,2.5,Michael Patrick King,2.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df.to_csv('data_processing_2.csv')