# Initialisation
## Import modules

In [1]:
%matplotlib notebook
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from modules import cleanmodule as cm # pip install easymoney
from modules import visumodule # pip install seaborn


## Import data

In [2]:
# Location of the IMDb movies CSV. The IMDB_MOVIES_CSV environment variable
# overrides the default below, so the notebook is not tied to one machine's
# absolute path.
DATA_PATH = os.environ.get(
    "IMDB_MOVIES_CSV",
    "/Users/isar/PycharmProjects/films-data-science/datasets/films-imdb/IMDb movies.csv",
)

# low_memory=False parses the file in one pass, which avoids mixed-type
# inference on columns that would otherwise be read chunk by chunk.
df = pd.read_csv(DATA_PATH, low_memory=False)
vm = visumodule.Visualiser(df)  # project visualisation helper built on the raw frame
df.dtypes

imdb_title_id             object
title                     object
original_title            object
year                      object
date_published            object
genre                     object
duration                   int64
country                   object
language                  object
director                  object
writer                    object
production_company        object
actors                    object
description               object
avg_vote                 float64
votes                      int64
budget                    object
usa_gross_income          object
worlwide_gross_income     object
metascore                float64
reviews_from_users       float64
reviews_from_critics     float64
dtype: object

## Définition succès

In [3]:
# Keep only the films that have a metascore. The explicit .copy() makes the
# filtered frame independent of the frame it was sliced from, so adding the
# `isgood` column below does not raise pandas' SettingWithCopyWarning.
df = df[df['metascore'].notna()].copy()
# Binary label: 1 when the metascore is strictly above 56, otherwise 0.
df['isgood'] = np.where(df['metascore'] > 56, 1, 0)
# Class balance of the label
df['isgood'].value_counts()

1    6682
0    6623
Name: isgood, dtype: int64

## Nettoyage data

In [4]:
print(len(df.index)) # number of rows before cleaning

# Normalise the country field with the project helper, then force plain strings
df['country'] = df['country'].apply(cm.clean_countries).astype('str')

# The unit must be explicit: pandas 2.x refuses the unit-less 'datetime64' dtype.
df['date_published'] = df['date_published'].apply(cm.clean_dates).astype('datetime64[ns]')

# clean_currency is applied row-wise (axis=1) and receives the name of the
# column to convert as its extra positional argument.
# NOTE: "worlwide_gross_income" is the dataset's own spelling of the column name.
for money_col in ("budget", "worlwide_gross_income"):
    df[money_col] = df.apply(cm.clean_currency, axis=1, args=(money_col,)).astype('float')

df.head()


13305


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics,isgood
76,tt0006864,Intolerance,Intolerance: Love's Struggle Throughout the Ages,1916,1918-02-24,"Drama, History",163,USA,English,D.W. Griffith,...,"The story of a poor young woman, separated by ...",7.8,13875,3377076.62,,,99.0,111.0,77.0,1
506,tt0017136,Metropolis,Metropolis,1927,1928-10-01,"Drama, Sci-Fi",153,Germany,German,Fritz Lang,...,In a futuristic city sharply divided between t...,8.3,156076,33691028.3,$ 1236166,11811336.55,98.0,495.0,208.0,1
566,tt0018037,Il cantante di jazz,The Jazz Singer,1927,1928-02-04,"Drama, Music, Musical",88,USA,English,Alan Crosland,...,The son of a Jewish Cantor must defy the tradi...,6.5,8866,3692926.88,,,66.0,104.0,63.0,1
628,tt0018773,Il circo,The Circus,1928,1928-10-01,"Comedy, Romance",72,USA,English,Charles Chaplin,...,The Tramp finds work and the girl of his dream...,8.1,27414,7875910.4,,235542.23,90.0,327.0,65.0,1
714,tt0019777,The Cocoanuts,The Cocoanuts,1929,1929-08-03,"Comedy, Musical",96,USA,English,"Robert Florey, Joseph Santley",...,"During the Florida land boom,",7.0,6900,4375505.78,,,69.0,71.0,43.0,1


## Isolation colonnes pertinentes

In [5]:
tokeep = ['genre', 'language', 'director', 'duration']

# Discard every row that is missing a value in at least one of the kept columns
df = df.dropna(subset=tokeep)

# Feature frame restricted to the kept columns
inputs = df.loc[:, tokeep]
inputs

#vm.distribution("duration")

Unnamed: 0,genre,language,director,duration
76,"Drama, History",English,D.W. Griffith,163
506,"Drama, Sci-Fi",German,Fritz Lang,153
566,"Drama, Music, Musical",English,Alan Crosland,88
628,"Comedy, Romance",English,Charles Chaplin,72
714,"Comedy, Musical",English,"Robert Florey, Joseph Santley",96
...,...,...,...,...
85784,"Action, Crime, Drama",English,Philip Barantini,97
85803,Drama,"Spanish, Catalan",Lucio Castro,84
85827,"Biography, Comedy, Drama",English,Euros Lyn,113
85837,"Action, Crime, Horror",English,Joe Begos,92


## Définition target

In [6]:
# Label to predict: the binary `isgood` column of the filtered frame
target = df.loc[:, "isgood"]

In [7]:
# One independent LabelEncoder per kept column, keyed by column name
inputs_dict = {column: LabelEncoder() for column in tokeep}

In [8]:
# Turn each kept column into an integer-coded column named "<column>_n",
# fitting that column's own encoder on the full set of values.
# NOTE(review): the numeric `duration` column is label-encoded as well, so
# `duration_n` holds a rank-like code rather than minutes — confirm this is intended.
for column, encoder in inputs_dict.items():
    inputs[f"{column}_n"] = encoder.fit_transform(inputs[column])
inputs.head()

Unnamed: 0,genre,language,director,duration,genre_n,language_n,director_n,duration_n
76,"Drama, History",English,D.W. Griffith,163,377,125,1252,108
506,"Drama, Sci-Fi",German,Fritz Lang,153,418,1163,2013,98
566,"Drama, Music, Musical",English,Alan Crosland,88,397,125,115,33
628,"Comedy, Romance",English,Charles Chaplin,72,293,125,956,17
714,"Comedy, Musical",English,"Robert Florey, Joseph Santley",96,285,125,5139,41


In [9]:
# Keep only the encoded "*_n" columns by removing the original ones
inputs_n = inputs.drop(columns=tokeep)
inputs_n.head()

Unnamed: 0,genre_n,language_n,director_n,duration_n
76,377,125,1252,108
506,418,1163,2013,98
566,397,125,115,33
628,293,125,956,17
714,285,125,5139,41


# Modélisation arbre de décision

## Modélisation sur un set

In [10]:
# A fixed random_state makes the fitted tree identical from run to run;
# scikit-learn permutes the features at each split, so an unseeded tree can
# differ between executions.
model = tree.DecisionTreeClassifier(random_state=42)

In [11]:
# Fit the tree on the whole encoded dataset (no hold-out at this stage)
model.fit(X=inputs_n, y=target)

DecisionTreeClassifier()

In [12]:
# One hand-written sample, expressed as encoded values in the training column
# order. Wrapping it in a DataFrame that carries the training column names
# avoids the "X does not have valid feature names" warning scikit-learn emits
# when a model fitted on a DataFrame is given a bare list.
sample = pd.DataFrame([[1, 5, 2000, 1]], columns=inputs_n.columns)
model.predict(sample) # 1 if is good, 0 if not

array([0])

## Score

In [13]:
# Hand-computed accuracy on the same data the model was fitted on.
# A single vectorised predict call replaces one predict call per row.
predictions = model.predict(inputs_n)
nb_true = int((predictions == target.to_numpy()).sum())   # correctly classified rows
nb_false = len(target) - nb_true                          # misclassified rows
print(nb_true/(nb_true+nb_false))

# model.score returns the same mean accuracy directly
model.score(inputs_n,target)

0.9995481247175779


0.9995481247175779

## Division en 2 sets

In [14]:
# Hold out part of the data for evaluation (default split proportions).
# A fixed random_state makes the split, and therefore the scores computed
# from it, reproducible across runs.
input_train, input_test, target_train, target_test = train_test_split(
    inputs_n, target, random_state=42
)

In [15]:
# Fresh tree trained on the training split only; seeded so that the fitted
# model is the same on every run.
model_train = tree.DecisionTreeClassifier(random_state=42)
model_train.fit(input_train,target_train)


DecisionTreeClassifier()

In [16]:
# Mean accuracy on the data the tree was trained on
model_train.score(X=input_train, y=target_train)

0.9996987346856798

In [17]:
# Mean accuracy on the held-out split. In the recorded run this is far below
# the training accuracy, which indicates that the unconstrained tree overfits.
model_train.score(X=input_test, y=target_test)

0.5876506024096385