In [50]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as mtplt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression

In [51]:
movies = pd.read_csv("movies.csv",sep=",", encoding='latin1')

In [52]:
movies.head(5)

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986


In [53]:
movies.isna()

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6816,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6817,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6818,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


Partie 1 : Diagramme de répartition


In [111]:
sb.set()
ax = sb.distplot(movies[['year']],kde=True)
ax.set(xlabel='year', ylabel='budget')
mtplt.show()



Partie 1: Boite a moustache

In [56]:
graph = mtplt.gcf()
graph.set_size_inches(15,15)
sb.boxplot(x='year',
           y='budget',
           data=movies)
mtplt.show()

In [57]:
mtplt.figure(figsize=(15,12))
mtplt.title('Correlation of Features', fontsize=20)
sb.heatmap(movies.corr().astype(float).corr(),vmax=1.0, annot=True)
mtplt.show()

<IPython.core.display.Javascript object>

Partie 2 : 1er algorithme (arbre de decision)

In [58]:
y = movies["budget"]
X = movies[["year","runtime"]]
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=test_size, random_state=seed)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_train)
print(accuracy_score(y_train, predictions))

0.5158678047712848


Partie 2 2ème algorithme (Regréssion linéaire)

In [110]:
clf= LinearRegression()
test_size = 0.4
seed = 7
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=test_size, random_state=seed)
clf.fit(X_train,y_train)
clf.predict(X_test)
clf.score(X_test,y_test)

0.14555226246036035

Partie 3 : Features Importance (regression Linéaire)

In [108]:
importances = clf.coef_
for i,v in enumerate(importances):
    print('Feature: %0d, Score: %.5f' % (i,v))
mtplt.bar([x for x in range(len(importances))], importances)
mtplt.show()

Feature: 0, Score: 1105232.85479
Feature: 1, Score: 489583.58824


Partie 3: Features Importance(arbes de decision)

In [106]:
importance = model.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
mtplt.bar([x for x in range(len(importance))], importance)
mtplt.show()

Feature: 0, Score: 0.37932
Feature: 1, Score: 0.62068
