In [1]:
# Importation des librairies
import pandas as pd

In [2]:
# Lecture du fichier csv
df = pd.read_csv('../data/movies.csv')
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [3]:
# Affichage des informations sur le dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


In [4]:
# Affichage des valeurs manquantes
df.isna().sum()

name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64

In [5]:
# Suppression des valeurs manquantes
df.dropna(inplace=True)

In [6]:
# Affichage des valeurs manquantes
df.isna().sum()

name        0
rating      0
genre       0
year        0
released    0
score       0
votes       0
director    0
writer      0
star        0
country     0
budget      0
gross       0
company     0
runtime     0
dtype: int64

In [7]:
# Afficher les dimensions du dataframe
df.shape

(5421, 15)

In [8]:
# Créer les nouvelles colonnes avec une expression régulière
df[['date_of_release', 'country_of_release']] = df['released'].str.extract(r'(\w+ \d+, \d+) \(([^)]+)\)')

# Afficher le résultat
df[['released', 'date_of_release', 'country_of_release']].head()


Unnamed: 0,released,date_of_release,country_of_release
0,"June 13, 1980 (United States)","June 13, 1980",United States
1,"July 2, 1980 (United States)","July 2, 1980",United States
2,"June 20, 1980 (United States)","June 20, 1980",United States
3,"July 2, 1980 (United States)","July 2, 1980",United States
4,"July 25, 1980 (United States)","July 25, 1980",United States


In [9]:
# Afficher les types des colonnes
df[['released', 'date_of_release', 'country_of_release']].dtypes

released              object
date_of_release       object
country_of_release    object
dtype: object

In [10]:
# Supprimer les colonnes 'released' et 'date_of_release'
df.drop(['released', 'date_of_release'], axis=1, inplace=True)

In [11]:
# Afficher les donnés manquantes
df.isna().sum()

name                   0
rating                 0
genre                  0
year                   0
score                  0
votes                  0
director               0
writer                 0
star                   0
country                0
budget                 0
gross                  0
company                0
runtime                0
country_of_release    14
dtype: int64

In [12]:
# Supprimer les valeurs manquantes
df.dropna(inplace=True)

In [13]:
# Afficher les dimensions du dataframe
df.shape

(5407, 15)

In [15]:
# Choix de colonnes 
df = df[[
    'name',
    'genre',
    'year',
    'director',
    'writer',
    'star',
    'company',
    'country_of_release',
]]

In [16]:
df['year'] = df['year'].astype('str')
df.dtypes

name                  object
genre                 object
year                  object
director              object
writer                object
star                  object
company               object
country_of_release    object
dtype: object

In [17]:
# Création de la colonne "cat_features" qui combine tous les mots de toutes les autres colonnes :


'''
df['cat_features'] = df['name'] + ' ' + df['genre'] + ' ' + df['year'] + \
                     ' ' + df['director'] + ' ' + df['writer'] + ' ' + \
                     df['star'] + ' ' + df['company'] + ' ' + df['country_of_release']
'''

df['cat_features'] = df[df.columns].apply(lambda x: ' '.join(x), axis=1)

df.head()

Unnamed: 0,name,genre,year,director,writer,star,company,country_of_release,cat_features
0,The Shining,Drama,1980,Stanley Kubrick,Stephen King,Jack Nicholson,Warner Bros.,United States,The Shining Drama 1980 Stanley Kubrick Stephen...
1,The Blue Lagoon,Adventure,1980,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,Columbia Pictures,United States,The Blue Lagoon Adventure 1980 Randal Kleiser ...
2,Star Wars: Episode V - The Empire Strikes Back,Action,1980,Irvin Kershner,Leigh Brackett,Mark Hamill,Lucasfilm,United States,Star Wars: Episode V - The Empire Strikes Back...
3,Airplane!,Comedy,1980,Jim Abrahams,Jim Abrahams,Robert Hays,Paramount Pictures,United States,Airplane! Comedy 1980 Jim Abrahams Jim Abraham...
4,Caddyshack,Comedy,1980,Harold Ramis,Brian Doyle-Murray,Chevy Chase,Orion Pictures,United States,Caddyshack Comedy 1980 Harold Ramis Brian Doyl...


In [18]:
# Vectorisation de la colonne "cat_features" 
from sklearn.feature_extraction.text import CountVectorizer 

# Création d'un objet CountVectorizer 
vectorizer = CountVectorizer(stop_words='english', min_df=20)
word_matrix = vectorizer.fit_transform(df['cat_features'])
word_matrix.shape

(5407, 434)

In [19]:
# Afficher le type de word_matrix
type(word_matrix)

scipy.sparse._csr.csr_matrix

In [20]:
# Importation de la fonction cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
 
 # Calcul de la similarité cosinus entre les mots de word_matrix 
sim = cosine_similarity(word_matrix)

# Afficher le type de sim
type(sim)

numpy.ndarray

In [21]:
# Afficher les dimensions de sim
sim

array([[1.        , 0.3354102 , 0.35856858, ..., 0.38138504, 0.19069252,
        0.31622777],
       [0.3354102 , 1.        , 0.40089186, ..., 0.42640143, 0.31980107,
        0.35355339],
       [0.35856858, 0.40089186, 1.        , ..., 0.22792115, 0.22792115,
        0.56694671],
       ...,
       [0.38138504, 0.42640143, 0.22792115, ..., 1.        , 0.27272727,
        0.30151134],
       [0.19069252, 0.31980107, 0.22792115, ..., 0.27272727, 1.        ,
        0.30151134],
       [0.31622777, 0.35355339, 0.56694671, ..., 0.30151134, 0.30151134,
        1.        ]])

In [22]:
# Sauvegarde de la matrice de similarité précalculée
import numpy as np
np.save('../Models/Movies_final_model.npy', sim)