In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('Preprocessed_anime_data.csv')
df = df.drop(df.columns[0], axis=1)

In [3]:
df.head()

Unnamed: 0,Title,Synopsis,Rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,ixtl,teamKG.1,ufotable,Type_Movie,Type_Music,Type_ONA,Type_OVA,Type_Special,Type_TV,Type_Unknown
0,0.527119,"[-0.12797664105892181, 0.4972298741340637, 0.3...",8.81,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.365184,"[-0.09161534905433655, 0.4568534791469574, 0.2...",8.41,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0.764957,"[-0.09540949761867523, 0.40506166219711304, 0....",8.31,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0.480045,"[-0.13743984699249268, 0.5658932328224182, 0.3...",7.34,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0.387433,"[-0.09502539783716202, 0.5813391208648682, 0.4...",7.04,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# test avec cosine similarity

In [5]:
vectorizer = TfidfVectorizer(stop_words='english')
synopsis_matrix = vectorizer.fit_transform(df['Synopsis'])

In [6]:
similarity_matrix = cosine_similarity(synopsis_matrix)

In [7]:
similar_movies_indices = {}
for i, row in df.iterrows():
    # Exclure le film lui-même de la liste des films similaires
    similar_indices = similarity_matrix[i].argsort()[::-1][1:]
    similar_movies_indices[i] = similar_indices

In [8]:
predicted_ratings = []
for i, row in df.iterrows():
    similar_indices = similar_movies_indices[i]
    similar_ratings = df.iloc[similar_indices]['Rating']
    predicted_rating = similar_ratings.mean()
    predicted_ratings.append(predicted_rating)
df['Predicted_Rating'] = predicted_ratings

In [9]:
df

Unnamed: 0,Title,Synopsis,Rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,teamKG.1,ufotable,Type_Movie,Type_Music,Type_ONA,Type_OVA,Type_Special,Type_TV,Type_Unknown,Predicted_Rating
0,0.527119,"[-0.12797664105892181, 0.4972298741340637, 0.3...",8.81,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439801
1,0.365184,"[-0.09161534905433655, 0.4568534791469574, 0.2...",8.41,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,6.439841
2,0.764957,"[-0.09540949761867523, 0.40506166219711304, 0....",8.31,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439851
3,0.480045,"[-0.13743984699249268, 0.5658932328224182, 0.3...",7.34,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439948
4,0.387433,"[-0.09502539783716202, 0.5813391208648682, 0.4...",7.04,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6.439978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.388812,"[-0.36991795897483826, 0.9395948052406311, 0.6...",6.89,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,6.439993
9996,0.618888,"[-0.006673065479844809, -0.0015420711133629084...",5.69,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,6.440168
9997,0.350052,"[-0.14526714384555817, 0.5222010016441345, 0.2...",8.29,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439853
9998,0.397307,"[-0.16523534059524536, 0.5655101537704468, 0.2...",7.45,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,6.439937


In [10]:
# we delete nan values

In [11]:
index_with_nan = df.index[df.isnull().any(axis=1)]
index_with_nan
df.drop(index_with_nan,0, inplace=True)

In [12]:
# we split dataset train values/test values

In [13]:
df1=df.drop(['Rating','Synopsis'], axis=1)

In [14]:
X = df1.values
y = df['Rating'].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
#Model

In [19]:
rf = RandomForestRegressor(n_estimators=40, random_state=42)

In [20]:
model= rf.fit(X_train,y_train)
y_test_pred = model.predict(X_test)

In [21]:
#Mean absolute error regression loss

In [22]:
from sklearn.metrics import mean_absolute_error

In [23]:
mean_absolute_error(y_test, y_test_pred)

0.038041338560785114

In [24]:
#Mean squared error 

In [25]:
from sklearn.metrics import mean_squared_error

In [26]:
mean_squared_error(y_test, y_test_pred)

0.033435705516822754

In [27]:
#r2 score

In [28]:
from sklearn.metrics import r2_score

In [29]:
print(r2_score(y_test, y_test_pred))

0.9673407479586174


In [30]:
#comparison between 100 values of y_test (initial ratings) and y_test_pred (ratings predictions)

In [31]:
for i in range(100):
    print(y_test[i], y_test_pred[i])

6.52 6.520000000000005
6.08 6.068000000000005
6.16 6.1599999999999975
6.98 6.979999999999999
7.21 7.209999999999999
6.84 6.840000000000001
7.79 7.790000000000002
7.16 7.160000000000001
5.33 5.088250000000002
6.31 6.310000000000001
5.79 5.789999999999998
7.13 7.129999999999997
7.06 7.0600000000000005
7.87 7.870000000000003
5.03 5.03
6.76 6.759999999999996
6.14 6.143999999999993
7.41 7.410000000000001
6.11 6.1100000000000065
5.44 5.439999999999999
6.95 6.949999999999996
7.19 7.1899999999999995
6.36 6.360000000000007
6.9 6.825000000000003
7.11 7.109750000000007
7.64 7.639999999999992
6.19 6.189999999999999
7.54 7.540000000000001
4.45 4.442999999999998
5.59 5.61875
4.72 4.6815
6.99 6.990000000000004
7.46 7.459999999999999
5.97 5.97
5.67 4.559
7.75 7.75
8.37 8.370750000000003
6.78 6.7799999999999985
5.64 5.639999999999994
5.27 5.100250000000001
7.03 7.029499999999997
7.4 7.400000000000001
7.55 7.550000000000006
5.72 5.72
6.5 6.5005
8.1 8.099750000000002
7.22 7.220000000000003
5.69 5.6899999

In [32]:
# Cross Validation to check if there is overfitting or not

In [33]:
from sklearn.svm import SVR
from sklearn.model_selection import KFold

In [34]:
scores = []
best_svr = SVR(kernel='rbf')
cv = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in cv.split(X) :
    print("Index du train : ", train_index, "\n")
    print("Index de test : ", test_index)

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    best_svr.fit(X_train, y_train)
    scores.append(best_svr.score(X_test, y_test))

Index du train :  [   1    2    4 ... 9980 9981 9983] 

Index de test :  [   0    3    8 ... 9982 9984 9985]
Index du train :  [   0    1    2 ... 9983 9984 9985] 

Index de test :  [  26   27   30 ... 9942 9946 9952]
Index du train :  [   0    1    3 ... 9983 9984 9985] 

Index de test :  [   2   15   28 ... 9963 9966 9981]
Index du train :  [   0    1    2 ... 9983 9984 9985] 

Index de test :  [   6    7   16 ... 9977 9978 9980]
Index du train :  [   0    2    3 ... 9982 9984 9985] 

Index de test :  [   1    4    5 ... 9973 9974 9983]


In [35]:
scores

[0.5259743445009101,
 0.49680191729656675,
 0.4965168718011147,
 0.5128931594973318,
 0.5314234199652521]

In [120]:
#save model

In [121]:
import pickle
filename = "model.pkl"

pickle.dump(model, open(filename, "wb"))