In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('Preprocessed_anime_data.csv')
df = df.drop(df.columns[0], axis=1)

In [3]:
df.head()

Unnamed: 0,Title,Synopsis,Rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,ixtl,teamKG.1,ufotable,Type_Movie,Type_Music,Type_ONA,Type_OVA,Type_Special,Type_TV,Type_Unknown
0,0.527119,"[-0.12797664105892181, 0.4972298741340637, 0.3...",8.81,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.365184,"[-0.09161534905433655, 0.4568534791469574, 0.2...",8.41,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0.764957,"[-0.09540949761867523, 0.40506166219711304, 0....",8.31,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0.480045,"[-0.13743984699249268, 0.5658932328224182, 0.3...",7.34,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0.387433,"[-0.09502539783716202, 0.5813391208648682, 0.4...",7.04,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [92]:
# test avec cosine similarity

In [4]:
vectorizer = TfidfVectorizer(stop_words='english')
synopsis_matrix = vectorizer.fit_transform(df['Synopsis'])

In [5]:
similarity_matrix = cosine_similarity(synopsis_matrix)

In [6]:
similar_movies_indices = {}
for i, row in df.iterrows():
    # Exclure le film lui-même de la liste des films similaires
    similar_indices = similarity_matrix[i].argsort()[::-1][1:]
    similar_movies_indices[i] = similar_indices

In [7]:
predicted_ratings = []
for i, row in df.iterrows():
    similar_indices = similar_movies_indices[i]
    similar_ratings = df.iloc[similar_indices]['Rating']
    predicted_rating = similar_ratings.mean()
    predicted_ratings.append(predicted_rating)
df['Predicted_Rating'] = predicted_ratings

In [13]:
df

Unnamed: 0,Title,Synopsis,Rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,...,teamKG.1,ufotable,Type_Movie,Type_Music,Type_ONA,Type_OVA,Type_Special,Type_TV,Type_Unknown,Predicted_Rating
0,0.527119,"[-0.12797664105892181, 0.4972298741340637, 0.3...",8.81,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439801
1,0.365184,"[-0.09161534905433655, 0.4568534791469574, 0.2...",8.41,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,6.439841
2,0.764957,"[-0.09540949761867523, 0.40506166219711304, 0....",8.31,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439851
3,0.480045,"[-0.13743984699249268, 0.5658932328224182, 0.3...",7.34,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439948
4,0.387433,"[-0.09502539783716202, 0.5813391208648682, 0.4...",7.04,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6.439978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.388812,"[-0.36991795897483826, 0.9395948052406311, 0.6...",6.89,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,6.439993
9996,0.618888,"[-0.006673065479844809, -0.0015420711133629084...",5.69,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,6.440168
9997,0.350052,"[-0.14526714384555817, 0.5222010016441345, 0.2...",8.29,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,6.439853
9998,0.397307,"[-0.16523534059524536, 0.5655101537704468, 0.2...",7.45,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,6.439937


In [None]:
# we delete nan values

In [14]:
index_with_nan = df.index[df.isnull().any(axis=1)]
index_with_nan
df.drop(index_with_nan,0, inplace=True)

In [None]:
# we split dataset train values/test values

In [90]:
X = df.drop('Rating', axis=1).values
X = df.drop('Synopsis', axis=1).values
y = df['Rating'].values

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
#Model

In [64]:
rf = RandomForestRegressor(n_estimators=40, random_state=42)

In [65]:
model= rf.fit(X_train,y_train)
y_test_pred = model.predict(X_test)

In [66]:
#Mean absolute error regression loss

In [67]:
from sklearn.metrics import mean_absolute_error

In [68]:
mean_absolute_error(y_test, y_test_pred)

0.00042655155155416904

In [69]:
#Mean squared error 

In [70]:
from sklearn.metrics import mean_squared_error

In [71]:
mean_squared_error(y_test, y_test_pred)

9.147366116116013e-06

In [72]:
#r2 score

In [73]:
from sklearn.metrics import r2_score

In [74]:
print(r2_score(y_test, y_test_pred))

0.9999910650566248


In [75]:
#comparison between 100 values of y_test (initial ratings) and y_test_pred (ratings predictions)

In [76]:
for i in range(100):
    print(y_test[i], y_test_pred[i])

6.52 6.520000000000005
6.08 6.080000000000005
6.16 6.1599999999999975
6.98 6.979999999999999
7.21 7.210000000000001
6.84 6.840000000000001
7.79 7.790000000000002
7.16 7.160000000000001
5.33 5.330000000000004
6.31 6.310000000000001
5.79 5.789999999999998
7.13 7.129999999999997
7.06 7.0600000000000005
7.87 7.870000000000003
5.03 5.03
6.76 6.759999999999996
6.14 6.1399999999999935
7.41 7.410000000000001
6.11 6.1100000000000065
5.44 5.439999999999999
6.95 6.949999999999994
7.19 7.1899999999999995
6.36 6.360000000000007
6.9 6.900000000000003
7.11 7.1100000000000065
7.64 7.639999999999992
6.19 6.189999999999999
7.54 7.540000000000001
4.45 4.450499999999998
5.59 5.590000000000002
4.72 4.719999999999998
6.99 6.990000000000004
7.46 7.459999999999999
5.97 5.97
5.67 5.6699999999999955
7.75 7.75
8.37 8.370000000000003
6.78 6.7799999999999985
5.64 5.639999999999994
5.27 5.270000000000002
7.03 7.0299999999999985
7.4 7.4
7.55 7.550000000000006
5.72 5.72
6.5 6.5
8.1 8.099750000000002
7.22 7.2200000000

In [None]:
# Cross Validation to check if there is overfitting or not

In [81]:
from sklearn.svm import SVR

In [95]:
scores = []
best_svr = SVR(kernel='rbf')
cv = KFold(n_splits=15, random_state=42, shuffle=True)
for train_index, test_index in cv.split(X) :
    print("Index du train : ", train_index, "\n")
    print("Index de test : ", test_index)

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    best_svr.fit(X_train, y_train)
    scores.append(best_svr.score(X_test, y_test))

Index du train :  [   1    2    4 ... 9983 9984 9985] 

Index de test :  [   0    3   14   29   31   33   35   39   76   80   88   92   96  101
  103  106  107  108  119  131  135  144  251  259  263  265  290  303
  304  311  316  321  330  333  346  349  360  367  388  416  439  450
  457  483  487  500  510  518  532  543  568  576  582  590  592  621
  623  624  647  655  668  673  713  733  735  761  764  794  795  799
  850  856  932  952  965  970  993 1026 1068 1071 1076 1084 1090 1111
 1112 1121 1123 1147 1155 1175 1180 1188 1190 1199 1208 1217 1253 1254
 1277 1310 1315 1318 1330 1339 1347 1360 1372 1373 1413 1446 1453 1470
 1498 1509 1512 1516 1527 1530 1558 1568 1575 1576 1593 1595 1599 1606
 1616 1617 1623 1631 1650 1655 1671 1692 1732 1737 1747 1758 1765 1783
 1785 1793 1795 1803 1851 1880 1897 1901 1915 1923 1957 2012 2018 2025
 2029 2045 2072 2119 2124 2128 2138 2150 2152 2157 2191 2213 2221 2232
 2239 2270 2286 2303 2304 2310 2328 2335 2362 2389 2392 2423 2447 2464
 247

Index du train :  [   0    1    2 ... 9983 9984 9985] 

Index de test :  [  30   45   46   65   73   79   81   87   99  102  111  118  172  177
  180  199  209  210  229  247  257  319  337  350  354  372  385  403
  411  429  449  455  472  473  485  503  528  540  551  564  601  604
  613  618  664  675  678  682  693  718  744  748  749  754  779  810
  811  821  831  834  862  868  947  957  982  997 1005 1017 1027 1042
 1044 1052 1055 1061 1075 1101 1115 1126 1139 1176 1219 1226 1233 1272
 1278 1280 1287 1298 1322 1323 1345 1354 1374 1379 1396 1414 1416 1417
 1421 1430 1444 1452 1456 1468 1477 1502 1557 1559 1570 1582 1600 1603
 1614 1615 1643 1649 1665 1699 1720 1721 1728 1745 1790 1832 1840 1860
 1918 1919 1922 1928 1934 1945 1947 1962 1978 1982 1992 2004 2006 2008
 2013 2022 2034 2054 2057 2083 2087 2088 2089 2094 2107 2108 2111 2115
 2129 2132 2164 2168 2175 2214 2228 2231 2248 2264 2272 2299 2302 2340
 2373 2377 2407 2420 2436 2456 2471 2487 2516 2533 2542 2609 2614 2685
 268

Index du train :  [   0    1    2 ... 9983 9984 9985] 

Index de test :  [  15   28   34   44   63   75   84  124  129  143  170  171  184  223
  240  242  273  279  284  287  293  297  312  320  322  325  339  366
  371  373  375  380  387  418  421  422  430  438  443  452  479  494
  501  505  506  507  553  558  588  598  599  626  640  652  662  672
  684  691  704  725  746  755  783  790  797  808  813  829  838  843
  849  864  889  904  907  911  925  926  931  969  977  978 1032 1049
 1079 1119 1131 1158 1161 1164 1170 1182 1187 1192 1209 1211 1225 1231
 1235 1270 1292 1332 1357 1366 1371 1384 1397 1423 1436 1450 1451 1462
 1464 1471 1475 1499 1506 1514 1534 1539 1541 1547 1553 1563 1564 1590
 1609 1620 1634 1652 1654 1657 1669 1694 1706 1714 1773 1782 1784 1789
 1801 1859 1869 1872 1876 1883 1893 1902 1912 1944 1966 1972 1975 1979
 2017 2031 2035 2059 2076 2077 2080 2081 2095 2099 2112 2118 2123 2146
 2155 2163 2170 2181 2182 2197 2209 2226 2229 2237 2280 2283 2300 2306
 231

Index du train :  [   0    1    2 ... 9983 9984 9985] 

Index de test :  [  24   25   32   52   83   85  115  159  187  194  200  215  234  244
  256  266  276  355  356  357  396  413  436  470  490  492  515  519
  557  628  643  656  674  692  695  719  739  817  818  869  873  885
  888  902  939  942  972  973  979  998 1023 1035 1064 1088 1102 1105
 1106 1116 1134 1137 1140 1196 1207 1216 1227 1228 1237 1238 1264 1275
 1303 1307 1309 1378 1381 1391 1401 1461 1497 1510 1519 1522 1546 1548
 1552 1572 1587 1588 1589 1592 1610 1627 1629 1673 1676 1696 1697 1708
 1723 1751 1767 1772 1776 1809 1815 1845 1847 1863 1878 1890 1909 1946
 1948 1999 2001 2005 2015 2016 2019 2038 2046 2052 2078 2092 2100 2101
 2151 2158 2161 2176 2186 2188 2194 2207 2217 2218 2222 2224 2235 2241
 2277 2311 2329 2341 2342 2369 2380 2384 2409 2422 2437 2450 2451 2490
 2500 2503 2525 2566 2587 2624 2652 2655 2660 2668 2671 2694 2765 2767
 2780 2783 2787 2788 2792 2801 2803 2819 2828 2834 2841 2857 2895 2936
 295

Index du train :  [   0    2    3 ... 9982 9984 9985] 

Index de test :  [   1   11   13   21   38   72   89  105  116  140  148  158  207  213
  236  237  243  261  269  277  281  285  294  306  317  340  352  362
  363  374  405  406  454  458  482  488  514  525  526  529  542  552
  556  560  571  575  583  593  619  629  630  632  634  661  686  703
  707  778  791  792  845  891  894  899  917  933  955  959  963  968
  986  987  988 1000 1007 1012 1040 1048 1050 1058 1072 1098 1100 1125
 1133 1136 1177 1198 1229 1232 1239 1241 1249 1265 1266 1274 1286 1300
 1312 1316 1343 1367 1388 1390 1394 1435 1440 1442 1494 1523 1525 1560
 1591 1601 1607 1667 1682 1689 1704 1709 1712 1725 1735 1742 1763 1766
 1798 1804 1823 1826 1875 1884 1925 1927 1939 1949 1953 1960 1969 1980
 1985 2043 2056 2064 2071 2102 2113 2137 2139 2162 2203 2278 2289 2334
 2349 2353 2356 2359 2365 2382 2383 2396 2401 2403 2408 2428 2430 2469
 2475 2493 2494 2496 2507 2529 2530 2556 2562 2604 2661 2665 2710 2719
 272

In [96]:
scores

[0.9976969426040244,
 0.9979799370225907,
 0.9973350005523101,
 0.9975795771255845,
 0.9978650725957113,
 0.9975985404763329,
 0.9973948098038466,
 0.9977783196600627,
 0.997393383897506,
 0.99776568273255,
 0.9976562885334582,
 0.9975797484778375,
 0.9978087624024269,
 0.9975493669666549,
 0.9979020354330832]

In [120]:
#save model

In [121]:
import pickle
filename = "model.pkl"

pickle.dump(model, open(filename, "wb"))