In [1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from tqdm import tqdm_notebook

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

import seaborn as sns

import multiprocessing

import re

import nltk
from nltk.corpus import stopwords

%matplotlib inline

In [2]:
data_embeddings = pd.read_csv('df_embeddings.csv')
data_embeddings.head(2)

Unnamed: 0.1,Unnamed: 0,course_id,embeddings_course_descr
0,0,2-speed-it,"[0.062335819005966187, 0.015386137180030346, 0..."
1,1,3d-printing-applications,"[-0.010602016001939774, -0.0638212189078331, 0..."


In [3]:
data_embeddings=data_embeddings.drop(['Unnamed: 0'], axis=1)

In [4]:
vectors = []

for i, row in data_embeddings.iterrows():
    vect_str = row['embeddings_course_descr']
    vect = np.fromstring( vect_str.replace('[', '').replace(']', ''), dtype=np.float, sep=',' )
    vectors.append(vect)

df_vectors = pd.DataFrame(vectors)
df_vectors.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.062336,0.015386,0.001798,0.036211,-0.025498,0.00417,-0.063827,-0.014235,-0.047241,0.063943,...,0.038421,0.010176,0.035355,-0.059017,-0.053254,0.012399,0.056702,-0.061181,-0.05513,0.020926
1,-0.010602,-0.063821,0.044484,0.059159,0.050023,-0.040048,-0.067637,-0.055488,0.055521,-0.004766,...,-0.044688,-0.05119,0.034932,-0.067331,-0.013376,0.031566,0.050381,-0.05732,-0.004899,0.044088


In [5]:
data_embeddings = data_embeddings.join(df_vectors).drop(['embeddings_course_descr'], axis=1)
data_embeddings.head(2)

Unnamed: 0,course_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,2-speed-it,0.062336,0.015386,0.001798,0.036211,-0.025498,0.00417,-0.063827,-0.014235,-0.047241,...,0.038421,0.010176,0.035355,-0.059017,-0.053254,0.012399,0.056702,-0.061181,-0.05513,0.020926
1,3d-printing-applications,-0.010602,-0.063821,0.044484,0.059159,0.050023,-0.040048,-0.067637,-0.055488,0.055521,...,-0.044688,-0.05119,0.034932,-0.067331,-0.013376,0.031566,0.050381,-0.05732,-0.004899,0.044088


In [6]:
len(data_embeddings)

1299

In [7]:
data_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Columns: 513 entries, course_id to 511
dtypes: float64(512), object(1)
memory usage: 5.1+ MB


## Рекомендации на основе содержания

In [8]:
# ITEM_TO_USER

In [9]:
#достанем информацию об оценках пользователей из data.csv
courses_eng_1 = pd.read_csv('courses_eng_1.csv')
courses_eng = courses_eng_1[['new_course_id','course_id']]
"""#избавимся от лишних столбцов
data_reviews=data_reviews.drop(['Unnamed: 0'], axis=1)
data_reviews=data_reviews.drop(['reviewer_name'], axis=1)
data_reviews=data_reviews.drop(['review_text'], axis=1)
"""
courses_eng.head(2)

Unnamed: 0,new_course_id,course_id
0,0,2-speed-it
1,5,3d-printing-applications


In [10]:
len(courses_eng)

1299

In [11]:
data_courses_embeddings = data_embeddings.join(courses_eng.set_index('course_id'), on='course_id')

In [12]:
#достанем информацию об оценках пользователей из data.csv
data_reviews = pd.read_csv('reviews_eng_courses_1.csv')
#избавимся от лишних столбцов
data_reviews=data_reviews.drop(['Unnamed: 0'], axis=1)
data_reviews=data_reviews.drop(['reviewer_name'], axis=1)
data_reviews=data_reviews.drop(['review_text'], axis=1)

data_reviews.head(2)

Unnamed: 0,course_id,rating,reviewer_id
0,2-speed-it,5,60888
1,2-speed-it,2,22416


In [13]:
len(data_reviews)

157938

In [14]:
#объединим датафрейм описания англоязычных курсов с отзывами пользователей 
data_reviews_with_desc =  data_reviews.join(courses_eng.set_index('course_id'), on='course_id')
data_reviews_with_desc.head(3)

Unnamed: 0,course_id,rating,reviewer_id,new_course_id
0,2-speed-it,5,60888,0
1,2-speed-it,2,22416,0
2,2-speed-it,4,76208,0


In [15]:
#data_reviews.rating = data_reviews.rating.astype(int)

In [16]:
len(data_reviews_with_desc)

157938

In [17]:
#объединим датафрейм описания англоязычных курсов с отзывами пользователей 
data_embed_rait =  data_reviews_with_desc.join(data_embeddings.set_index('course_id'), on='course_id')
data_embed_rait.head(2)

Unnamed: 0,course_id,rating,reviewer_id,new_course_id,0,1,2,3,4,5,...,502,503,504,505,506,507,508,509,510,511
0,2-speed-it,5,60888,0,0.062336,0.015386,0.001798,0.036211,-0.025498,0.00417,...,0.038421,0.010176,0.035355,-0.059017,-0.053254,0.012399,0.056702,-0.061181,-0.05513,0.020926
1,2-speed-it,2,22416,0,0.062336,0.015386,0.001798,0.036211,-0.025498,0.00417,...,0.038421,0.010176,0.035355,-0.059017,-0.053254,0.012399,0.056702,-0.061181,-0.05513,0.020926


In [18]:
len(data_embed_rait)

157938

In [19]:
#объединим с датафреймом по оценкам пользователей


In [20]:
#посмотрим сколько отзывов поставил каждый пользователь
data_embed_rait.groupby('reviewer_id')[['course_id']].count().sort_values('course_id', ascending=False)

Unnamed: 0_level_0,course_id
reviewer_id,Unnamed: 1_level_1
-1,825
31616,136
34943,125
36039,100
61783,66
51060,65
48932,60
16636,59
17412,58
19166,57


In [21]:
#удалённые имена пользователей = -1

In [22]:
data_embed_rait.groupby('course_id')[['rating']].count().sort_values('rating', ascending=False)

Unnamed: 0_level_0,rating
course_id,Unnamed: 1_level_1
matlab,928
project-management-basics,917
game-development,907
data-cleaning,897
algorithmic-toolbox,893
data-science-course,866
science-of-meditation,865
mindshift,844
learn-to-program,808
language-theories,801


In [23]:
#выберем очень активного пользователя с reviewer_id = 36039
TARGET_USER = 10210
#выбарем все оцененные этим пользователем курсы
df_for_user = data_embed_rait[data_embed_rait['reviewer_id']==TARGET_USER]
df_for_user

Unnamed: 0,course_id,rating,reviewer_id,new_course_id,0,1,2,3,4,5,...,502,503,504,505,506,507,508,509,510,511
3087,ageofjefferson,5,10210,42,-0.017314,-0.001662,-0.02784,-0.061607,0.042819,-0.041902,...,0.041509,-0.056876,0.063003,0.000688,-0.022695,0.037509,-0.023223,-0.057655,-0.0101,0.045322
5387,altruism,5,10210,62,-0.008544,-0.022144,0.048564,0.030201,0.011163,-0.021466,...,0.014264,-0.069827,0.010156,0.009863,0.024559,-0.037045,0.028299,-0.065835,-0.064841,0.011514
7190,ancient-greeks,5,10210,79,0.029129,0.006605,-0.054073,-0.06188,0.057992,0.051585,...,-0.049401,-0.034935,0.066914,0.01557,-0.056683,0.030885,-0.006883,-0.049098,0.023755,-0.057493
7390,ancient-marine-reptiles,4,10210,80,-0.027196,0.062703,0.018533,-0.031129,0.020533,0.065593,...,-0.028296,0.056668,0.051834,-0.017262,0.024354,-0.061549,0.044994,-0.055898,-0.046579,-0.014056
8885,archoftitus,5,10210,104,0.007273,0.048132,0.034198,-0.044644,0.054894,-0.035962,...,-0.056768,-0.029797,0.05818,0.009222,-0.058039,-0.004811,0.011889,-0.057989,-0.05773,-0.059011
11119,audio-engineering,4,10210,124,0.030083,0.029936,0.016344,0.029272,-0.012876,-0.025706,...,0.068653,-0.044787,0.004058,-0.054262,0.00708,-0.053128,0.008108,-0.048641,0.035973,0.033989
14792,big-history,5,10210,162,0.035675,0.050006,-0.036288,-0.041,0.06692,-0.016644,...,0.00917,-0.02188,0.067359,-0.064099,0.039035,0.069284,0.042264,-0.06662,-0.01477,-0.051498
15318,bioinformatics-methods-1,5,10210,170,-0.040018,0.051751,-0.052735,0.05996,0.02856,0.031396,...,-0.052227,0.011865,0.047031,-0.058142,-0.054819,-0.056414,0.036939,-0.059972,0.051138,0.058576
15389,bioinformatics-methods-1,5,10210,170,-0.040018,0.051751,-0.052735,0.05996,0.02856,0.031396,...,-0.052227,0.011865,0.047031,-0.058142,-0.054819,-0.056414,0.036939,-0.059972,0.051138,0.058576
21813,client-needs-and-software-requirements,4,10210,270,0.010585,-0.033856,-0.017261,0.05884,-0.070238,0.064478,...,-0.054537,0.038241,0.037796,-0.042566,-0.071873,-0.06548,-0.03624,-0.038781,-0.046021,0.064119


In [24]:
'''data_embed_rait = data_embed_rait.loc[(data_embed_rait['new_course_id'] != -1) &
                                              (data_embed_rait['new_course_id'] != 1144) &
                                              (data_embed_rait['new_course_id'] != 170) &
                                              (data_embed_rait['new_course_id'] != 1250)]'''

"data_embed_rait = data_embed_rait.loc[(data_embed_rait['new_course_id'] != -1) &\n                                              (data_embed_rait['new_course_id'] != 1144) &\n                                              (data_embed_rait['new_course_id'] != 170) &\n                                              (data_embed_rait['new_course_id'] != 1250)]"

In [25]:
data_embed_rait.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157938 entries, 0 to 157937
Columns: 516 entries, course_id to 511
dtypes: float64(512), int64(3), object(1)
memory usage: 621.8+ MB


In [26]:
df_for_user['rating'] = df_for_user.rating.fillna(value = 0)


In [27]:
df_for_user = df_for_user.drop(['reviewer_id'], axis=1)


In [28]:
len(df_for_user)

50

In [29]:
#df_1 = data_for_user.drop(['rating','average_score'], axis=1)
df_for_user_1 = df_for_user.drop(['course_id'], axis=1)
df_for_user_1.head(3)

Unnamed: 0,rating,new_course_id,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
3087,5,42,-0.017314,-0.001662,-0.02784,-0.061607,0.042819,-0.041902,-0.059788,0.0247,...,0.041509,-0.056876,0.063003,0.000688,-0.022695,0.037509,-0.023223,-0.057655,-0.0101,0.045322
5387,5,62,-0.008544,-0.022144,0.048564,0.030201,0.011163,-0.021466,-0.076632,-0.052792,...,0.014264,-0.069827,0.010156,0.009863,0.024559,-0.037045,0.028299,-0.065835,-0.064841,0.011514
7190,5,79,0.029129,0.006605,-0.054073,-0.06188,0.057992,0.051585,-0.061866,0.03473,...,-0.049401,-0.034935,0.066914,0.01557,-0.056683,0.030885,-0.006883,-0.049098,0.023755,-0.057493


In [30]:
from sklearn.linear_model import LinearRegression # метод наименьших квадратов
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [31]:
data_embed_rait['rating']

0         5
1         2
2         4
3         3
4         5
5         5
6         4
7         5
8         4
9         4
10        4
11        5
12        2
13        4
14        5
15        5
16        5
17        5
18        5
19        5
20        3
21        5
22        5
23        5
24        5
25        5
26        5
27        3
28        5
29        5
         ..
157908    3
157909    5
157910    4
157911    5
157912    5
157913    5
157914    4
157915    4
157916    5
157917    4
157918    5
157919    5
157920    4
157921    5
157922    5
157923    3
157924    4
157925    5
157926    3
157927    5
157928    4
157929    4
157930    4
157931    3
157932    4
157933    5
157934    5
157935    4
157936    4
157937    5
Name: rating, Length: 157938, dtype: int64

In [32]:
#y = pd.Series(df_for_user['rating']).array
#y

In [33]:
#разделим обучающую и тестовую выборки
from sklearn.model_selection import train_test_split
X, y = df_for_user_1, df_for_user['rating']
X.head(10)

Unnamed: 0,rating,new_course_id,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
3087,5,42,-0.017314,-0.001662,-0.02784,-0.061607,0.042819,-0.041902,-0.059788,0.0247,...,0.041509,-0.056876,0.063003,0.000688,-0.022695,0.037509,-0.023223,-0.057655,-0.0101,0.045322
5387,5,62,-0.008544,-0.022144,0.048564,0.030201,0.011163,-0.021466,-0.076632,-0.052792,...,0.014264,-0.069827,0.010156,0.009863,0.024559,-0.037045,0.028299,-0.065835,-0.064841,0.011514
7190,5,79,0.029129,0.006605,-0.054073,-0.06188,0.057992,0.051585,-0.061866,0.03473,...,-0.049401,-0.034935,0.066914,0.01557,-0.056683,0.030885,-0.006883,-0.049098,0.023755,-0.057493
7390,4,80,-0.027196,0.062703,0.018533,-0.031129,0.020533,0.065593,-0.065088,-0.057435,...,-0.028296,0.056668,0.051834,-0.017262,0.024354,-0.061549,0.044994,-0.055898,-0.046579,-0.014056
8885,5,104,0.007273,0.048132,0.034198,-0.044644,0.054894,-0.035962,-0.060243,0.060259,...,-0.056768,-0.029797,0.05818,0.009222,-0.058039,-0.004811,0.011889,-0.057989,-0.05773,-0.059011
11119,4,124,0.030083,0.029936,0.016344,0.029272,-0.012876,-0.025706,-0.068088,-0.035621,...,0.068653,-0.044787,0.004058,-0.054262,0.00708,-0.053128,0.008108,-0.048641,0.035973,0.033989
14792,5,162,0.035675,0.050006,-0.036288,-0.041,0.06692,-0.016644,-0.066785,0.026731,...,0.00917,-0.02188,0.067359,-0.064099,0.039035,0.069284,0.042264,-0.06662,-0.01477,-0.051498
15318,5,170,-0.040018,0.051751,-0.052735,0.05996,0.02856,0.031396,-0.060132,-0.032502,...,-0.052227,0.011865,0.047031,-0.058142,-0.054819,-0.056414,0.036939,-0.059972,0.051138,0.058576
15389,5,170,-0.040018,0.051751,-0.052735,0.05996,0.02856,0.031396,-0.060132,-0.032502,...,-0.052227,0.011865,0.047031,-0.058142,-0.054819,-0.056414,0.036939,-0.059972,0.051138,0.058576
21813,4,270,0.010585,-0.033856,-0.017261,0.05884,-0.070238,0.064478,-0.065521,0.000144,...,-0.054537,0.038241,0.037796,-0.042566,-0.071873,-0.06548,-0.03624,-0.038781,-0.046021,0.064119


In [34]:
len(X)

50

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [36]:
X_train

Unnamed: 0,rating,new_course_id,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
133737,5,-1,-0.031313,0.01446,-0.034069,-0.051523,0.055652,0.000364,-0.06767,-0.055234,...,-0.048523,-0.05908,0.060261,-0.063069,-0.032272,-0.020207,0.028496,-0.059801,0.025399,0.006905
150218,5,-1,0.020991,-0.033611,-0.053113,-0.022784,-0.065757,-0.012215,-0.067224,0.009345,...,-0.023699,-0.008298,0.027578,0.052973,-0.078004,0.086532,0.066997,-0.072254,-0.061693,0.012347
8885,5,104,0.007273,0.048132,0.034198,-0.044644,0.054894,-0.035962,-0.060243,0.060259,...,-0.056768,-0.029797,0.05818,0.009222,-0.058039,-0.004811,0.011889,-0.057989,-0.05773,-0.059011
60046,5,721,0.054615,-0.055962,0.05656,-0.059532,0.057471,-0.026571,-0.060742,0.029862,...,0.03988,0.032195,0.024019,-0.054682,-0.055463,0.055542,0.052839,-0.058693,-0.039227,-0.007376
15389,5,170,-0.040018,0.051751,-0.052735,0.05996,0.02856,0.031396,-0.060132,-0.032502,...,-0.052227,0.011865,0.047031,-0.058142,-0.054819,-0.056414,0.036939,-0.059972,0.051138,0.058576
143090,5,-1,-0.049081,-0.035454,-0.006034,0.0142,-0.03041,-0.004324,-0.067779,0.063862,...,-0.013543,-0.033897,0.060465,-0.063134,-0.068177,-0.023318,0.06333,-0.068047,0.066011,0.05104
98787,4,1145,0.004868,0.017723,0.00621,0.050997,0.065111,0.039265,-0.065057,0.045593,...,-0.064788,0.004511,0.020924,0.037246,-0.047469,0.065503,0.055525,-0.05364,0.044765,-0.011054
65458,5,799,-0.054637,0.060314,-0.023996,0.055392,0.040795,-0.046116,-0.06166,0.030176,...,-0.060553,-0.042187,0.039284,-0.061202,-0.033388,-0.046044,0.045521,-0.059731,0.026198,-0.013895
53300,5,641,-0.062531,0.053608,-0.034125,0.061215,0.064335,0.003215,-0.065356,-0.023798,...,-0.051515,-0.023293,-0.013875,0.065484,0.042777,-0.06154,-0.022861,0.060186,0.057429,0.037181
124164,3,-1,0.044355,-0.010935,-0.059218,-0.050563,0.02115,0.057773,-0.060268,-0.002302,...,-0.059217,0.01711,0.059004,0.036112,0.059312,-0.05496,0.004011,-0.06027,0.041669,0.052099


In [37]:
 X_test

Unnamed: 0,rating,new_course_id,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
57090,5,673,0.067491,0.023109,-0.030929,-0.010154,-0.002486,0.05568,-0.040293,0.020157,...,-0.042929,-0.03746,-0.004657,-0.065987,-0.025181,0.027771,-0.038874,-0.062489,0.016949,-0.022048
133484,5,-1,-0.013263,-0.034002,-0.019516,-0.024008,0.021329,-0.033082,-0.06506,-0.062377,...,-0.00461,-0.058111,0.06406,-0.059709,-0.061386,-0.034805,0.037178,-0.053927,-0.02083,-0.003255
7390,4,80,-0.027196,0.062703,0.018533,-0.031129,0.020533,0.065593,-0.065088,-0.057435,...,-0.028296,0.056668,0.051834,-0.017262,0.024354,-0.061549,0.044994,-0.055898,-0.046579,-0.014056
46227,5,582,0.063796,-0.018691,-0.016476,-0.023426,0.050766,-0.042572,-0.064628,0.033513,...,-0.062914,-0.043289,-0.025119,-0.048649,-0.0538,0.058863,0.053507,-0.052933,-0.043914,0.047358
152109,4,-1,0.057071,0.050867,-0.052889,0.058073,0.046598,-0.029833,-0.061522,-0.02989,...,-0.060776,0.022925,0.05343,-0.059575,0.034225,-0.058102,-0.020995,-0.044008,-0.008505,-0.022519
103879,5,1214,0.01565,0.050177,-0.057658,-0.000659,0.042336,-0.041461,-0.05851,-0.043683,...,-0.055481,-0.057412,0.047258,-0.047168,-0.050968,-0.004871,0.058588,-0.059119,0.037931,0.057954
11119,4,124,0.030083,0.029936,0.016344,0.029272,-0.012876,-0.025706,-0.068088,-0.035621,...,0.068653,-0.044787,0.004058,-0.054262,0.00708,-0.053128,0.008108,-0.048641,0.035973,0.033989
7190,5,79,0.029129,0.006605,-0.054073,-0.06188,0.057992,0.051585,-0.061866,0.03473,...,-0.049401,-0.034935,0.066914,0.01557,-0.056683,0.030885,-0.006883,-0.049098,0.023755,-0.057493
41414,5,481,-0.015435,0.026983,-0.02178,0.044318,0.005881,0.061512,-0.071372,-0.022343,...,-0.047292,-0.02644,0.04941,-0.069754,0.032357,-0.052468,0.04989,-0.067319,-0.034471,-0.013737
42931,5,512,-0.004366,-0.007844,0.007871,0.01523,0.036596,0.037769,-0.075639,-0.026189,...,0.05644,0.036536,0.033427,-0.019875,-0.044077,-0.052092,0.048466,0.004305,-0.010082,0.020421


In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
X.shape[0]

50

In [40]:
y.shape[0]

50

In [41]:
y.value_counts()

5    37
4     9
3     3
1     1
Name: rating, dtype: int64

In [42]:
X.new_course_id.value_counts()

-1       18
 1144     2
 170      2
 1250     2
 959      1
 673      1
 641      1
 582      1
 1145     1
 520      1
 1214     1
 270      1
 79       1
 80       1
 721      1
 596      1
 479      1
 62       1
 481      1
 162      1
 1125     1
 104      1
 42       1
 552      1
 816      1
 1072     1
 751      1
 799      1
 124      1
 512      1
Name: new_course_id, dtype: int64

In [43]:
 X_train

Unnamed: 0,rating,new_course_id,0,1,2,3,4,5,6,7,...,502,503,504,505,506,507,508,509,510,511
133737,5,-1,-0.031313,0.01446,-0.034069,-0.051523,0.055652,0.000364,-0.06767,-0.055234,...,-0.048523,-0.05908,0.060261,-0.063069,-0.032272,-0.020207,0.028496,-0.059801,0.025399,0.006905
150218,5,-1,0.020991,-0.033611,-0.053113,-0.022784,-0.065757,-0.012215,-0.067224,0.009345,...,-0.023699,-0.008298,0.027578,0.052973,-0.078004,0.086532,0.066997,-0.072254,-0.061693,0.012347
8885,5,104,0.007273,0.048132,0.034198,-0.044644,0.054894,-0.035962,-0.060243,0.060259,...,-0.056768,-0.029797,0.05818,0.009222,-0.058039,-0.004811,0.011889,-0.057989,-0.05773,-0.059011
60046,5,721,0.054615,-0.055962,0.05656,-0.059532,0.057471,-0.026571,-0.060742,0.029862,...,0.03988,0.032195,0.024019,-0.054682,-0.055463,0.055542,0.052839,-0.058693,-0.039227,-0.007376
15389,5,170,-0.040018,0.051751,-0.052735,0.05996,0.02856,0.031396,-0.060132,-0.032502,...,-0.052227,0.011865,0.047031,-0.058142,-0.054819,-0.056414,0.036939,-0.059972,0.051138,0.058576
143090,5,-1,-0.049081,-0.035454,-0.006034,0.0142,-0.03041,-0.004324,-0.067779,0.063862,...,-0.013543,-0.033897,0.060465,-0.063134,-0.068177,-0.023318,0.06333,-0.068047,0.066011,0.05104
98787,4,1145,0.004868,0.017723,0.00621,0.050997,0.065111,0.039265,-0.065057,0.045593,...,-0.064788,0.004511,0.020924,0.037246,-0.047469,0.065503,0.055525,-0.05364,0.044765,-0.011054
65458,5,799,-0.054637,0.060314,-0.023996,0.055392,0.040795,-0.046116,-0.06166,0.030176,...,-0.060553,-0.042187,0.039284,-0.061202,-0.033388,-0.046044,0.045521,-0.059731,0.026198,-0.013895
53300,5,641,-0.062531,0.053608,-0.034125,0.061215,0.064335,0.003215,-0.065356,-0.023798,...,-0.051515,-0.023293,-0.013875,0.065484,0.042777,-0.06154,-0.022861,0.060186,0.057429,0.037181
124164,3,-1,0.044355,-0.010935,-0.059218,-0.050563,0.02115,0.057773,-0.060268,-0.002302,...,-0.059217,0.01711,0.059004,0.036112,0.059312,-0.05496,0.004011,-0.06027,0.041669,0.052099


In [44]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)


In [45]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [46]:
model.score(X_test, y_test)

0.28986878851923237

In [47]:
mean_absolute_error(model.predict(X_train), y_train)

3.83026943495679e-15

In [48]:
a = y_train.mean()
y_train.mean()

4.6

In [49]:
mean_absolute_error(np.ones((len(y_train)))*a, y_train)

0.6000000000000002

In [50]:
mean_absolute_error(model.predict(X_test), y_test)

0.3018211671170669

In [51]:
models = [LinearRegression, Lasso, Ridge, SVR, RandomForestRegressor]

In [52]:
for m in models:
    model = m()
    model.fit(X_train, y_train)
    print("{}. r2_train: {:.4f}, r2_test: {:.4f}, mse_train: {:.4f}, mse_test: {:.4f}".format(
        m.__name__,
        model.score(X_train, y_train),
        model.score(X_test, y_test),
        mean_squared_error(model.predict(X_train), y_train),
        mean_squared_error(model.predict(X_test), y_test)
    ))
    

LinearRegression. r2_train: 1.0000, r2_test: 0.2899, mse_train: 0.0000, mse_test: 0.1491
Lasso. r2_train: 0.0000, r2_test: -0.0476, mse_train: 0.6900, mse_test: 0.2200
Ridge. r2_train: 0.9969, r2_test: 0.5785, mse_train: 0.0021, mse_test: 0.0885
SVR. r2_train: 0.4648, r2_test: 0.1632, mse_train: 0.3693, mse_test: 0.1757
RandomForestRegressor. r2_train: 0.9779, r2_test: -1.6000, mse_train: 0.0152, mse_test: 0.5460


# ITEM_TO_ITEM 

In [53]:
from sklearn.neighbors import NearestNeighbors #попарное расстояние между объектами

In [54]:
#col=col.drop(['course_id'])

In [55]:
df_for_user = df_for_user.drop(['rating','course_id'], axis=1)

In [56]:
col = df_for_user.columns


In [57]:
col

Index(['new_course_id',               0,               1,               2,
                     3,               4,               5,               6,
                     7,               8,
       ...
                   502,             503,             504,             505,
                   506,             507,             508,             509,
                   510,             511],
      dtype='object', length=513)

In [58]:
X_unsup = data_courses_embeddings[col]

In [59]:
sc = StandardScaler()

In [60]:
# nn = NearestNeighbors(n_neighbors=10, metric='minkowski', p=2) #var. metric='manhettan'  

In [61]:
nn = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='cosine')

In [62]:
nn.fit(X_unsup)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [63]:
df_for_user['new_course_id']

3087        42
5387        62
7190        79
7390        80
8885       104
11119      124
14792      162
15318      170
15389      170
21813      270
41297      479
41414      481
42931      512
43111      520
45049      552
46227      582
49284      596
53300      641
57090      673
60046      721
62299      751
65458      799
66709      816
79513      959
89016     1072
96071     1125
98304     1144
98305     1144
98787     1145
103879    1214
106231    1250
106302    1250
113171      -1
124164      -1
126586      -1
126606      -1
128432      -1
128910      -1
133019      -1
133484      -1
133737      -1
133769      -1
141485      -1
142592      -1
143024      -1
143090      -1
143954      -1
150071      -1
150218      -1
152109      -1
Name: new_course_id, dtype: int64

In [64]:
df_for_user.head(3)

Unnamed: 0,new_course_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
3087,42,-0.017314,-0.001662,-0.02784,-0.061607,0.042819,-0.041902,-0.059788,0.0247,0.044916,...,0.041509,-0.056876,0.063003,0.000688,-0.022695,0.037509,-0.023223,-0.057655,-0.0101,0.045322
5387,62,-0.008544,-0.022144,0.048564,0.030201,0.011163,-0.021466,-0.076632,-0.052792,0.034863,...,0.014264,-0.069827,0.010156,0.009863,0.024559,-0.037045,0.028299,-0.065835,-0.064841,0.011514
7190,79,0.029129,0.006605,-0.054073,-0.06188,0.057992,0.051585,-0.061866,0.03473,0.056024,...,-0.049401,-0.034935,0.066914,0.01557,-0.056683,0.030885,-0.006883,-0.049098,0.023755,-0.057493


In [65]:
courses_eng[courses_eng['new_course_id'] == 80]

Unnamed: 0,new_course_id,course_id
57,80,ancient-marine-reptiles


In [66]:
nn.kneighbors(df_for_user[df_for_user['new_course_id'] == 80][col]) #возвращает id и расстояние до ближайших курсов
nn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [67]:
neighbors = nn.kneighbors(df_for_user[df_for_user['new_course_id'] == 80][col]) 
neighbors

(array([[8.88178420e-16, 6.22656117e-05, 6.24964167e-05, 6.68312342e-05,
         6.78661378e-05, 6.94024613e-05, 6.95084680e-05, 6.97580132e-05,
         7.01014341e-05, 7.03582946e-05]]),
 array([[ 57, 338, 362, 179, 363, 106, 131, 425, 364, 186]], dtype=int64))

In [68]:
data_courses_embeddings.iloc[neighbors[1][0]]

Unnamed: 0,course_id,0,1,2,3,4,5,6,7,8,...,503,504,505,506,507,508,509,510,511,new_course_id
57,ancient-marine-reptiles,-0.027196,0.062703,0.018533,-0.031129,0.020533,0.065593,-0.065088,-0.057435,-0.025556,...,0.056668,0.051834,-0.017262,0.024354,-0.061549,0.044994,-0.055898,-0.046579,-0.014056,80
338,dino101,-0.061251,-0.033903,0.034087,0.060027,0.03052,0.006023,-0.07033,-0.03649,-0.008568,...,0.062748,0.042215,-0.061651,-0.038642,-0.052277,0.040712,-0.027639,0.039761,-0.025011,451
362,early-vertebrate-evolution,-0.015435,0.026983,-0.02178,0.044318,0.005881,0.061512,-0.071372,-0.022343,0.028361,...,-0.02644,0.04941,-0.069754,0.032357,-0.052468,0.04989,-0.067319,-0.034471,-0.013737,481
179,changing-arctic,-0.038721,-0.034537,-0.064661,0.015488,0.057139,0.012153,-0.068226,0.054072,0.020669,...,-0.0471,-0.045643,0.00964,-0.060391,-0.050679,0.006961,-0.059467,-0.027682,-0.039804,244
363,earth-amnh,0.055042,0.000593,-0.066244,-0.055722,0.03067,0.005536,-0.066686,0.05924,0.024047,...,0.032477,0.066464,-0.060602,-0.007536,-0.033887,0.046037,-0.064667,0.005629,-0.045169,482
106,becoming-a-veterinarian,-0.052769,0.017895,0.044355,0.070823,0.003081,0.034146,-0.070907,-0.045489,-0.011852,...,-0.011756,0.002633,-0.030082,-0.069238,-0.05128,0.055079,-0.064356,-0.029166,-0.02832,143
131,biological-diversity,0.006164,-0.002703,-0.061931,-0.058711,0.037066,-0.03766,-0.063123,0.020778,0.029938,...,-0.003102,0.013313,-0.035621,-0.061024,-0.054012,0.052995,-0.062375,0.001895,-0.026525,175
425,evolution-today,-0.034808,0.05824,0.012108,0.045887,0.043702,-0.051557,-0.055843,0.04781,-0.043124,...,-0.033729,0.062667,-0.039766,0.005145,0.053868,0.030185,-0.055958,0.060343,-0.011592,583
364,earth-climate-change,-0.036185,-0.021097,-0.061691,0.02675,0.062992,0.023586,-0.062663,-0.000149,0.016529,...,-0.042928,0.048776,0.037624,-0.036222,-0.045056,0.05078,-0.048987,-0.000134,-0.016552,483
186,chickens,-0.051215,-0.04483,0.061236,0.060189,0.058194,0.051456,-0.059735,-0.046091,-0.049992,...,-0.046831,-0.042418,-0.033358,-0.024213,-0.060631,0.057451,-0.059085,0.016569,-0.016709,251
