In [1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from tqdm import tqdm_notebook

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

import seaborn as sns

import multiprocessing

import re

import nltk
from nltk.corpus import stopwords

%matplotlib inline

In [2]:
data_tokenize_lemmatize = pd.read_csv('data_tokenize_lemmatize.csv')
data_tokenize_lemmatize.head(2)

Unnamed: 0.1,Unnamed: 0,new_course_id,course_id,title_tokenize_lemmat,about_tokenize_lemmat,syllabus_tokenize_lemmat,instructors_tokenize_lemmat,key_words_tokenize_lemmat,recommendations_tokenize_lemmat
0,0,0,2-speed-it,two speed company surf digital wave bcg perspe...,transform disappear darwinism order adapt digi...,introduction cio digital world steer balance s...,antoine gour vitch vanessa lyon eric baudson,business business essential,fundamentals-of-management entrepreneurial-thi...
1,1,5,3d-printing-applications,printing application,help understand printing applied across number...,orientation module printing new making module ...,vishal sachdev,business entrepreneurship,mechanic beam-bending


In [3]:
len(data_tokenize_lemmatize)

1299

In [4]:
data_tokenize_lemmatize=data_tokenize_lemmatize.drop(['Unnamed: 0'], axis=1)

In [5]:
data_tokenize_lemmatize.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 8 columns):
new_course_id                      1299 non-null int64
course_id                          1299 non-null object
title_tokenize_lemmat              1298 non-null object
about_tokenize_lemmat              1299 non-null object
syllabus_tokenize_lemmat           1296 non-null object
instructors_tokenize_lemmat        1297 non-null object
key_words_tokenize_lemmat          1299 non-null object
recommendations_tokenize_lemmat    1259 non-null object
dtypes: int64(1), object(7)
memory usage: 81.3+ KB


# TFIDF на title

In [6]:
data_tokenize_lemmatize['title_tokenize_lemmat'] = data_tokenize_lemmatize['title_tokenize_lemmat'].astype(str)

In [7]:
cnt_vec = CountVectorizer() # создаёт векторы - количество равно количеству категорий и 
                            #записывает в каждый слот количество встречаемости данной характеристики для объекта 
processed = cnt_vec.fit_transform(data_tokenize_lemmatize['title_tokenize_lemmat'])
tfidf = TfidfTransformer()

In [8]:
tfidf_dense = tfidf.fit_transform(processed).todense()
tfidf_dense

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.52918504,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.5       ]])

In [9]:
cnt_vec.vocabulary_

{'two': 1516,
 'speed': 1367,
 'company': 254,
 'surf': 1413,
 'digital': 374,
 'wave': 1577,
 'bcg': 114,
 'perspective': 1080,
 'printing': 1151,
 'application': 61,
 'revolution': 1253,
 'software': 1344,
 'anatomy': 51,
 'abdomen': 0,
 'pelvis': 1069,
 'journey': 796,
 'basis': 111,
 'clinic': 229,
 'introduction': 766,
 'ableton': 1,
 'live': 844,
 'aboriginal': 2,
 'worldviews': 1606,
 'education': 419,
 'academic': 4,
 'discussion': 388,
 'english': 458,
 'literacy': 842,
 'project': 1173,
 'writing': 1611,
 'research': 1233,
 'paper': 1054,
 'information': 724,
 'seeking': 1304,
 'accounting': 6,
 'analytics': 49,
 'finance': 543,
 'professional': 1166,
 'managerial': 870,
 'cost': 304,
 'behavior': 122,
 'system': 1423,
 'analysis': 47,
 'art': 82,
 'mooc': 948,
 'activism': 13,
 'social': 1338,
 'movement': 954,
 'addicted': 18,
 'brain': 156,
 'adhd': 20,
 'everyday': 496,
 'strategy': 1389,
 'elementary': 435,
 'student': 1397,
 'adjective': 21,
 'clause': 224,
 'advanced':

In [10]:
columns = [None for i in range(len(cnt_vec.vocabulary_))]
for n in cnt_vec.vocabulary_:
    columns[cnt_vec.vocabulary_[n]] = n
    
columns

['abdomen',
 'ableton',
 'aboriginal',
 'abstract',
 'academic',
 'access',
 'accounting',
 'achievement',
 'acquisition',
 'across',
 'act',
 'action',
 'active',
 'activism',
 'activity',
 'actor',
 'actuation',
 'acute',
 'addicted',
 'addressing',
 'adhd',
 'adjective',
 'administrative',
 'adolescent',
 'advanced',
 'adverb',
 'advertising',
 'advice',
 'aerial',
 'africa',
 'age',
 'ageing',
 'agent',
 'agile',
 'agricultural',
 'agriculture',
 'aid',
 'aim',
 'algorithm',
 'algorithmic',
 'allocation',
 'alternative',
 'altruism',
 'america',
 'american',
 'an',
 'analysing',
 'analysis',
 'analytic',
 'analytics',
 'analyzing',
 'anatomy',
 'ancient',
 'android',
 'angular',
 'angularjs',
 'animal',
 'anticipating',
 'antimicrobial',
 'ap',
 'app',
 'application',
 'applied',
 'apply',
 'applying',
 'appreciative',
 'approach',
 'approaching',
 'approximation',
 'apps',
 'apr',
 'ar',
 'arch',
 'archaeoastronomy',
 'archaeology',
 'architecting',
 'architecture',
 'arctic',
 'a

In [11]:
len(columns)

1616

In [12]:
data_tfidf_title = pd.DataFrame(tfidf_dense, columns=columns)
data_tfidf_title.head()

Unnamed: 0,abdomen,ableton,aboriginal,abstract,academic,access,accounting,achievement,acquisition,across,...,worldviews,worm,worst,worth,write,writing,written,year,young,zika
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.416308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# TFIDF на key_words

In [13]:
data_tokenize_lemmatize['key_words_tokenize_lemmat'] = data_tokenize_lemmatize['key_words_tokenize_lemmat'].astype(str)

In [14]:
cnt_vec = CountVectorizer()
processed = cnt_vec.fit_transform(data_tokenize_lemmatize['key_words_tokenize_lemmat'])
tfidf = TfidfTransformer()

In [15]:
tfidf_dense = tfidf.fit_transform(processed).todense()
len(cnt_vec.vocabulary_)

1027

In [16]:
columns = [None for i in range(len(cnt_vec.vocabulary_))]
for n in cnt_vec.vocabulary_:
    columns[cnt_vec.vocabulary_[n]] = n

In [17]:
data_tfidf_key_words = pd.DataFrame(tfidf_dense, columns=columns)
data_tfidf_key_words.head()

Unnamed: 0,abelton,abstract,abuse,academic,accessibility,accounting,accrual,acquisition,active,activity,...,wordpress,work,workflow,wrangling,writing,wxpython,xcode,xml,xp,yield
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
#объединяем рассчитанные значения tfidf по колонкам title & key_words в один большой датафрейм 

In [19]:
data_num_tfidf = data_tokenize_lemmatize[['new_course_id','course_id']]

In [20]:
#label_encoder = LabelEncoder()
#data_num_tfidf['new_course_id'] = pd.Series(label_encoder.fit_transform(data_tokenize_lemmatize['course_id']))
data_num_tfidf.head()

Unnamed: 0,new_course_id,course_id
0,0,2-speed-it
1,5,3d-printing-applications
2,6,3d-printing-revolution
3,7,3d-printing-software
4,9,abdomen-anatomy


In [21]:
len(data_num_tfidf)

1299

In [22]:
data_num_tfidf = data_num_tfidf.join(data_tfidf_title, lsuffix='_caller', rsuffix='_other')
data_num_tfidf.head()

Unnamed: 0,new_course_id,course_id,abdomen,ableton,aboriginal,abstract,academic,access,accounting,achievement,...,worldviews,worm,worst,worth,write,writing,written,year,young,zika
0,0,2-speed-it,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,3d-printing-applications,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,3d-printing-revolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,3d-printing-software,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,abdomen-anatomy,0.416308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
data_num_tfidf = data_num_tfidf.join(data_tfidf_key_words, lsuffix='_caller', rsuffix='_other')
data_num_tfidf.head()

Unnamed: 0,new_course_id,course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
0,0,2-speed-it,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,3d-printing-applications,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,3d-printing-revolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,3d-printing-software,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,abdomen-anatomy,0.416308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# сохраним data_t_s  
data_num_tfidf.to_csv('data_num_tfidf_key_words_title.csv') #выполняется 1 раз

## Рекомендации на основе содержания

In [25]:
# ITEM_TO_USER

In [26]:
data_tfidf_course = pd.read_csv('data_num_tfidf_key_words_title.csv')
data_tfidf_course.head(2)

Unnamed: 0.1,Unnamed: 0,new_course_id,course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
0,0,0,2-speed-it,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,5,3d-printing-applications,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
#избавимся от лишних столбцов
data_tfidf_course=data_tfidf_course.drop(['Unnamed: 0'], axis=1)

In [28]:
len(data_tfidf_course)

1299

In [29]:
#достанем информацию об оценках пользователей из data.csv
data_reviews = pd.read_csv('reviews_eng_courses_1.csv')
data_reviews.head()

Unnamed: 0.1,Unnamed: 0,course_id,reviewer_name,rating,review_text,reviewer_id
0,0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,60888
1,1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,22416
2,2,2-speed-it,Viswas P,4,Videos that are presented in French could've b...,76208
3,3,2-speed-it,AN L,3,"The course content is quite good, though it co...",510
4,4,2-speed-it,Konstantin A,5,"Great piece of work, I especially liked a few ...",39708


In [30]:
#достанем информацию об оценках пользователей из data.csv
data_reviews = pd.read_csv('reviews_eng_courses_1.csv')
#избавимся от лишних столбцов
data_reviews=data_reviews.drop(['Unnamed: 0'], axis=1)
data_reviews=data_reviews.drop(['reviewer_name'], axis=1)
data_reviews=data_reviews.drop(['review_text'], axis=1)

data_reviews.head(2)

Unnamed: 0,course_id,rating,reviewer_id
0,2-speed-it,5,60888
1,2-speed-it,2,22416


In [31]:
data_reviews.rating = data_reviews.rating.astype(int)

In [32]:
#объединим датафрейм описания англоязычных курсов с отзывами пользователей 
data_tfidf =  data_reviews.join(data_tfidf_course.set_index('course_id'), on='course_id')
data_tfidf.head(2)

Unnamed: 0,course_id,rating,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
0,2-speed-it,5,60888,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2-speed-it,2,22416,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
#объединим с датафреймом по оценкам пользователей


In [34]:
#посмотрим сколько отзывов поставил каждый пользователь
data_tfidf.groupby('reviewer_id')[['new_course_id']].count().sort_values('new_course_id', ascending=False)

Unnamed: 0_level_0,new_course_id
reviewer_id,Unnamed: 1_level_1
-1,825
31616,136
34943,125
36039,100
61783,66
51060,65
48932,60
16636,59
17412,58
19166,57


In [35]:
data_tfidf.groupby('new_course_id')[['rating']].count().sort_values('rating', ascending=False)

Unnamed: 0_level_0,rating
new_course_id,Unnamed: 1_level_1
-1,45347
1080,928
665,907
384,897
51,893
394,866
1111,844
995,808
974,801
1250,787


In [36]:
#удалённые имена пользователей = -1

In [37]:
#выберем очень активного пользователя с reviewer_id = 36039
TARGET_USER = 10210
#выбарем все оцененные этим пользователем курсы
df_for_user = data_tfidf[data_tfidf['reviewer_id']==TARGET_USER]
df_for_user

Unnamed: 0,course_id,rating,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
3087,ageofjefferson,5,10210,42,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5387,altruism,5,10210,62,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7190,ancient-greeks,5,10210,79,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7390,ancient-marine-reptiles,4,10210,80,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8885,archoftitus,5,10210,104,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11119,audio-engineering,4,10210,124,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14792,big-history,5,10210,162,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15318,bioinformatics-methods-1,5,10210,170,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15389,bioinformatics-methods-1,5,10210,170,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21813,client-needs-and-software-requirements,4,10210,270,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df_for_user = df_for_user.loc[(df_for_user['new_course_id'] != -1) &
                                              (df_for_user['new_course_id'] != 1144) &
                                              (df_for_user['new_course_id'] != 170) &
                                              (df_for_user['new_course_id'] != 1250)]

In [39]:
df_for_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26 entries, 3087 to 103879
Columns: 2647 entries, course_id to yield
dtypes: float64(2643), int32(1), int64(2), object(1)
memory usage: 537.8+ KB


In [40]:
df_for_user['rating'] = df_for_user.rating.fillna(value = 0)


In [41]:
df_for_user = df_for_user.drop(['course_id'], axis=1)


In [42]:
len(df_for_user)

26

In [43]:
#df_1 = data_for_user.drop(['rating','average_score'], axis=1)
df_for_user_1 = df_for_user.drop(['rating'], axis=1)
df_for_user_1.head(3)

Unnamed: 0,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
3087,10210,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5387,10210,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7190,10210,79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
from sklearn.linear_model import LinearRegression # метод наименьших квадратов
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [45]:
df_for_user['rating']

3087      5
5387      5
7190      5
7390      4
8885      5
11119     4
14792     5
21813     4
41297     5
41414     5
42931     5
43111     5
45049     5
46227     5
49284     4
53300     5
57090     5
60046     5
62299     5
65458     5
66709     5
79513     5
89016     4
96071     5
98787     4
103879    5
Name: rating, dtype: int32

In [46]:
#y = pd.Series(df_for_user['rating']).array
#y

In [47]:
#разделим обучающую и тестовую выборки
from sklearn.model_selection import train_test_split
X, y = df_for_user_1, df_for_user['rating']
X.head(10)

Unnamed: 0,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
3087,10210,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5387,10210,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7190,10210,79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7390,10210,80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8885,10210,104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11119,10210,124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14792,10210,162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21813,10210,270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41297,10210,479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41414,10210,481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
len(X)

26

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [50]:
X_train

Unnamed: 0,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
62299,10210,751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43111,10210,520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89016,10210,1072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11119,10210,124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46227,10210,582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45049,10210,552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49284,10210,596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7190,10210,79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96071,10210,1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57090,10210,673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
 X_test

Unnamed: 0,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
7390,10210,80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53300,10210,641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5387,10210,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41297,10210,479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21813,10210,270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65458,10210,799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.252863,0.0,0.0,0.0,0.0,0.0


In [52]:
from sklearn.preprocessing import StandardScaler

In [53]:
X.shape[0]

26

In [54]:
y.shape[0]

26

In [55]:
y.value_counts()

5    20
4     6
Name: rating, dtype: int64

In [56]:
X.new_course_id.value_counts()

959     1
673     1
641     1
582     1
520     1
1214    1
270     1
79      1
80      1
721     1
596     1
816     1
799     1
481     1
162     1
1125    1
104     1
42      1
479     1
552     1
751     1
1072    1
62      1
1145    1
124     1
512     1
Name: new_course_id, dtype: int64

In [57]:
#data_tokenize_lemmatize['title_tokenize_lemmat']  = [lemmat_(t) for t in tqdm(data_normalized_tokenize.title_normalized_tokenize)] 

In [58]:
 X_train

Unnamed: 0,reviewer_id,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
62299,10210,751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43111,10210,520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89016,10210,1072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11119,10210,124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46227,10210,582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45049,10210,552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49284,10210,596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7190,10210,79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96071,10210,1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57090,10210,673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)


In [60]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [61]:
model.score(X_test, y_test)

0.11450071006526741

In [62]:
mean_absolute_error(model.predict(X_train), y_train)

5.773159728050814e-16

In [63]:
a = y_train.mean()
y_train.mean()

4.8

In [64]:
mean_absolute_error(np.ones((len(y_train)))*a, y_train)

0.3200000000000001

In [65]:
mean_absolute_error(model.predict(X_test), y_test)

0.3514911264769025

In [66]:
models = [LinearRegression, Lasso, Ridge, SVR, RandomForestRegressor]

In [67]:
for m in models:
    model = m()
    model.fit(X_train, y_train)
    print("{}. r2_train: {:.4f}, r2_test: {:.4f}, mse_train: {:.4f}, mse_test: {:.4f}".format(
        m.__name__,
        model.score(X_train, y_train),
        model.score(X_test, y_test),
        mean_squared_error(model.predict(X_train), y_train),
        mean_squared_error(model.predict(X_test), y_test)
    ))
    

LinearRegression. r2_train: 1.0000, r2_test: 0.1145, mse_train: 0.0000, mse_test: 0.1968
Lasso. r2_train: 0.0000, r2_test: -0.0800, mse_train: 0.1600, mse_test: 0.2400
Ridge. r2_train: 0.9999, r2_test: 0.0897, mse_train: 0.0000, mse_test: 0.2023
SVR. r2_train: 0.1742, r2_test: -0.1830, mse_train: 0.1321, mse_test: 0.2629
RandomForestRegressor. r2_train: 0.7813, r2_test: -0.1250, mse_train: 0.0350, mse_test: 0.2500


In [68]:
df_for_user['new_course_id']

3087        42
5387        62
7190        79
7390        80
8885       104
11119      124
14792      162
21813      270
41297      479
41414      481
42931      512
43111      520
45049      552
46227      582
49284      596
53300      641
57090      673
60046      721
62299      751
65458      799
66709      816
79513      959
89016     1072
96071     1125
98787     1145
103879    1214
Name: new_course_id, dtype: int64

# ITEM_TO_ITEM 

In [69]:
from sklearn.neighbors import NearestNeighbors #попарное расстояние между объектами

In [70]:
#col=col.drop(['course_id'])

In [71]:
df_for_user = df_for_user.drop(['reviewer_id', 'rating'], axis=1)

In [72]:
col = df_for_user.columns


In [73]:
col

Index(['new_course_id', 'abdomen', 'ableton', 'aboriginal', 'abstract_caller',
       'academic_caller', 'access', 'accounting_caller', 'achievement',
       'acquisition_caller',
       ...
       'wordpress', 'work_other', 'workflow', 'wrangling', 'writing_other',
       'wxpython', 'xcode', 'xml', 'xp', 'yield'],
      dtype='object', length=2644)

In [74]:
X_unsup = data_tfidf_course[col]

In [75]:
sc = StandardScaler()

In [76]:
# nn = NearestNeighbors(n_neighbors=10, metric='minkowski', p=2) #var. metric='manhettan'  

In [77]:
nn = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='cosine')

In [78]:
nn.fit(X_unsup)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [79]:
df_for_user['new_course_id']

3087        42
5387        62
7190        79
7390        80
8885       104
11119      124
14792      162
21813      270
41297      479
41414      481
42931      512
43111      520
45049      552
46227      582
49284      596
53300      641
57090      673
60046      721
62299      751
65458      799
66709      816
79513      959
89016     1072
96071     1125
98787     1145
103879    1214
Name: new_course_id, dtype: int64

In [86]:
df_for_user.head(3)

Unnamed: 0,new_course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,acquisition_caller,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
3087,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5387,62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7190,79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
data_tfidf_course[data_tfidf_course['new_course_id'] == 80]

Unnamed: 0,new_course_id,course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
57,80,ancient-marine-reptiles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
nn.kneighbors(df_for_user[df_for_user['new_course_id'] == 80][col]) #возвращает id и расстояние до ближайших курсов
nn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [83]:
neighbors = nn.kneighbors(df_for_user[df_for_user['new_course_id'] == 80][col]) 
neighbors

(array([[1.11022302e-16, 1.42366656e-04, 1.44909543e-04, 1.48808114e-04,
         1.49771454e-04, 1.50885865e-04, 1.51026467e-04, 1.51233121e-04,
         1.51441663e-04, 1.51465083e-04]]),
 array([[ 57, 338, 362, 348, 506, 313, 387, 425, 470, 850]], dtype=int64))

In [84]:
data_tfidf_course.iloc[neighbors[1][0]]

Unnamed: 0,new_course_id,course_id,abdomen,ableton,aboriginal,abstract_caller,academic_caller,access,accounting_caller,achievement,...,wordpress,work_other,workflow,wrangling,writing_other,wxpython,xcode,xml,xp,yield
57,80,ancient-marine-reptiles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
338,451,dino101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
362,481,early-vertebrate-evolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,465,dog-emotion-and-cognition,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
506,687,genetics-evolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
313,419,dental-medicine-penn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
387,519,emergence-of-life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
425,583,evolution-today,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
470,641,forensic-science,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
850,1150,music-as-biology,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
