# Диплом НЕТОЛОГИИ

## Рекомендательная система для подбора образовательных курсов



Задача: рекомендовать пользователю наилучший образовательный курс по его запросу и предпочтениям
Идея: сравнивать образовательные курсы так же как сравнивают фильмы - по оценкам пользователей, жанрам (областям научного знания)

## Импорт библиотек

In [23]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from tqdm import tqdm_notebook

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score

import seaborn as sns

import multiprocessing

import re

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

%matplotlib inline

## Загрузка данных

In [24]:
#датасет с отзывами по англоязычным курсам
reviews_eng_courses = pd.read_csv('reviews_eng_courses.csv')
reviews_eng_courses.head()

Unnamed: 0,url,course_id,reviewer_name,rating,review_text
0,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs
1,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Etienne R,2,This course does not say anything about digiti...
2,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Viswas P,4,Videos that are presented in French could've b...
3,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,AN L,3,"The course content is quite good, though it co..."
4,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Konstantin A,5,"Great piece of work, I especially liked a few ..."


In [25]:
reviews_eng_courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160092 entries, 0 to 160091
Data columns (total 5 columns):
url              160092 non-null object
course_id        160092 non-null object
reviewer_name    159841 non-null object
rating           160092 non-null int64
review_text      160085 non-null object
dtypes: int64(1), object(4)
memory usage: 6.1+ MB


## Базовая рекомендация курсов на основании оценок пользователей - холодный старт

In [26]:
# посмотрим на количество рейтингов по каждому курсу 
course_id_num_ratings = {}

for i, group in tqdm_notebook(reviews_eng_courses.groupby('course_id')):
    course_id_num_ratings[i] = group.reviewer_name.unique().shape[0]
    
course_id_num_ratings

HBox(children=(IntProgress(value=0, max=1298), HTML(value='')))




{'2-speed-it': 33,
 '3d-printing-applications': 88,
 '3d-printing-revolution': 204,
 '3d-printing-software': 58,
 'abdomen-anatomy': 89,
 'ableton-live': 239,
 'aboriginal-education': 33,
 'academic-discussion-english': 6,
 'academic-literacy': 27,
 'academic-writing-capstone': 17,
 'academicinfoseek': 47,
 'accounting-analytics': 312,
 'accounting-finance': 25,
 'accounting-for-managers': 98,
 'activism-social-movements': 16,
 'addiction-and-the-brain': 304,
 'adhd-treatment': 215,
 'adjective-clauses': 112,
 'advanced-algorithms-and-complexity': 67,
 'advanced-chemistry': 96,
 'advanced-competitive-strategy': 105,
 'advanced-data-structures': 228,
 'advanced-excel': 232,
 'advanced-manufacturing-enterprise': 22,
 'advanced-manufacturing-process-analysis': 45,
 'advanced-modeling': 12,
 'advanced-neurobiology1': 72,
 'advanced-r': 91,
 'advanced-trading-algorithms': 22,
 'advanced-writing': 137,
 'advancedneurobiologyii': 17,
 'ageing': 14,
 'ageofjefferson': 83,
 'agile-planning-for-

In [27]:
# возьмем топ курсов по оценкам
top_courses = reviews_eng_courses.groupby('course_id')[['rating']].mean().sort_values('rating', ascending=False)
# из них возьмем курсы с наивысшей средней оценкой в 5.0
courses_with_highest_rating = top_courses.iloc[np.where(top_courses.rating == 5.0)].index
#в принципе эту выборку уже можно использовать для самой простой базовой рекомендации новому пользователю
courses_with_highest_rating

Index(['soscapproachchina1', 'advanced-modeling',
       'analysis-business-problem-iese', 'interpersonal-communication', 'aoo',
       'market-efficiency', 'e-learning', 'transreality-gaming', 'bd2k-lincs',
       'fundamentals-of-computing-capstone', 'big-data-cloud-computing-cdn',
       'investment-strategy-capstone', 'bioinformatics-project',
       'teaching-common-grammar-mistakes', 'professional-brand',
       'programming-languages-part-c', 'youth-health',
       'business-english-capstone', 'systems-biology-capstone',
       'linear-models-2', 'career-planning', 'multimodal-literacies'],
      dtype='object', name='course_id')

In [28]:
# выведем топ курсов со средней оценкой в 5.0 по количеству отзывов
sorted([(course_id_num_ratings[f], f) for f in courses_with_highest_rating], key=lambda x: x[0], reverse=True)[:10]

[(54, 'programming-languages-part-c'),
 (18, 'fundamentals-of-computing-capstone'),
 (17, 'youth-health'),
 (12, 'advanced-modeling'),
 (10, 'business-english-capstone'),
 (7, 'professional-brand'),
 (5, 'big-data-cloud-computing-cdn'),
 (4, 'bd2k-lincs'),
 (3, 'soscapproachchina1'),
 (3, 'analysis-business-problem-iese')]

In [29]:
# средний рейтинг на каждый курс
course_id_mean_rating = {}

for i, group in tqdm_notebook(reviews_eng_courses.groupby('course_id')):
    course_id_mean_rating[i] = group.rating.mean()
    
course_id_mean_rating

HBox(children=(IntProgress(value=0, max=1298), HTML(value='')))




{'2-speed-it': 4.303030303030303,
 '3d-printing-applications': 4.431818181818182,
 '3d-printing-revolution': 4.717073170731707,
 '3d-printing-software': 4.413793103448276,
 'abdomen-anatomy': 4.786516853932584,
 'ableton-live': 4.75,
 'aboriginal-education': 4.515151515151516,
 'academic-discussion-english': 4.333333333333333,
 'academic-literacy': 4.592592592592593,
 'academic-writing-capstone': 4.764705882352941,
 'academicinfoseek': 4.595744680851064,
 'accounting-analytics': 4.4006410256410255,
 'accounting-finance': 4.16,
 'accounting-for-managers': 4.73469387755102,
 'activism-social-movements': 4.6875,
 'addiction-and-the-brain': 4.488524590163935,
 'adhd-treatment': 4.714285714285714,
 'adjective-clauses': 4.714285714285714,
 'advanced-algorithms-and-complexity': 4.104477611940299,
 'advanced-chemistry': 4.65625,
 'advanced-competitive-strategy': 4.609523809523809,
 'advanced-data-structures': 4.7105263157894735,
 'advanced-excel': 4.797413793103448,
 'advanced-manufacturing-en

In [30]:
# достанем простые статистики по количеству рейтингов
min_num_ratings = np.min([course_id_num_ratings[f] for f in course_id_num_ratings.keys()])
max_num_ratings = np.max([course_id_num_ratings[f] for f in course_id_num_ratings.keys()])
mean_num_ratings = np.mean([course_id_num_ratings[f] for f in course_id_num_ratings.keys()])
median_num_ratings = np.median([course_id_num_ratings[f] for f in course_id_num_ratings.keys()])
print('Минимальное количество оценок на курс: ', min_num_ratings)
print('Максимальное количество оценок на курс: ', max_num_ratings)
print(f'Среднее количество оценок на курс: {mean_num_ratings:.2f}')
print('Медианное количество оценок на курс: ', median_num_ratings)

Минимальное количество оценок на курс:  1
Максимальное количество оценок на курс:  932
Среднее количество оценок на курс: 122.73
Медианное количество оценок на курс:  61.0


In [31]:
# Oбъединённый датафрейм описания англоязычных курсов с отзывами пользователей 
joined_users_reviews = pd.read_csv('joined_users_reviews.csv')
#избавимся от лишних столбцов
joined_users_reviews=joined_users_reviews.drop(['Unnamed: 0'], axis=1)
joined_users_reviews['skills'] = joined_users_reviews.skills.fillna(value = '')
joined_users_reviews.head(2)

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url
0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it
1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it


In [32]:
#подготовленный датасет
data = joined_users_reviews
data.head(2)

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url
0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it
1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it


### Рекомендация курсов на основании оценок и количества отзывов по курсу
- для нового пользователя
- по запросу "выучить что-нибудь новое"

In [33]:
# отсортируем курсы по следующей метрике: средняя оценка курса, умноженная на нормированное количество рейтингов
course_new_mark = []
for f in course_id_num_ratings.keys():
    course_new_mark.append(
        (f, course_id_mean_rating[f] * (course_id_num_ratings[f] - mean_num_ratings) / (max_num_ratings - min_num_ratings)))
    
    

In [34]:
# топ 5 курсов по средним оценкам и количеству рейтингов
top_5 = list(sorted(course_new_mark, key=lambda x: x[1], reverse=True))[:5]
top_5

[('matlab', 4.210775570760129),
 ('game-development', 3.9745285499775154),
 ('project-management-basics', 3.8865173314100336),
 ('science-of-meditation', 3.8255755398407887),
 ('algorithmic-toolbox', 3.7213195462974857)]

In [40]:
# сортируем курсы по средним оценкам и количеству рейтингов
top_ = dict(sorted(course_new_mark, key=lambda x: x[1], reverse=True))
dict_keys = list(top_.keys()) # список course_id - результат отбора и сортировки - решение задачи 1

In [41]:
for k in top_.keys(): 
    print(k)

matlab
game-development
project-management-basics
science-of-meditation
algorithmic-toolbox
mindshift
data-cleaning
language-theories
data-science-course
learn-to-program
philosophy
uva-darden-design-thinking-innovation
ml-regression
professional-emails-english
positive-psychology-visionary-science
object-oriented-java
careerdevelopment
introduction-psych
website-coding
search-engine-optimization
introcss
python-programming-introduction
gis
wharton-quantitative-modeling
arduino-platform
dog-emotion-and-cognition
positive-psychology
interactive-python-1
probability-intro
introduction-to-software-product-management
excel-data-analysis
exploratory-data-analysis
decision-making
what-is-social
sustainable-development
psychological-first-aid
financial-markets
craft-of-plot
javascript
basic-statistics
python-data-visualization
speak-english-professionally
private-equity
exposure-photography
analytics-excel
strategic-management
photography
managing-human-resources
crypto
analytics-mysql
solid-

big-ideas
happiness
zhexue-daolun
neurohacking
ethical-social-media
approximation-algorithms-part-1
personalizedmed
duke-programming-web
music-and-social-action
c-plus-plus-b
healthcare-delivery-providers
tech-startup
film-images
economy-russia-transition
teaching-learning-tools
performance-assessment
humanitarian-communication
macroeconomic-factors
chronic-pain
html
python-databases
materials-structures
entrepreneurship-development
general-relativity
teaching-learning
marketingplan
forecasting-models-marketing-decisions
r-capstone
gis-capstone
teachingscience
health-care-innovation
health-che
management-skills-international-business
ebola-virus
resume-writing
how-to-teach-us
philanthropist
marketing-strategy-entrepreneurs
biostatistics-2
analyze
yingyuyanjiang
noun-clauses-conditionals
strategy-law-ethics
ideas
valuation-multiples
sport-sponsorship
enumerative-combinatorics
augmented-reality
presentations-speaking-so-that-people-listen
cloud-computing-adoption
digital-thread-implement

In [42]:
m=0
n=5
keys = list(top_.keys())
print(keys[m:n])

['matlab', 'game-development', 'project-management-basics', 'science-of-meditation', 'algorithmic-toolbox']


In [47]:
df_by_course_title = pd.DataFrame()
m=0
n=5 # задаём количество курсов для вывода
    
# выводим топ  курсов по средним оценкам и количеству рейтингов с названиями и ссылками на курсы
print('Топ курсов по средним оценкам и количеству рейтингов: ')
for k in dict_keys[m:n]: 
    filter_course_id = data['course_id'] == k
    df_by_course_id = data.loc[filter_course_id][0:1]
    df_by_course_title = df_by_course_title.append(df_by_course_id[['title', 'url']])
    
for i in range(n): 
    a = df_by_course_title.iloc[i].values
    print(a[0], ' ', a[1])  

Топ курсов по средним оценкам и количеству рейтингов: 
Introduction to Programming with MATLAB   https://www.coursera.org/learn/matlab
Introduction to Game Development   https://www.coursera.org/learn/game-development
Project Management: The Basics for Success   https://www.coursera.org/learn/project-management-basics
Buddhism and Modern Psychology    https://www.coursera.org/learn/science-of-meditation
Algorithmic Toolbox   https://www.coursera.org/learn/algorithmic-toolbox


In [51]:
# добавим ссылки - выводим топ n курсов по средним оценкам и количеству рейтингов с названиями и ссылками на курсы
df_by_course_title = pd.DataFrame()
m=0
n=5 # задаём количество курсов для вывода
    
# выводим топ  курсов по средним оценкам и количеству рейтингов с названиями и ссылками на курсы
print('Топ курсов по средним оценкам и количеству рейтингов: ')
for k in dict_keys[m:n]: 
    filter_course_id = data['course_id'] == k
    df_by_course_id = data.loc[filter_course_id][0:1]
    df_by_course_title = df_by_course_title.append(df_by_course_id[['title', 'url']])
    
for i in range(n): 
    a = df_by_course_title.iloc[i].values
    print(a[0], ' ', a[1])
    

Топ курсов по средним оценкам и количеству рейтингов: 
Introduction to Programming with MATLAB   https://www.coursera.org/learn/matlab
Introduction to Game Development   https://www.coursera.org/learn/game-development
Project Management: The Basics for Success   https://www.coursera.org/learn/project-management-basics
Buddhism and Modern Psychology    https://www.coursera.org/learn/science-of-meditation
Algorithmic Toolbox   https://www.coursera.org/learn/algorithmic-toolbox


#### Проверяем релевантность выдачи для пользователя
вместе со списком топовых курсов выдаём вопрос "уже учил?" 
- "нет" - переходи по ссылке на курс
- "да" - выдаём следующие 5 по топу популярности n+=5

#### list(sorted(course_new_mark, key=lambda x: x[1], reverse=True)[5:10])

и так далее

In [54]:
'''Are you satisfied? Print: Yes / No'''
df_by_course_title = pd.DataFrame()
m=0
n=5# задаём количество курсов для вывода
yes_no_answer = 'No'
df_by_course_title = pd.DataFrame()

    
while yes_no_answer == 'No' or yes_no_answer == 'no':
        # внутри цикла повторяем смещение выбранного слайса списка отсортированных курсов для вывода результата рекомендации, пока ответ не = 'да'
        # или пока не дойдём до конца списка отсортированных курсов
    df_by_course_title = pd.DataFrame()
    
    # выводим топ n курсов по средним оценкам и количеству рейтингов с названиями и ссылками на курсы
    print('I highly recommend: ')
    for k in dict_keys[m:n]:
        filter_course_id = data['course_id'] == k
        df_by_course_id = data.loc[filter_course_id][0:1]
        df_by_course_title = df_by_course_title.append(df_by_course_id[['title', 'url']])

    for i in range(0,n-m):
        a = df_by_course_title.iloc[i].values
        print(a[0], ' ', a[1])  
    
    print('---')
    print('Are you satisfied? Print: Yes / No')
    yes_no_answer = input()

    
    m+=5
    n+=5 # задаём смещение выбранного слайса списка отсортированных курсов
    

I highly recommend: 
Introduction to Programming with MATLAB   https://www.coursera.org/learn/matlab
Introduction to Game Development   https://www.coursera.org/learn/game-development
Project Management: The Basics for Success   https://www.coursera.org/learn/project-management-basics
Buddhism and Modern Psychology    https://www.coursera.org/learn/science-of-meditation
Algorithmic Toolbox   https://www.coursera.org/learn/algorithmic-toolbox
---
Are you satisfied? Print: Yes / No
no
I highly recommend: 
Mindshift: Break Through Obstacles to Learning and Discover Your Hidden Potential   https://www.coursera.org/learn/mindshift
Getting and Cleaning Data   https://www.coursera.org/learn/data-cleaning
Teach English Now! Theories of Second Language Acquisition   https://www.coursera.org/learn/language-theories
A Crash Course in Data Science   https://www.coursera.org/learn/data-science-course
Learn to Program: The Fundamentals   https://www.coursera.org/learn/learn-to-program
---
Are you sa

### Рекомендация курсов на основании запроса пользователя
- для нового пользователя
- по запросу рекомендации лучшего курса по подразделам наук/знаний (topics)

In [55]:
joined_users_reviews.head()

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url
0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it
1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it
2,2-speed-it,Viswas P,4,Videos that are presented in French could've b...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it
3,2-speed-it,AN L,3,"The course content is quite good, though it co...",Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it
4,2-speed-it,Konstantin A,5,"Great piece of work, I especially liked a few ...",Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it


In [166]:
#из списка наиболее популярных тем новый пользователь выбирает интересующую его область/подразделам наук/знаний (topics)
joined_users_reviews['topics'].value_counts().head(15)

Data Science Data Analysis                                                   15401
Arts and Humanities Music and Art                                            11052
Computer Science Software Development                                        10297
Business Leadership and Management                                            8847
Business Finance                                                              7702
Business Business Essentials                                                  7077
Computer Science Mobile and Web Development                                   6955
Language Learning Learning English                                            6172
Business Marketing                                                            5742
Business Business Strategy                                                    5618
Health Basic Science                                                          5398
Health Psychology                                                             4905
Phys

In [167]:
#либо вводит вручную 
#ZAPROS = input()  # считываем строку и кладём её в переменную ZAPROS
#например
ZAPROS = "health"

In [168]:
#выберем строки (отзывы), чье значение столбца 'topics' равно ZAPROS
data_Z = data[data.topics.str.contains(ZAPROS, case=False)]
data_Z = data_Z[data_Z.average_score>=4.5]

data_Z.head(3)

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url
383,abdomen-anatomy,Mujahid A M B,5,"Creat course, thank you",Anatomy of the Abdomen and Pelvis; a journey f...,Health Basic Science,In this anatomy course you will explore the or...,Marco De Ruiter PhD Paul Gobée MD Beerend P. H...,English,41565 already enrolled! 12090 recent views,100% online Flexible deadlines Approx. 33 hour...,4.7,92,92,,Introduction Mapping the abdomen and pelvis Tr...,cancer-metastasis cancer,https://www.coursera.org/learn/abdomen-anatomy
384,abdomen-anatomy,Antonia S P,5,The explanations are so clear! The images help...,Anatomy of the Abdomen and Pelvis; a journey f...,Health Basic Science,In this anatomy course you will explore the or...,Marco De Ruiter PhD Paul Gobée MD Beerend P. H...,English,41565 already enrolled! 12090 recent views,100% online Flexible deadlines Approx. 33 hour...,4.7,92,92,,Introduction Mapping the abdomen and pelvis Tr...,cancer-metastasis cancer,https://www.coursera.org/learn/abdomen-anatomy
385,abdomen-anatomy,Ivan E P C,5,Excellent!!! This course explain difficult and...,Anatomy of the Abdomen and Pelvis; a journey f...,Health Basic Science,In this anatomy course you will explore the or...,Marco De Ruiter PhD Paul Gobée MD Beerend P. H...,English,41565 already enrolled! 12090 recent views,100% online Flexible deadlines Approx. 33 hour...,4.7,92,92,,Introduction Mapping the abdomen and pelvis Tr...,cancer-metastasis cancer,https://www.coursera.org/learn/abdomen-anatomy


In [175]:
# возьмем топ курсов по средним оценкам 
top_courses_Z = data_Z.groupby('course_id')[['average_score']].max().sort_values('average_score', ascending=False)
top_courses_Z.head()
# получим простую базовую рекомендацию на основании выбора пользователем topics (области научного знания)

Unnamed: 0_level_0,average_score
course_id,Unnamed: 1_level_1
neurobiology,4.9
mountains-101,4.9
thoracic-oncology,4.9
theropods-birds,4.9
dino101,4.9


### Рекомендация курса пользователю на основании текстового запроса пользователя, введённого в свободной форме)
- Рекомендация на основании названия курса и текста описания курса

#### Токенизация и очистка данных
Сделаем токенизацию слов из текстов title и about

In [170]:
#пользователь вводит вручную текст запроса в свободной форме на английском языке
#text_ZAPROS = input()  # считываем строку и кладём её в переменную ZAPROS
#например
text_ZAPROS = "I want a course about ML))) and to read more!!!!! about neural networks"

In [171]:
import pymorphy2
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

In [172]:
class MorphProvider:
    def __init__(self):
        self.cache = {}
        self.morph = pymorphy2.MorphAnalyzer()
    
    def __call__(self, w):
        w = w.lower()
        cached = self.cache.get(w)
        if cached:
            return cached
        try:
            morphed = self.morph.parse(w)[0].normal_form
            self.cache[w] = morphed
            return morphed
        except:
            return None
    
    def morph_string(self, s):
        words = word_tokenize(s)
        return " ".join([self.__call__(w) for w in words])

In [173]:
morph = MorphProvider()

In [174]:
data["title_normalized"] = [morph.morph_string(t) for t in tqdm(data.title)]

100%|██████████| 159253/159253 [00:16<00:00, 9627.94it/s] 


In [26]:
#data["about_normalized"] = [morph.morph_string(t) for t in tqdm(data.about)]

In [27]:
data.head(2)

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url,title_normalized
0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,two speed it : how companies can surf the digi...
1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,English,16728 already enrolled! 5149 recent views,100% online Flexible deadlines Approx. 21 hour...,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,two speed it : how companies can surf the digi...


In [28]:
import re
regex = re.compile("['A-Za-z\-]+")

def words_only(text, regex=regex):
    try:
        return " ".join(regex.findall(text))
    except:
        return ""

In [29]:
#на всякий случай очистим от лишних смволов текст запроса
text=words_only(text_ZAPROS)
text

'I want a course about ML and to read more about neural networks'

In [30]:
morph.morph_string(text)
text

'I want a course about ML and to read more about neural networks'

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
проверить расстояние между текстом запроса и skills + topics

In [31]:
#выберем строки (отзывы), чье значение столбца 'topics' равно ZAPROS
#data_Z = data[data.about_normalized.str.contains(text_lemma, case=False)]
#data_Z.head(3)

### Рекомендация курсов по именам авторов/лекторов

In [32]:
#пользователь вводит вручную текст запроса - имя автора/лектора
#instructors_ZAPROS = input()  # считываем строку и кладём её в переменную ZAPROS
#например
instructors_ZAPROS = "Alex Mannella"

In [33]:
data['instructors'].value_counts().head(15)

Jeff Leek PhD Brian Caffo PhD Roger D. Peng PhD     1695
Kenny Wong                                          1665
Jeff Leek PhD Roger D. Peng PhD Brian Caffo PhD     1603
Colleen van Lent Ph.D. Charles Severance            1570
Dr. Shane Dixon Dr. Justin Shewell Jessica Cinco    1515
Ian Harris                                          1488
Alex Mannella                                       1462
Rob Stone PMP M.Ed.                                 1218
Charles Severance                                   1129
Randy Hlavac                                        1127
Nick Santos                                         1115
Rebekah May                                         1101
Roger D. Peng PhD Jeff Leek PhD Brian Caffo PhD     1100
Mine Çetinkaya-Rundel                               1069
Margaret  Meloni  MBA PMP                           1053
Name: instructors, dtype: int64

In [34]:
data["instructors"] = [morph.morph_string(t) for t in tqdm(data.instructors)]

100%|██████████| 159253/159253 [00:19<00:00, 8002.34it/s] 


In [35]:
data["instructors"].head()

0    antoine gourévitch vanessa lyon eric baudson
1    antoine gourévitch vanessa lyon eric baudson
2    antoine gourévitch vanessa lyon eric baudson
3    antoine gourévitch vanessa lyon eric baudson
4    antoine gourévitch vanessa lyon eric baudson
Name: instructors, dtype: object

In [36]:
#выберем строки (отзывы), чье значение столбца 'instructors' содержит введённое имя лектора
data_Z_instructors = data[data.instructors.str.contains(instructors_ZAPROS, case=False)]
data_Z_instructors = data_Z_instructors[data_Z_instructors.average_score>=4.5]

data_Z_instructors.head(3)

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url,title_normalized
2414,advanced-excel,Utsav S,5,Nicely planned course with good practice cases...,Data Visualization with Advanced Excel,Data Science Data Analysis,"In this course, you will get hands-on instruct...",alex mannella,English,45710 already enrolled! 55252 recent views,Course 3 of 5 in the 100% online Flexible dead...,4.8,232,232,Dashboard (Business) Microsoft Excel Data Virt...,Preparing a Professional Excel Advanced Scena...,dataviz-visual-analytics dataviz-design,https://www.coursera.org/learn/advanced-excel,data visualization with advanced excel
2415,advanced-excel,prashant s,5,course is very nice plese give the link for ce...,Data Visualization with Advanced Excel,Data Science Data Analysis,"In this course, you will get hands-on instruct...",alex mannella,English,45710 already enrolled! 55252 recent views,Course 3 of 5 in the 100% online Flexible dead...,4.8,232,232,Dashboard (Business) Microsoft Excel Data Virt...,Preparing a Professional Excel Advanced Scena...,dataviz-visual-analytics dataviz-design,https://www.coursera.org/learn/advanced-excel,data visualization with advanced excel
2416,advanced-excel,Inal K,5,Dashboards are extremely useful and effective ...,Data Visualization with Advanced Excel,Data Science Data Analysis,"In this course, you will get hands-on instruct...",alex mannella,English,45710 already enrolled! 55252 recent views,Course 3 of 5 in the 100% online Flexible dead...,4.8,232,232,Dashboard (Business) Microsoft Excel Data Virt...,Preparing a Professional Excel Advanced Scena...,dataviz-visual-analytics dataviz-design,https://www.coursera.org/learn/advanced-excel,data visualization with advanced excel


In [178]:
# возьмем топ курсов по оценкам 
top_courses_Z_instructors = data_Z_instructors.groupby('course_id')[['average_score']].max().sort_values(
                                                                                        'average_score', ascending=False)
top_courses_Z_instructors.head()
# получим простую базовую рекомендацию на основании выбора пользователем instructors (имя автора/лектора)

Unnamed: 0_level_0,average_score
course_id,Unnamed: 1_level_1
advanced-excel,4.8
excel-analysis,4.7
decision-making,4.6
powerpoint-presentations,4.5


### Рекомендация курса пользователю на основании интересов пользователя (ранее оценённых им курсов)
предсказываем оценку, которую поставил бы пользователь не просмотренному им курсу на основании регрессии, затем выбираем курс с вероятной наивысшей оценкой
- на фичах: TF-IDF на topics & skills

- средние оценки (+ median, variance, etc.) пользователя

In [38]:
data_t_s = data[['course_id', 'reviewer_name', 'rating', 'topics', 
                 'average_score', 'ratings_count', 'reviews_count', 'skills',]]
data_t_s.head(100)

Unnamed: 0,course_id,reviewer_name,rating,topics,average_score,ratings_count,reviews_count,skills
0,2-speed-it,Ravish,5,Business Business Essentials,4.4,33,33,
1,2-speed-it,Etienne R,2,Business Business Essentials,4.4,33,33,
2,2-speed-it,Viswas P,4,Business Business Essentials,4.4,33,33,
3,2-speed-it,AN L,3,Business Business Essentials,4.4,33,33,
4,2-speed-it,Konstantin A,5,Business Business Essentials,4.4,33,33,
5,2-speed-it,Eduardo F,5,Business Business Essentials,4.4,33,33,
6,2-speed-it,Daniel D,4,Business Business Essentials,4.4,33,33,
7,2-speed-it,Rémy C,5,Business Business Essentials,4.4,33,33,
8,2-speed-it,Krishna K,4,Business Business Essentials,4.4,33,33,
9,2-speed-it,Alexandre d M L,4,Business Business Essentials,4.4,33,33,


In [39]:
data_t_s['key_words'] = data_t_s.topics.map(str) + ' ' + data_t_s.skills
data_t_s.head()

Unnamed: 0,course_id,reviewer_name,rating,topics,average_score,ratings_count,reviews_count,skills,key_words
0,2-speed-it,Ravish,5,Business Business Essentials,4.4,33,33,,Business Business Essentials
1,2-speed-it,Etienne R,2,Business Business Essentials,4.4,33,33,,Business Business Essentials
2,2-speed-it,Viswas P,4,Business Business Essentials,4.4,33,33,,Business Business Essentials
3,2-speed-it,AN L,3,Business Business Essentials,4.4,33,33,,Business Business Essentials
4,2-speed-it,Konstantin A,5,Business Business Essentials,4.4,33,33,,Business Business Essentials


In [40]:
#data_t_s = data_t_s.loc[(data_t_s['skills'] != 'NaN') & (data_t_s['skills'] != '')]
data_t_s.tail(1000)

Unnamed: 0,course_id,reviewer_name,rating,topics,average_score,ratings_count,reviews_count,skills,key_words
158253,women-in-leadership,Яна Ч,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158254,women-in-leadership,MAGALI A S,3,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158255,women-in-leadership,Ted B,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158256,women-in-leadership,Shelina R,4,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158257,women-in-leadership,Lilija M,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158258,women-in-leadership,Giada B,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158259,women-in-leadership,Roopali S,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158260,women-in-leadership,Angélica T,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158261,women-in-leadership,Merrill C,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...
158262,women-in-leadership,Claude D,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...


In [41]:
data_t_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159253 entries, 0 to 159252
Data columns (total 9 columns):
course_id        159253 non-null object
reviewer_name    159253 non-null object
rating           159253 non-null int64
topics           159253 non-null object
average_score    159253 non-null float64
ratings_count    159253 non-null int64
reviews_count    159253 non-null int64
skills           159253 non-null object
key_words        159253 non-null object
dtypes: float64(1), int64(3), object(5)
memory usage: 10.9+ MB


#### Токенизация и очистка данных
Сделаем токенизацию слов из текстов topics и skills

In [42]:
mystopwords = stopwords.words('english') + ["i'm", '-', "i've"] 
regex = re.compile("['A-Za-z\-]+")

def tokenize(text, regex=regex, stopwords=mystopwords):
    """ Tokenize all tokens from text string
        Returns array of tokens
    """
    try:
        text = " ".join(regex.findall(text)).lower()
        tokens = ' '.join([token for token in text.split(' ') if not token in stopwords])
        return tokens
    except:
        return []

In [43]:
data_t_s['key_words_tokenize'] = data_t_s.key_words.apply(tokenize)
data_t_s.tail(1000)

Unnamed: 0,course_id,reviewer_name,rating,topics,average_score,ratings_count,reviews_count,skills,key_words,key_words_tokenize
158253,women-in-leadership,Яна Ч,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158254,women-in-leadership,MAGALI A S,3,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158255,women-in-leadership,Ted B,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158256,women-in-leadership,Shelina R,4,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158257,women-in-leadership,Lilija M,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158258,women-in-leadership,Giada B,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158259,women-in-leadership,Roopali S,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158260,women-in-leadership,Angélica T,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158261,women-in-leadership,Merrill C,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...
158262,women-in-leadership,Claude D,5,Business Leadership and Management,4.6,37,37,Assertiveness Communication Negotiation Leader...,Business Leadership and Management Assertivene...,business leadership management assertiveness c...


In [44]:
data_t_s_1 = data_t_s.head(1000)
data_t_s_tok = data_t_s_1
data_t_s_tok['key_words_tokenize']

0                           business business essentials
1                           business business essentials
2                           business business essentials
3                           business business essentials
4                           business business essentials
5                           business business essentials
6                           business business essentials
7                           business business essentials
8                           business business essentials
9                           business business essentials
10                          business business essentials
11                          business business essentials
12                          business business essentials
13                          business business essentials
14                          business business essentials
15                          business business essentials
16                          business business essentials
17                          bus

In [45]:
key_words = []
for i in tqdm_notebook(data_t_s_tok.key_words_tokenize.str.split(' ')):
        for j in i :
            key_words.append(j)

dict_key_words_idf = {i:np.log(len(data_t_s_tok)/key_words.count(i)) for i in key_words}
dict_key_words_idf

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




{'business': 0.5481814103097596,
 'essentials': 3.4420193761824107,
 'entrepreneurship': 1.0469690555162712,
 'materials': 1.5847452998437288,
 'product': 0.8915981192837835,
 'development': 0.6891551592904078,
 'new': 1.5847452998437288,
 'human': 1.5847452998437288,
 'computer': 1.5847452998437288,
 'interaction': 1.5847452998437288,
 'health': 2.4304184645039304,
 'basic': 2.4304184645039304,
 'science': 2.4304184645039304,
 'arts': 1.4354846053106625,
 'humanities': 1.4354846053106625,
 'music': 0.7423374247507171,
 'art': 1.4354846053106625,
 'audio': 1.4354846053106625,
 'recording': 1.4354846053106625,
 'file': 1.4354846053106625,
 'management': 0.9137938516755679,
 'mixing': 1.4354846053106625,
 'abelton': 1.4354846053106625,
 'live': 1.4354846053106625,
 'midi': 1.4354846053106625,
 'programming': 1.4354846053106625,
 'social': 3.4420193761824107,
 'sciences': 3.4420193761824107,
 'education': 3.4420193761824107,
 'language': 2.995732273553991,
 'learning': 2.302585092994046,


In [46]:
len(sorted(dict_key_words_idf.items(), key=lambda kv: kv[1]) )

37

In [47]:
for i in dict_key_words_idf:
    data_t_s_tok['tf_idf_'+i] = data_t_s_tok.apply(lambda row: 
                                   (1/len(row['key_words'].split(' ')))*dict_key_words_idf[i]
                                   if i in row['key_words'] else 0, axis=1)

In [48]:
data_t_s_tok.head()

Unnamed: 0,course_id,reviewer_name,rating,topics,average_score,ratings_count,reviews_count,skills,key_words,key_words_tokenize,...,tf_idf_sciences,tf_idf_education,tf_idf_language,tf_idf_learning,tf_idf_english,tf_idf_personal,tf_idf_finance,tf_idf_accounting,tf_idf_analytics,tf_idf_earnings
0,2-speed-it,Ravish,5,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,0
1,2-speed-it,Etienne R,2,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,0
2,2-speed-it,Viswas P,4,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,0
3,2-speed-it,AN L,3,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,0
4,2-speed-it,Konstantin A,5,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,0


In [49]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [50]:
label_encoder = LabelEncoder()
data_t_s_1['new_course_id'] = pd.Series(label_encoder.fit_transform(data_t_s_tok['course_id']))
data_t_s_tok['new_course_id'] = pd.Series(label_encoder.fit_transform(data_t_s_tok['course_id']))
data_t_s_tok['new_course_id'].value_counts()

5     238
2     205
11    163
4      88
1      88
3      58
10     46
6      32
0      32
8      27
9      17
7       6
Name: new_course_id, dtype: int64

In [51]:
data_t_s_tok['reviewer_id'] = pd.Series(label_encoder.fit_transform(data_t_s_tok['reviewer_name']))
data_t_s_tok['reviewer_id'].value_counts().head()

691    3
911    3
174    3
729    3
101    3
Name: reviewer_id, dtype: int64

In [52]:
data_t_s_tok

Unnamed: 0,course_id,reviewer_name,rating,topics,average_score,ratings_count,reviews_count,skills,key_words,key_words_tokenize,...,tf_idf_language,tf_idf_learning,tf_idf_english,tf_idf_personal,tf_idf_finance,tf_idf_accounting,tf_idf_analytics,tf_idf_earnings,new_course_id,reviewer_id
0,2-speed-it,Ravish,5,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,684
1,2-speed-it,Etienne R,2,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,237
2,2-speed-it,Viswas P,4,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,823
3,2-speed-it,AN L,3,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,5
4,2-speed-it,Konstantin A,5,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,456
5,2-speed-it,Eduardo F,5,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,213
6,2-speed-it,Daniel D,4,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,172
7,2-speed-it,Rémy C,5,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,705
8,2-speed-it,Krishna K,4,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,458
9,2-speed-it,Alexandre d M L,4,Business Business Essentials,4.4,33,33,,Business Business Essentials,business business essentials,...,0,0,0,0,0,0,0,0,0,48


In [53]:
data_t_s_tok = data_t_s_tok.drop(['course_id'], axis=1)
data_t_s_tok = data_t_s_tok.drop(['reviewer_name'], axis=1)
data_t_s_tok = data_t_s_tok.drop(['key_words'], axis=1)
data_t_s_tok = data_t_s_tok.drop(['key_words_tokenize'], axis=1)
data_t_s_tok = data_t_s_tok.drop(['topics'], axis=1)
data_t_s_tok = data_t_s_tok.drop(['skills'], axis=1)

In [54]:
data_t_s_tok.tail(50)

Unnamed: 0,rating,average_score,ratings_count,reviews_count,tf_idf_business,tf_idf_essentials,tf_idf_entrepreneurship,tf_idf_materials,tf_idf_product,tf_idf_development,...,tf_idf_language,tf_idf_learning,tf_idf_english,tf_idf_personal,tf_idf_finance,tf_idf_accounting,tf_idf_analytics,tf_idf_earnings,new_course_id,reviewer_id
950,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,887
951,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,12
952,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,713
953,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,760
954,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,711
955,4,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,644
956,4,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,548
957,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,770
958,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,469
959,5,4.5,315,315,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,575


In [55]:
#разделим обучающую и тестовую выборки
from sklearn.model_selection import train_test_split

In [56]:
X = data_t_s_tok.drop(columns=['rating', 'average_score'])
y = data_t_s_tok['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [57]:
result = pd.DataFrame(y_test.reset_index(drop=True), columns=['target'])

## Линейная регрессия для предсказания оценки пользователя
Используем линейную регрессию в качестве бейзлайна, чтобы предсказать переменную - оценку пользователей (rating)

In [58]:
from sklearn.linear_model import LinearRegression # метод наименьших квадратов
from sklearn.metrics import mean_squared_log_error, mean_squared_error

In [59]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [60]:
predictions = model.predict(X_test)

In [61]:
mean_squared_error(predictions, y_test)

0.7711057362460693

In [62]:
X_test['predictions'] = predictions

In [63]:
X_test['LinearRegression_predictions'] = X_test['predictions']
models_res = X_test['LinearRegression_predictions']
result['LinearRegression'] = models_res #для визуализации

In [64]:
results = X_test.merge(data_t_s_1, how='left', on='new_course_id')[
                                                        [ 'reviewer_name' ,'course_id', 'predictions', 'rating',]]
results.sort_values('predictions', ascending=False).head()

Unnamed: 0,reviewer_name,course_id,predictions,rating
28825,Thomas,ableton-live,4.795733,4
28807,Thomas J,ableton-live,4.795733,5
28795,Graham M,ableton-live,4.795733,5
28796,Alberto B,ableton-live,4.795733,5
28797,Gonza M,ableton-live,4.795733,5


#### Предскажем оценки пользователей используя RandomForestRegressor

In [65]:
from sklearn.ensemble import RandomForestRegressor

In [66]:
X = data_t_s_tok.drop(columns=['rating', 'average_score'])
y = data_t_s_tok['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [67]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [68]:
predictions = model.predict(X_test)
X_test['RandomForestRegressor_predictions'] = predictions
models_res.RandomForestRegressor_predictions = X_test['RandomForestRegressor_predictions']

print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, predictions)))

root_mean_squared_error =  0.9202480543209431


In [69]:
print(model.feature_importances_)

[0.03833874 0.03579585 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.00643615
 0.         0.         0.         0.00203631 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.07570934 0.84168361]


#### Попробуем применить к исходным key_words CountVectorizer и TfidfTransformer

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [71]:
cv_data_t_s_tok = CountVectorizer()
tf_data_t_s_tok = TfidfTransformer()

In [72]:
data_t_s_tok_ = cv_data_t_s_tok.fit_transform(data_t_s_1.key_words_tokenize)

In [73]:
cv_data_t_s_tok.get_feature_names()

['abelton',
 'accounting',
 'analytics',
 'art',
 'arts',
 'audio',
 'basic',
 'business',
 'computer',
 'development',
 'earnings',
 'education',
 'english',
 'entrepreneurship',
 'essentials',
 'file',
 'finance',
 'health',
 'human',
 'humanities',
 'interaction',
 'language',
 'learning',
 'live',
 'management',
 'materials',
 'midi',
 'mixing',
 'music',
 'new',
 'personal',
 'product',
 'programming',
 'recording',
 'science',
 'sciences',
 'social']

In [74]:
len(cv_data_t_s_tok.get_feature_names())

37

In [75]:
tfidf_data_t_s_tok= tf_data_t_s_tok.fit_transform(data_t_s_tok_)

In [76]:
tfidf_data_t_s_tok

<1000x37 sparse matrix of type '<class 'numpy.float64'>'
	with 6875 stored elements in Compressed Sparse Row format>

In [77]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_data_t_s_tok, y, test_size=0.3,random_state=42)

In [78]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [79]:
print(model.feature_importances_)

[0.00000000e+00 1.66919854e-03 1.60302690e-02 9.37147062e-04
 1.16818027e-02 2.02054117e-04 5.74564863e-05 6.90237667e-01
 0.00000000e+00 2.40992667e-02 4.35343264e-04 7.94804685e-02
 3.03449272e-03 6.09582629e-02 1.82121451e-02 3.96906843e-04
 1.00300725e-02 4.88767495e-03 7.48451378e-04 0.00000000e+00
 0.00000000e+00 1.45604140e-02 1.12151676e-02 0.00000000e+00
 1.16719192e-04 7.41916010e-05 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.38798547e-03 1.37138220e-02 0.00000000e+00
 0.00000000e+00 6.27895130e-04 5.47831274e-03 1.49617731e-02
 1.47650397e-02]


In [80]:
predictions1 = model.predict(X_test)
#X_test['RandomForestRegressor'] = model.predict(X_test)
#models_res.RandomForestRegressor1 = X_test['RandomForestRegressor1']
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, predictions1)))

root_mean_squared_error =  0.8320153589129327


#### Попробуем применить GridSearch к моделям

In [81]:
from sklearn.model_selection import GridSearchCV

In [82]:
%%time
lr_params = {
    'fit_intercept':[False, True]
}

lr = LinearRegression()
grid_lr = GridSearchCV(lr, lr_params,
                       scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_lr.fit(X_train, y_train)

print(grid_lr.best_params_)
print(grid_lr.best_score_)
print(grid_lr.best_estimator_)

{'fit_intercept': True}
-0.6704581691859347
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
CPU times: user 46.7 ms, sys: 61.4 ms, total: 108 ms
Wall time: 3.93 s


In [83]:
#Оценим тчоностьна тесте
grid_lr.score(X_test,y_test)

-0.6949992592228276

In [84]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_lr.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.8336661557379114


In [85]:
np.sqrt(-grid_lr.score(X_test,y_test))

0.8336661557379114

#### Предскажем оценки пользователей используя KNeighborsRegressor

In [86]:
from sklearn.model_selection import RandomizedSearchCV

In [87]:
RandomizedSearchCV

sklearn.model_selection._search.RandomizedSearchCV

In [88]:
from sklearn.neighbors import KNeighborsRegressor

In [89]:
%%time

knn_params = {
    'n_neighbors':list(range(1, 30))
   ,'weights': ['uniform', 'distance']
   ,'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
   ,'leaf_size':list(range(5, 30))
}

grid_knn = RandomizedSearchCV(KNeighborsRegressor(), knn_params,  scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_knn.fit(X_train, y_train)
print(grid_knn.best_params_)
print(grid_knn.best_score_)
print(grid_knn.best_estimator_)

{'weights': 'distance', 'n_neighbors': 25, 'leaf_size': 20, 'algorithm': 'ball_tree'}
-0.6829959086213447
KNeighborsRegressor(algorithm='ball_tree', leaf_size=20, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=25, p=2,
          weights='distance')
CPU times: user 52.6 ms, sys: 6.42 ms, total: 59 ms
Wall time: 335 ms




In [90]:
grid_knn.score(X_test,y_test)

-0.691242279964622

In [91]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_knn.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.8314098146910596


#### Предскажем оценки пользователей используя DecisionTreeRegressor

In [92]:
from sklearn.tree import DecisionTreeRegressor

In [93]:
%%time
dt_params = {
    'max_depth':[None,1,2,5,10,25,50],
     'min_samples_split':[2,5,8,10,25,50],
    
    'min_weight_fraction_leaf': [0, 0.01, 0.1, 0.15, 0.25, 0.5] ,
    'min_samples_leaf':list(range(1, 10)),
     'criterion':  ['mse', 'friedman_mse', 'mae'],
    'max_features':list(range(1, 13)) }
#model_forest = RandomForestRegressor(
    

grid_dt =  RandomizedSearchCV(DecisionTreeRegressor(),dt_params,  scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_dt.fit(X_train, y_train)
print(grid_dt.best_params_)
print(grid_dt.best_score_)
print(grid_dt.best_estimator_)

{'min_weight_fraction_leaf': 0.1, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 10, 'max_depth': 5, 'criterion': 'friedman_mse'}
-0.665238211666703
DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, max_features=10,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=4,
           min_samples_split=10, min_weight_fraction_leaf=0.1,
           presort=False, random_state=None, splitter='best')
CPU times: user 38.2 ms, sys: 2.81 ms, total: 41 ms
Wall time: 227 ms


In [94]:
grid_dt.score(X_test,y_test)

-0.6925337264640157

In [95]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_dt.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.8321861128762097


#### Предскажем оценки пользователей используя RandomForestRegressor

In [96]:
%%time
rf_params = {
    'n_estimators': [1,10,20,30,40,50,60,80,90],
    
    'max_depth':[None,1,2,5,10,25,50],
    
    'min_samples_leaf':list(range(1, 10)),
   
    'max_features':list(range(1, 13)) ,
     'criterion':  ['mse', 'friedman_mse', 'mae']}
    

grid_rf =  RandomizedSearchCV(RandomForestRegressor(),rf_params,scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_rf.fit(X_train, y_train)
print(grid_rf.best_params_)
print(grid_rf.best_score_)
print(grid_rf.best_estimator_)

{'n_estimators': 30, 'min_samples_leaf': 9, 'max_features': 6, 'max_depth': 2, 'criterion': 'mse'}
-0.6637947498018606
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=9,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
CPU times: user 114 ms, sys: 8.12 ms, total: 122 ms
Wall time: 1.81 s


In [97]:
grid_rf.score(X_test,y_test)

-0.6936030491545594

In [98]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_rf.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.8328283431503514


#### Предскажем оценки пользователей используя GradientBoostingRegressor

In [99]:
%%time
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
gb_params = {
          'n_estimators': [1,10,20,30,40,50,60,80,90],
          'max_features': list(range(1, 13)),
             'max_depth':[None,1,2,5,10,25,50],
            'learning_rate': [0.1,0.3,0.5,0.7],
               #'min_samples_split':[2,5,8,10,25,50],
            'min_samples_leaf':list(range(1, 10)),}
    
grid_gb =  RandomizedSearchCV(GradientBoostingRegressor(),gb_params,scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_gb.fit(X_train, y_train)
print(grid_gb.best_params_)
print(grid_gb.best_score_)
print(grid_gb.best_estimator_)

{'n_estimators': 60, 'min_samples_leaf': 4, 'max_features': 7, 'max_depth': 1, 'learning_rate': 0.1}
-0.6668181210906947
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=1, max_features=7,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=4,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=60, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
CPU times: user 66.8 ms, sys: 3.65 ms, total: 70.4 ms
Wall time: 597 ms


In [100]:
grid_gb.score(X_test,y_test)

-0.6920897760177771

In [101]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_gb.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.831919332638554


#### Предскажем оценки пользователей используя SVR

In [102]:
from sklearn.svm import SVR

In [103]:
%%time
SVR_params = {
          #'kernel':['linear', 'poly','rbf', 'sigmoid', 'precomputed'],
             'C' :[0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]}

grid_SVR =   GridSearchCV(SVR(),SVR_params,scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_SVR.fit(X_train, y_train)
print(grid_SVR.best_params_)
print(grid_SVR.best_score_)
print(grid_SVR.best_estimator_)

{'C': 0.001, 'gamma': 0.1}
-0.7482732832688053
SVR(C=0.001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
CPU times: user 65.1 ms, sys: 4.26 ms, total: 69.3 ms
Wall time: 610 ms


In [104]:
grid_SVR.score(X_test,y_test)

-0.7806339780934861

In [105]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_SVR.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.8835349331483652


In [106]:
models=['SVR','gb','dt','rf','knn','lr']

In [107]:
for model_ in models:
    print(model_,'---','root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, eval(
                                                        'grid_'+model_+'.best_estimator_').predict(X_test))))


SVR --- root_mean_squared_error =  0.8835349331483652
gb --- root_mean_squared_error =  0.831919332638554
dt --- root_mean_squared_error =  0.8321861128762097
rf --- root_mean_squared_error =  0.8328283431503514
knn --- root_mean_squared_error =  0.8314098146910596
lr --- root_mean_squared_error =  0.8336661557379114


In [108]:
#видим, что нилучший результат у модели SVR

In [109]:
from plotly.offline import init_notebook_mode, iplot
import plotly
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [110]:
# Визуализируем результаты обучения
"""
columns = result.columns

traces=[]
for i in columns:
    traces.append(go.Scatter(
                    x=result.index,
                    y=result[i],
                    name=i,
                    orientation = 'v')
                 )

layout = {'title': 'Models result'}
fig = go.Figure(data=traces, layout=layout)

iplot(fig, show_link=False)
"""

"\ncolumns = result.columns\n\ntraces=[]\nfor i in columns:\n    traces.append(go.Scatter(\n                    x=result.index,\n                    y=result[i],\n                    name=i,\n                    orientation = 'v')\n                 )\n\nlayout = {'title': 'Models result'}\nfig = go.Figure(data=traces, layout=layout)\n\niplot(fig, show_link=False)\n"

### Рекомендация курсов на основании содержания текста review

In [111]:
# Три варианта положительных комментариев
for i, text in data[data.rating==5].head(3).iterrows():
    print("Good comment: \n {0} \n".format(text['review_text']))

Good comment: 
 Very relevant and useful course designed for CIOs 

Good comment: 
 Great piece of work, I especially liked a few 'lifehacks' for the CIO 

Good comment: 
 Excellent course, for me it was very rewarding and the terms used and the tools given were excellent, and today and I put in use in my job, Thank you for inculcating knowledge and move on               



In [112]:
# Три варианта отрицательных(негативных) комментариев
for i, text in data[data.rating==1].head(3).iterrows():
    print("Bad comment: \n {0} \n".format(text['review_text']))

Bad comment: 
 Till now no assigment for my work on week 4. 

Bad comment: 
 This course doesn't contain any new information. It does not teach you but just excitedly shows commonly known facts.There are better ways to invest your time. 

Bad comment: 
 I do not find very interesting this course. too many interviews. It could works for the first course, but not for the second. I was expecting to have more technical material and lessons. 



### Токенизация и очистка данных
Сделаем токенизацию слов из текстов review

In [113]:
mystopwords = stopwords.words('english') + ["i'm", '-', "i've"] + ["\\", "\"", "'", "\'"]
regex = re.compile("['A-Za-z\-]+")

def tokenize(text, regex=regex, stopwords=mystopwords):
    """ Tokenize all tokens from text string
        Returns array of tokens
    """
    try:
        text = " ".join(regex.findall(text)).lower()
        tokens = ' '.join([token for token in text.split(' ') if not token in stopwords])
        return tokens
    except:
        return []

In [114]:
data['review_text_tokenize'] = data.review_text.apply(tokenize)

In [115]:
data['review_text_tokenize']

0                      relevant useful course designed cios
1         course say anything digitization core subject ...
2         videos presented french could've translated en...
3         course content quite good though could deeper ...
4         great piece work especially liked 'lifehacks' cio
5         excellent course rewarding terms used tools gi...
6         excellent representation day day thanks sharin...
7                            interesting well-designed mooc
8         completion course progress well reviews taking...
9         nice course macro ideias several areas pretty ...
10        really liked presentation slides really clear ...
11        un cours vraiment int ressant qui fait chos de...
12        expectation course huge many people told cours...
13        course really helpful understanding strategy o...
14        excellent course really learned lot role chall...
15        insightful course transformations backed solid...
16                                      

### Определим частоту слов построим облако слов для того чтобы понять о чем большинство текстов

In [116]:
from collections import Counter

lemmata = []
for index, row in data.iterrows():
    lemmata += row['review_text_tokenize'].split()
cnt = Counter(lemmata)

for i in cnt.most_common(15):
    print(i)

AttributeError: 'list' object has no attribute 'split'

In [121]:
# Количество слов в словаре:
print(len(cnt))

NameError: name 'cnt' is not defined

In [122]:
from wordcloud import *
word_freq = [i for i in cnt.most_common(100)]
wd = WordCloud(background_color = 'white')
wd.generate_from_frequencies(dict(word_freq))
plt.figure()
plt.imshow(wd, interpolation = 'bilinear')
plt.axis('off')
plt.show()

NameError: name 'cnt' is not defined

### Сформируем сбалансированный датасет c обучающей и тестовой выборкой

In [123]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

X = data[['review_text_tokenize']]
y = data['rating']

X_balanced, y_balanced = rus.fit_resample(X, y)

In [124]:
balanced = pd.DataFrame.from_dict({'review_text_tokenize': X_balanced[:,0], 'rating': y_balanced}) 
balanced.head()

Unnamed: 0,review_text_tokenize,rating
0,material extremely fragmented seems like instr...,1
1,poor design presentation assignment,1
2,totally unsufficient guidance external tools n...,1
3,find course added anything already learned see...,1
4,pesimo,1


In [125]:
# Разделим на обучающую и тестовую выборку
train, test = train_test_split(balanced, test_size=0.2, random_state=42)

### Построим векторную модель с помощью Doc2Vec

для предсказания пользовательских оценок

In [126]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [127]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['review_text_tokenize'].split(' '), tags=[r.rating]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['review_text_tokenize'].split(' '), tags=[r.rating]), axis=1)

In [128]:
train_tagged.values[30]

TaggedDocument(words=['interesting', 'course', 'way', 'many', 'quizzes', 'extremely', 'tedious', 'would', 'recommend', 'course', 'anyone'], tags=[2])

In [129]:
model_dbow = Doc2Vec(dm=0, vector_size=2000, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 13028/13028 [00:00<00:00, 1794528.49it/s]


In [130]:
# Обучаем модель
for epoch in range(10):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 13028/13028 [00:00<00:00, 1769597.22it/s]
100%|██████████| 13028/13028 [00:00<00:00, 1416402.51it/s]
100%|██████████| 13028/13028 [00:00<00:00, 1542727.06it/s]
100%|██████████| 13028/13028 [00:00<00:00, 2944148.30it/s]
100%|██████████| 13028/13028 [00:00<00:00, 3329477.97it/s]
100%|██████████| 13028/13028 [00:00<00:00, 3473391.34it/s]
100%|██████████| 13028/13028 [00:00<00:00, 3156023.59it/s]
100%|██████████| 13028/13028 [00:00<00:00, 3013643.97it/s]
100%|██████████| 13028/13028 [00:00<00:00, 3363084.23it/s]
100%|██████████| 13028/13028 [00:00<00:00, 3412226.33it/s]


In [131]:
# Сформируем итоговый набор векторов для обучения
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [132]:
## Логистическая регрессия

In [133]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [134]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [135]:
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.39146453791832975
Testing F1 score: 0.38899449316649926


In [1]:
mean_squared_error(y_pred, y_test)

NameError: name 'mean_squared_error' is not defined