## Рекомендательная система для подбора образовательных курсов



Задача: рекомендовать пользователю наилучший образовательный курс по его запросу и предпочтениям
Идея: сравнивать образовательные курсы так же как сравнивают фильмы - по оценкам пользователей, жанрам (областям научного знания)

## Импорт библиотек

In [1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from tqdm import tqdm_notebook

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score

import seaborn as sns

import multiprocessing

import re

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

%matplotlib inline

In [2]:
#Загрузка данных
data = pd.read_csv('courses_eng_1.csv')
data=data.drop(['Unnamed: 0'], axis=1)
data['skills'] = data.skills.fillna(value = '')
data.head()

Unnamed: 0,course_id,title,topics,about,instructors,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range
0,2-speed-it,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,16728,5149,324.9,21.0,0.0
1,3d-printing-applications,3D Printing Applications,Business Entrepreneurship,This course will help you understand how 3D pr...,Vishal Sachdev,4.5,92,92,,Course Orientation Module 1: 3D Printing – A N...,mechanics2 beam-bending,https://www.coursera.org/learn/3d-printing-app...,11308,8209,137.8,17.0,0.1
2,3d-printing-revolution,The 3D Printing Revolution,Business Entrepreneurship,This course will demonstrate how 3D printers w...,Aric Rindfleisch,4.7,221,221,Materials Product Development New Product Deve...,Course Orientation Module 1: What Is 3D Printi...,mechanics2 beam-bending,https://www.coursera.org/learn/3d-printing-rev...,20657,10843,190.5,12.0,0.1
3,3d-printing-software,3D Printing Software,Business Entrepreneurship,This course will demonstrate how to use 3D pri...,Jeffrey Smith,4.5,59,59,,Course Orientation Module 1: Design Sketching ...,mechanics2 beam-bending,https://www.coursera.org/learn/3d-printing-sof...,13266,20828,63.7,23.0,0.1
4,abdomen-anatomy,Anatomy of the Abdomen and Pelvis; a journey f...,Health Basic Science,In this anatomy course you will explore the or...,Marco De Ruiter PhD Paul Gobée MD Beerend P. H...,4.7,92,92,,Introduction Mapping the abdomen and pelvis Tr...,cancer-metastasis cancer,https://www.coursera.org/learn/abdomen-anatomy,41565,12090,343.8,33.0,0.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 17 columns):
course_id                  1299 non-null object
title                      1299 non-null object
topics                     1299 non-null object
about                      1299 non-null object
instructors                1299 non-null object
average_score              1299 non-null float64
ratings_count              1299 non-null int64
reviews_count              1299 non-null int64
skills                     1299 non-null object
syllabus                   1299 non-null object
recommendations            1259 non-null object
url                        1299 non-null object
already_enrolled           1299 non-null int64
recent_views               1299 non-null int64
recent_views_conversion    1299 non-null float64
hours_to_complete          1298 non-null float64
level_range                1299 non-null float64
dtypes: float64(4), int64(4), object(9)
memory usage: 172.6+ KB


### Рекомендация курса пользователю на основании интересов пользователя (ранее оценённых им курсов)
предсказываем оценку, которую поставил бы пользователь не просмотренному им курсу на основании регрессии, затем выбираем курс с вероятной наивысшей оценкой
- на фичах: TF-IDF на topics & skills

- средние оценки (+ median, variance, etc.) пользователя

In [4]:
#объединим topics & skills в новую фичу key_words
data['key_words'] = data.topics.map(str) + ' ' + data.skills
data.tail(100)

Unnamed: 0,course_id,title,topics,about,instructors,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range,key_words
1199,trading-algorithm,Trading Algorithms,Business Finance,This course covers two of the seven trading st...,Prasanna Tantri,4.6,52,52,Trading Strategy Post-Earnings-Announcement Dr...,Module 1 - Introduction to Trading Strategies ...,biases-portfolio-selection portfolio-selection...,https://www.coursera.org/learn/trading-algorithm,15850,14353,110.4,7.0,0.5,Business Finance Trading Strategy Post-Earning...
1200,trading-basics,Trading Basics,Business Finance,The purpose of this course is to equip you wit...,Ramabhadran Thirumalai,4.1,105,105,Financial Ratio Cost Financial Statement Trading,Basics of Financial Statements Financial State...,biases-portfolio-selection portfolio-selection...,https://www.coursera.org/learn/trading-basics,31639,34043,92.9,13.0,0.1,Business Finance Financial Ratio Cost Financia...
1201,transmedia-storytelling,"Transmedia Storytelling: Narrative worlds, eme...",Arts and Humanities Music and Art,Transmedia storytelling is the practice of des...,Associate Professor Simon McIntyre Dr Ollie Bo...,4.8,77,77,,What is Transmedia Storytelling? Creative Idea...,craft-of-setting-and-description craft-of-char...,https://www.coursera.org/learn/transmedia-stor...,26301,33255,79.1,25.0,0.0,Arts and Humanities Music and Art
1202,transreality-gaming,iOS Project: Transreality Game,Computer Science Mobile and Web Development,Students will produce a networked game that wi...,Don Patterson Sam Kaufman,0.0,0,0,,The Project and the Plan Basic Scaffold Implem...,,https://www.coursera.org/learn/transreality-ga...,0,0,0.0,21.0,0.0,Computer Science Mobile and Web Development
1203,tricky-american-english-pronunciation,Tricky American English Pronunciation,Language Learning Learning English,"In this course, you’ll practice the sounds of ...",Tamy Chapman Marla Yoshida Brad Gilpin,4.7,114,114,,Welcome Tricky Consonant Sounds Tricky Vowel S...,conjunctions-connectives-adverb-clauses noun-c...,https://www.coursera.org/learn/tricky-american...,54548,58630,93.0,19.0,0.1,Language Learning Learning English
1204,tricky-english-grammar,Tricky English Grammar,Language Learning Learning English,English is a difficult language to learn becau...,Tamy Chapman Helen Nam Brad Gilpin,4.7,131,131,,Welcome Nouns Articles and Quantifiers Gerunds...,conjunctions-connectives-adverb-clauses noun-c...,https://www.coursera.org/learn/tricky-english-...,47923,43883,109.2,19.0,0.0,Language Learning Learning English
1205,truthinourbones-osteoarchaeology-archaeology,Osteoarchaeology: The Truth in Our Bones,Health Basic Science,This course is about what we can learn from ex...,Andrea Waters-Rist,4.7,97,97,,Introduction to the course Bones to Biograph...,early-vertebrate-evolution theropods-birds,https://www.coursera.org/learn/truthinourbones...,17408,11063,157.4,33.0,0.1,Health Basic Science
1206,types-of-conflict,Types of Conflict,Business Business Essentials,Conflict is everywhere and it is impossible to...,Najla DeBow,4.3,116,116,Assertiveness Active Listening Communication M...,Getting Started Different Types of Conflict Po...,coaching-conversations coaching-practices,https://www.coursera.org/learn/types-of-conflict,23876,6622,360.6,8.0,0.0,Business Business Essentials Assertiveness Act...
1207,typography,Introduction to Typography,Arts and Humanities Music and Art,Typography is the art of manipulating the visu...,Anther Kiley,4.8,500,500,Adobe Indesign History Creativity Graphics,Week 1: Talking Type Week 2: Typefaces and the...,photo-composition camera-control,https://www.coursera.org/learn/typography,56285,54624,103.0,10.0,0.1,Arts and Humanities Music and Art Adobe Indesi...
1208,ui,Best Practices for iOS User Interface Design,Computer Science Mobile and Web Development,You will learn to develop sophisticated user i...,Don Patterson Sam Kaufman,4.6,44,44,,Getting to know iOS Design Concepts The Design...,ios-app-design-development ios-app-development...,https://www.coursera.org/learn/ui,11798,2671,441.7,22.0,0.0,Computer Science Mobile and Web Development


In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [6]:
label_encoder = LabelEncoder()
data['new_course_id'] = pd.Series(label_encoder.fit_transform(data['course_id']))

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 19 columns):
course_id                  1299 non-null object
title                      1299 non-null object
topics                     1299 non-null object
about                      1299 non-null object
instructors                1299 non-null object
average_score              1299 non-null float64
ratings_count              1299 non-null int64
reviews_count              1299 non-null int64
skills                     1299 non-null object
syllabus                   1299 non-null object
recommendations            1259 non-null object
url                        1299 non-null object
already_enrolled           1299 non-null int64
recent_views               1299 non-null int64
recent_views_conversion    1299 non-null float64
hours_to_complete          1298 non-null float64
level_range                1299 non-null float64
key_words                  1299 non-null object
new_course_id              12

In [8]:
data = data.drop(['recommendations'], axis=1)
data = data.drop(['hours_to_complete'], axis=1) 
#data.hours_to_complete.dropna(axis=1, how='any') how='any' означает "удалить столбец, если хотя бы одно значение пусто"

In [9]:
#data['data'] = data.hours_to_complete.fillna(value = 0.0)
#data['hours_to_complete'].value_counts()
#data = data.loc[(data['hours_to_complete'] != 'Nan')]

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 17 columns):
course_id                  1299 non-null object
title                      1299 non-null object
topics                     1299 non-null object
about                      1299 non-null object
instructors                1299 non-null object
average_score              1299 non-null float64
ratings_count              1299 non-null int64
reviews_count              1299 non-null int64
skills                     1299 non-null object
syllabus                   1299 non-null object
url                        1299 non-null object
already_enrolled           1299 non-null int64
recent_views               1299 non-null int64
recent_views_conversion    1299 non-null float64
level_range                1299 non-null float64
key_words                  1299 non-null object
new_course_id              1299 non-null int64
dtypes: float64(3), int64(5), object(9)
memory usage: 172.6+ KB


In [11]:
data_tfidf = data[['new_course_id','average_score',
                 'ratings_count','reviews_count','already_enrolled','recent_views',
                 'recent_views_conversion','level_range','key_words']]
data_tfidf.head()

Unnamed: 0,new_course_id,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,level_range,key_words
0,0,4.4,33,33,16728,5149,324.9,0.0,Business Business Essentials
1,1,4.5,92,92,11308,8209,137.8,0.1,Business Entrepreneurship
2,2,4.7,221,221,20657,10843,190.5,0.1,Business Entrepreneurship Materials Product De...
3,3,4.5,59,59,13266,20828,63.7,0.1,Business Entrepreneurship
4,4,4.7,92,92,41565,12090,343.8,0.0,Health Basic Science


#### Токенизация и очистка данных
Сделаем токенизацию слов из текстов key_words

In [12]:
mystopwords = stopwords.words('english') + ["i'm", '-', "i've"] 
regex = re.compile("['A-Za-z\-]+")

def tokenize(text, regex=regex, stopwords=mystopwords):
    """ Tokenize all tokens from text string
        Returns array of tokens
    """
    try:
        text = " ".join(regex.findall(text)).lower()
        tokens = ' '.join([token for token in text.split(' ') if not token in stopwords])
        return tokens
    except:
        return []

In [13]:
data_tfidf['key_words_tokenize'] = data_tfidf.key_words.apply(tokenize)
data_key_words_tokenize = data_tfidf.copy(deep=True)
data_tfidf.tail(10)

Unnamed: 0,new_course_id,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,level_range,key_words,key_words_tokenize
1289,1289,4.7,35,35,22176,3430,646.5,0.1,Arts and Humanities Philosophy,arts humanities philosophy
1290,1290,4.8,55,55,49412,21073,234.5,0.1,Language Learning Learning English,language learning learning english
1291,1291,4.8,299,299,30741,44009,69.9,0.1,Business Business Essentials Grammar Communica...,business business essentials grammar communica...
1292,1292,4.7,326,326,61688,16422,375.6,0.0,Language Learning Learning English,language learning learning english
1293,1293,4.7,64,64,17254,5698,302.8,0.0,Social Sciences Governance and Society,social sciences governance society
1294,1294,4.7,28,28,16553,3346,494.7,0.0,Personal Development Personal Development,personal development personal development
1295,1295,4.8,15,15,5532,5976,92.6,0.1,Health Public Health,health public health
1296,1296,4.8,70,70,8948,11144,80.3,0.1,Personal Development Personal Development Trai...,personal development personal development trai...
1297,1297,4.7,37,37,15334,2140,716.5,0.0,Arts and Humanities Philosophy,arts humanities philosophy
1298,1298,0.0,0,0,3968,0,0.0,0.5,Health Public Health,health public health


In [121]:
len(data.new_course_id)

1299

In [14]:
#data_t_s_1 = data_t_s.head(1000)
#data_t_s_tok = data_t_s_1
#data_t_s_tok['key_words_tokenize']

In [15]:
key_words = []
for i in tqdm_notebook(data_tfidf.key_words_tokenize.str.split(' ')):
        for j in i :
            key_words.append(j)

HBox(children=(IntProgress(value=0, max=1299), HTML(value='')))




In [16]:
dict_key_words_idf = {i:np.log(len(data_tfidf)/key_words.count(i)) for i in key_words}
dict_key_words_idf

{'business': 0.9408390130794164,
 'essentials': 3.10890700612418,
 'entrepreneurship': 3.09181257276488,
 'materials': 5.559912104236499,
 'product': 3.5858310782144898,
 'development': 1.7356280131163597,
 'new': 6.476202836110654,
 'human': 4.771454743872229,
 'computer': 1.7848549538815106,
 'interaction': 5.783055655550709,
 'health': 1.78945266313014,
 'basic': 3.2775297185599728,
 'science': 0.92907417149983,
 'arts': 2.213522959069339,
 'humanities': 2.2277075940612954,
 'music': 2.3818582738885534,
 'art': 2.534621028440964,
 'audio': 5.377590547442544,
 'recording': 5.377590547442544,
 'file': 7.1693500166706,
 'management': 1.5709280576722247,
 'mixing': 6.476202836110654,
 'abelton': 7.1693500166706,
 'live': 6.476202836110654,
 'midi': 6.476202836110654,
 'programming': 2.1654037107251405,
 'social': 2.02768646016794,
 'sciences': 2.26407523823217,
 'education': 3.1262987488360494,
 'language': 2.515389666513076,
 'learning': 2.1587147225743437,
 'english': 2.94984231149449

In [17]:
len(sorted(dict_key_words_idf.items(), key=lambda kv: kv[1]) )

1080

In [18]:
for i in dict_key_words_idf:
    data_tfidf['tf_idf_'+i] = data_tfidf.apply(lambda row: (1/len(row['key_words'].split(' ')))*dict_key_words_idf[i]if i in row['key_words'] else 0, axis=1)

In [19]:
data_tfidf.tail()

Unnamed: 0,new_course_id,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,level_range,key_words,key_words_tokenize,...,tf_idf_diversification,tf_idf_deception,tf_idf_exit,tf_idf_discovery-driven,tf_idf_solver,tf_idf_choosing,tf_idf_advisors,tf_idf_wind,tf_idf_task,tf_idf_training
1294,1294,4.7,28,28,16553,3346,494.7,0.0,Personal Development Personal Development,personal development personal development,...,0,0,0,0,0,0,0,0,0,0
1295,1295,4.8,15,15,5532,5976,92.6,0.1,Health Public Health,health public health,...,0,0,0,0,0,0,0,0,0,0
1296,1296,4.8,70,70,8948,11144,80.3,0.1,Personal Development Personal Development Trai...,personal development personal development trai...,...,0,0,0,0,0,0,0,0,0,0
1297,1297,4.7,37,37,15334,2140,716.5,0.0,Arts and Humanities Philosophy,arts humanities philosophy,...,0,0,0,0,0,0,0,0,0,0
1298,1298,0.0,0,0,3968,0,0.0,0.5,Health Public Health,health public health,...,0,0,0,0,0,0,0,0,0,0


In [20]:
data_tfidf.tf_idf_new.value_counts()

0.000000    1298
0.462586       1
Name: tf_idf_new, dtype: int64

In [21]:
data_tfidf = data_tfidf.drop(['key_words'], axis=1)
data_tfidf = data_tfidf.drop(['key_words_tokenize'], axis=1)

data_tfidf.tail(5)

Unnamed: 0,new_course_id,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,level_range,tf_idf_business,tf_idf_essentials,...,tf_idf_diversification,tf_idf_deception,tf_idf_exit,tf_idf_discovery-driven,tf_idf_solver,tf_idf_choosing,tf_idf_advisors,tf_idf_wind,tf_idf_task,tf_idf_training
1294,1294,4.7,28,28,16553,3346,494.7,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1295,1295,4.8,15,15,5532,5976,92.6,0.1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1296,1296,4.8,70,70,8948,11144,80.3,0.1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1297,1297,4.7,37,37,15334,2140,716.5,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1298,1298,0.0,0,0,3968,0,0.0,0.5,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
data_tfidf.describe()

Unnamed: 0,new_course_id,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,level_range,tf_idf_business,tf_idf_essentials,...,tf_idf_diversification,tf_idf_deception,tf_idf_exit,tf_idf_discovery-driven,tf_idf_solver,tf_idf_choosing,tf_idf_advisors,tf_idf_wind,tf_idf_task,tf_idf_training
count,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,...,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0
mean,649.0,4.443649,125.007698,125.007698,37073.15,36790.99,170.905158,0.110624,0.000122,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,375.13331,0.885474,163.022737,163.022737,97000.95,250752.1,155.363305,0.193248,0.003107,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,324.5,4.5,26.0,26.0,7878.0,5000.5,82.95,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,649.0,4.6,61.0,61.0,16627.0,11262.0,133.4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,973.5,4.7,150.0,150.0,37149.0,27053.5,218.55,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1298.0,5.0,965.0,965.0,2347044.0,8560147.0,1946.9,0.9,0.085531,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# сохраним data_tfidf  
#data_tfidf.to_csv('data_tfidf.csv') #выполняется 1 раз

In [24]:
#разделим обучающую и тестовую выборки
from sklearn.model_selection import train_test_split

In [25]:
X = data_tfidf.drop(columns=['average_score'])
y = data_tfidf['average_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [26]:
result = pd.DataFrame(y_test.reset_index(drop=True), columns=['target'])

## Линейная регрессия для предсказания оценки пользователя
Используем линейную регрессию в качестве бейзлайна, чтобы предсказать переменную - оценку пользователей (rating)

In [27]:
from sklearn.linear_model import LinearRegression # метод наименьших квадратов
from sklearn.metrics import mean_squared_log_error, mean_squared_error

In [28]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [29]:
predictions = model.predict(X_test)

In [30]:
mean_squared_error(predictions, y_test)

1.1570010308340617

In [31]:
X_test['predictions'] = predictions

In [32]:
X_test['LinearRegression_predictions'] = X_test['predictions']
models_res = X_test['LinearRegression_predictions']
result['LinearRegression'] = models_res #для визуализации

In [33]:
results = X_test.merge(data_tfidf, how='left', on='new_course_id')[
                                                        [ 'new_course_id' ,'average_score', 'predictions']]
results.sort_values('predictions', ascending=False).head()

Unnamed: 0,new_course_id,average_score,predictions
20,875,4.7,5.450157
95,231,4.7,5.347368
66,739,4.8,5.162371
128,86,4.7,5.152086
273,889,4.7,5.058907


#### Предскажем оценки пользователей используя RandomForestRegressor

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [35]:
X = data_tfidf.drop(columns=['average_score'])
y = data_tfidf['average_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [36]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [37]:
predictions = model.predict(X_test)
X_test['RandomForestRegressor_predictions'] = predictions
models_res.RandomForestRegressor_predictions = X_test['RandomForestRegressor_predictions']

print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, predictions)))

root_mean_squared_error =  0.33600902123754156


In [100]:
X_test.shape

(390, 1077)

In [38]:
print(model.feature_importances_)

[0.00754635 0.24816034 0.24103883 ... 0.         0.         0.        ]


In [39]:
results = X_test.merge(data_tfidf, how='left', on='new_course_id')[
                                                        [ 'new_course_id' ,'average_score', 'RandomForestRegressor_predictions']]
results.sort_values('RandomForestRegressor_predictions', ascending=False).head()

Unnamed: 0,new_course_id,average_score,RandomForestRegressor_predictions
363,355,4.6,4.84
272,1018,4.4,4.83
269,1183,4.9,4.83
294,718,4.9,4.83
171,998,4.9,4.82


#### Попробуем применить к исходным key_words CountVectorizer и TfidfTransformer

In [60]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [61]:
cv_data_tfidf = CountVectorizer()
tf_data_tfidf = TfidfTransformer()

In [62]:
data_tfidf = cv_data_tfidf.fit_transform(data_key_words_tokenize['key_words_tokenize'])

In [63]:
cv_data_tfidf.get_feature_names()

['abelton',
 'abstract',
 'abuse',
 'academic',
 'accessibility',
 'accounting',
 'accrual',
 'acquisitions',
 'active',
 'activity',
 'acute',
 'addiction',
 'adhd',
 'adjective',
 'adobe',
 'advertising',
 'advisors',
 'agile',
 'ai',
 'aid',
 'alcohol',
 'algebra',
 'algorithm',
 'algorithmic',
 'algorithms',
 'analysis',
 'analytics',
 'ancient',
 'android',
 'angularjs',
 'animal',
 'announcement',
 'anova',
 'anthropology',
 'antimicrobial',
 'apache',
 'api',
 'app',
 'application',
 'applications',
 'applied',
 'apposition',
 'appraisal',
 'appreciative',
 'arbitration',
 'architectural',
 'architecture',
 'archiving',
 'arduino',
 'array',
 'art',
 'artificial',
 'arts',
 'asd',
 'assembly',
 'assertiveness',
 'asset',
 'astrobiology',
 'astronomy',
 'attack',
 'attacks',
 'attention',
 'audio',
 'aurally',
 'authentication',
 'autism',
 'automated',
 'backlog',
 'backward',
 'bacteriology',
 'balance',
 'base',
 'based',
 'basic',
 'bayes',
 'bayesian',
 'behavior',
 'behavio

In [64]:
len(cv_data_tfidf.get_feature_names())

1077

In [65]:
tfidf_data_tfidf= tf_data_tfidf.fit_transform(data_tfidf)

In [66]:
tfidf_data_tfidf

<1299x1077 sparse matrix of type '<class 'numpy.float64'>'
	with 7742 stored elements in Compressed Sparse Row format>

In [67]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_data_tfidf, y, test_size=0.3,random_state=42)

In [68]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [120]:
X_train.shape

(909, 1077)

In [69]:
print(model.feature_importances_)

[0.00000000e+00 0.00000000e+00 4.56043984e-06 ... 0.00000000e+00
 0.00000000e+00 4.03632155e-06]


In [72]:
predictions1 = model.predict(X_test)

print('root_mean_squared_error = ', np.sqrt(mean_squared_error(predictions1, y_test)))

root_mean_squared_error =  0.9945978730493141


#### Попробуем применить GridSearch к моделям

In [74]:
from sklearn.model_selection import GridSearchCV

In [75]:
%%time
lr_params = {
    'fit_intercept':[False, True]
}

lr = LinearRegression()
grid_lr = GridSearchCV(lr, lr_params,
                       scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_lr.fit(X_train, y_train)

print(grid_lr.best_params_)
print(grid_lr.best_score_)
print(grid_lr.best_estimator_)

{'fit_intercept': True}
-1.0557599913099016
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
CPU times: user 287 ms, sys: 66.5 ms, total: 353 ms
Wall time: 3.7 s


In [76]:
#Оценим тчоностьна тесте
grid_lr.score(X_test,y_test)

-13.108054129026524

In [77]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_lr.best_estimator_.predict(X_test))))

root_mean_squared_error =  3.62050467877429


In [78]:
np.sqrt(-grid_lr.score(X_test,y_test))

3.62050467877429

#### Предскажем оценки пользователей используя KNeighborsRegressor

In [79]:
from sklearn.model_selection import RandomizedSearchCV

In [81]:
from sklearn.neighbors import KNeighborsRegressor

In [82]:
%%time

knn_params = {
    'n_neighbors':list(range(1, 30))
   ,'weights': ['uniform', 'distance']
   ,'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
   ,'leaf_size':list(range(5, 30))
}

grid_knn = RandomizedSearchCV(KNeighborsRegressor(), knn_params,  scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_knn.fit(X_train, y_train)
print(grid_knn.best_params_)
print(grid_knn.best_score_)
print(grid_knn.best_estimator_)

{'weights': 'uniform', 'n_neighbors': 27, 'leaf_size': 23, 'algorithm': 'kd_tree'}
-0.7433624044828675
KNeighborsRegressor(algorithm='kd_tree', leaf_size=23, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=27, p=2,
          weights='uniform')
CPU times: user 110 ms, sys: 94.6 ms, total: 205 ms
Wall time: 3.32 s




In [89]:
predictions2 = grid_knn.best_estimator_.predict(X_test)
predictions2

array([4.64074066, 4.6518519 , 4.27037038, 4.09999992, 4.33703702,
       4.42592589, 4.64444436, 4.26296296, 4.337037  , 4.44444441,
       4.05555555, 4.13333334, 4.20000002, 4.47777777, 4.65185181,
       4.11111104, 4.17037038, 4.23333331, 4.30000001, 4.52962954,
       4.47407405, 4.48148152, 4.62962961, 4.44814818, 4.33703702,
       4.71111116, 4.3037037 , 4.22962962, 4.15925923, 4.66296291,
       4.45555554, 4.35555555, 4.10740734, 4.17407408, 4.34444442,
       4.26296296, 4.10740734, 4.1814815 , 4.68148145, 4.23703701,
       4.47037035, 4.42592596, 4.07407405, 4.40740741, 4.62962959,
       4.44444448, 4.56666659, 4.38888882, 4.15925923, 4.34074072,
       3.8259259 , 4.44444444, 4.44074081, 4.14814817, 4.48518516,
       3.90740738, 4.15555557, 4.43703708, 4.29629629, 4.53703699,
       4.49259258, 4.09259257, 4.25555554, 4.70370376, 4.11111104,
       4.15555557, 4.30740741, 4.55185187, 4.28518517, 4.20000002,
       4.43703699, 4.55555548, 4.62962959, 4.48518522, 4.69999

In [83]:
grid_knn.score(X_test,y_test)

-0.9595446497387168

In [84]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_knn.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.9795634995949557


#### Предскажем оценки пользователей используя DecisionTreeRegressor

In [102]:
from sklearn.tree import DecisionTreeRegressor

In [103]:
%%time
dt_params = {
    'max_depth':[None,1,2,5,10,25,50],
     'min_samples_split':[2,5,8,10,25,50],
    
    'min_weight_fraction_leaf': [0, 0.01, 0.1, 0.15, 0.25, 0.5] ,
    'min_samples_leaf':list(range(1, 10)),
     'criterion':  ['mse', 'friedman_mse', 'mae'],
    'max_features':list(range(1, 13)) }    

grid_dt =  RandomizedSearchCV(DecisionTreeRegressor(),dt_params,  scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_dt.fit(X_train, y_train)
print(grid_dt.best_params_)
print(grid_dt.best_score_)
print(grid_dt.best_estimator_)

{'min_weight_fraction_leaf': 0.5, 'min_samples_split': 25, 'min_samples_leaf': 1, 'max_features': 1, 'max_depth': 50, 'criterion': 'mse'}
-0.7027603165377755
DecisionTreeRegressor(criterion='mse', max_depth=50, max_features=1,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=25, min_weight_fraction_leaf=0.5,
           presort=False, random_state=None, splitter='best')
CPU times: user 95.8 ms, sys: 64 ms, total: 160 ms
Wall time: 3.13 s


In [104]:
grid_dt.score(X_test,y_test)

-0.9744468953149691

In [105]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_dt.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.9871407677302002


#### Предскажем оценки пользователей используя RandomForestRegressor

In [106]:
%%time
rf_params = {
    'n_estimators': [1,10,20,30,40,50,60,80,90],
    
    'max_depth':[None,1,2,5,10,25,50],
    
    'min_samples_leaf':list(range(1, 10)),
   
    'max_features':list(range(1, 13)) ,
     'criterion':  ['mse', 'friedman_mse', 'mae']}
    

grid_rf =  RandomizedSearchCV(RandomForestRegressor(),rf_params,scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_rf.fit(X_train, y_train)
print(grid_rf.best_params_)
print(grid_rf.best_score_)
print(grid_rf.best_estimator_)

{'n_estimators': 10, 'min_samples_leaf': 9, 'max_features': 8, 'max_depth': None, 'criterion': 'friedman_mse'}
-0.7009084907206076
RandomForestRegressor(bootstrap=True, criterion='friedman_mse',
           max_depth=None, max_features=8, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=9, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
CPU times: user 71.4 ms, sys: 18.7 ms, total: 90.1 ms
Wall time: 521 ms


In [107]:
grid_rf.score(X_test,y_test)

-0.9680490431858803

In [108]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_rf.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.9838948333972896


#### Предскажем оценки пользователей используя GradientBoostingRegressor

In [109]:
%%time
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
gb_params = {
          'n_estimators': [1,10,20,30,40,50,60,80,90],
          'max_features': list(range(1, 13)),
             'max_depth':[None,1,2,5,10,25,50],
            'learning_rate': [0.1,0.3,0.5,0.7],
               #'min_samples_split':[2,5,8,10,25,50],
            'min_samples_leaf':list(range(1, 10)),}
    
grid_gb =  RandomizedSearchCV(GradientBoostingRegressor(),gb_params,scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_gb.fit(X_train, y_train)
print(grid_gb.best_params_)
print(grid_gb.best_score_)
print(grid_gb.best_estimator_)

{'n_estimators': 90, 'min_samples_leaf': 7, 'max_features': 11, 'max_depth': None, 'learning_rate': 0.1}
-0.6946765753222929
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=None, max_features=11,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=7,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=90, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)
CPU times: user 95.8 ms, sys: 12 ms, total: 108 ms
Wall time: 965 ms


In [110]:
grid_gb.score(X_test,y_test)

-0.9074092804486722

In [111]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_gb.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.9525803275570371


#### Предскажем оценки пользователей используя SVR

In [112]:
from sklearn.svm import SVR

In [113]:
%%time
SVR_params = {
          #'kernel':['linear', 'poly','rbf', 'sigmoid', 'precomputed'],
             'C' :[0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]}

grid_SVR =   GridSearchCV(SVR(),SVR_params,scoring='neg_mean_squared_error',
                       cv=3,n_jobs=-1)
grid_SVR.fit(X_train, y_train)
print(grid_SVR.best_params_)
print(grid_SVR.best_score_)
print(grid_SVR.best_estimator_)

{'C': 1, 'gamma': 1}
-0.7101656371832971
SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=1,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
CPU times: user 130 ms, sys: 25.3 ms, total: 156 ms
Wall time: 1.04 s


In [114]:
grid_SVR.score(X_test,y_test)

-0.9679792679831994

In [115]:
print('root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, grid_SVR.best_estimator_.predict(X_test))))

root_mean_squared_error =  0.9838593740892035


In [116]:
models=['SVR','gb','dt','rf','knn','lr']

In [117]:
for model_ in models:
    print(model_,'---','root_mean_squared_error = ', np.sqrt(mean_squared_error(y_test, eval(
                                                        'grid_'+model_+'.best_estimator_').predict(X_test))))


SVR --- root_mean_squared_error =  0.9838593740892035
gb --- root_mean_squared_error =  0.9525803275570371
dt --- root_mean_squared_error =  0.9871407677302002
rf --- root_mean_squared_error =  0.9838948333972896
knn --- root_mean_squared_error =  0.9795634995949557
lr --- root_mean_squared_error =  3.62050467877429


In [None]:
#видим, что нилучший результат у модели SVR