In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import time
import re
import seaborn as sns
import multiprocessing

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import xgboost
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

from sklearn import utils
from sklearn.metrics import accuracy_score, f1_score, brier_score_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression

In [2]:
# Dataset with the descriptions of the English-language courses
courses_eng = pd.read_csv(filepath_or_buffer='courses_eng.csv')
courses_eng.head(n=2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,course_id,title,topics,about,instructors,language,metrics,product_glance,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url
0,0,0,2-speed-it,Two Speed IT: How Companies Can Surf the Digit...,"['Business', 'Business Essentials']","Transform or disappear, the Darwinism of IT: I...","['Antoine Gourévitch', 'Vanessa Lyon', 'Eric B...",English,"['16,728 already enrolled!', '5,149 recent vie...","['100% online', 'Flexible deadlines', 'Approx....",4.4,33,33,[],"['Introduction', 'IT and the CIO in the Digita...","['fundamentals-of-management', 'entrepreneuria...",https://www.coursera.org/learn/2-speed-it
1,5,5,3d-printing-applications,3D Printing Applications,"['Business', 'Entrepreneurship']",This course will help you understand how 3D pr...,['Vishal Sachdev'],English,"['11,308 already enrolled!', '8,209 recent vie...","['100% online', 'Flexible deadlines', 'Beginne...",4.5,92,92,[],"['Course Orientation', 'Module 1: 3D Printing ...","['mechanics2', 'beam-bending']",https://www.coursera.org/learn/3d-printing-app...


In [3]:
# Dataset with the reviews of the English-language courses
reviews_eng_courses = pd.read_csv(filepath_or_buffer='reviews_eng_courses.csv')
reviews_eng_courses.head(n=5)

Unnamed: 0,url,course_id,reviewer_name,rating,review_text
0,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs
1,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Etienne R,2,This course does not say anything about digiti...
2,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Viswas P,4,Videos that are presented in French could've b...
3,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,AN L,3,"The course content is quite good, though it co..."
4,https://www.coursera.org/learn/2-speed-it/revi...,2-speed-it,Konstantin A,5,"Great piece of work, I especially liked a few ..."


Токенизация и очистка данных
Сделаем токенизацию слов из отзывов

In [None]:
# NLTK English stop words plus a few extra tokens that should also be dropped.
mystopwords = stopwords.words('english') + ["i'm", '-', "i've"]
# Raw string: `\-` is a regex escape, not a (deprecated) invalid string escape.
# A token is any run of ASCII letters, apostrophes and hyphens.
regex = re.compile(r"['A-Za-z\-]+")


def tokenize(text, regex=regex, stopwords=mystopwords):
    """Clean a text string and return its remaining tokens joined by spaces.

    The text is split into tokens with ``regex``, every token is lower-cased
    and tokens found in ``stopwords`` are removed.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example a missing value)
        yields an empty string.
    regex : compiled pattern
        Pattern whose ``findall`` extracts the tokens.
    stopwords : container of str
        Lower-case tokens to drop.

    Returns
    -------
    str
        Space-separated tokens; ``''`` when nothing is left or when ``text``
        is not a string. The result is always a string so that callers can
        safely call ``.split()`` on it.
    """
    # Previously a bare `except` returned a list here, which broke every
    # downstream `.split()` call; an empty string keeps the type consistent.
    if not isinstance(text, str):
        return ''
    tokens = (match.lower() for match in regex.findall(text))
    return ' '.join(token for token in tokens if token not in stopwords)

In [None]:
# Store the cleaned version of every review in a new `text` column.
reviews_eng_courses['text'] = reviews_eng_courses['review_text'].apply(tokenize)

In [None]:
# Preview the first rows, now including the cleaned `text` column.
reviews_eng_courses.head(n=5)

In [None]:
### Count how often each word occurs in the reviews (a word cloud is drawn later for clarity)
from collections import Counter

# Flatten the cleaned reviews into one list of tokens.
lemmata = [
    token
    for review in reviews_eng_courses['text']
    for token in review.split()
]
cnt = Counter(lemmata)

# Show the 15 most frequent tokens with their counts.
for word_count in cnt.most_common(15):
    print(word_count)

In [None]:
# Number of distinct words in the vocabulary.
# (The original line had an unbalanced parenthesis and called `.format` on an int.)
print(len(cnt))

In [None]:
# NOTE(review): wildcard import kept unchanged because cells outside this view
# may rely on other names it provides; only `WordCloud` is used here.
from wordcloud import *

# The 100 most frequent tokens as (token, count) pairs.
word_freq = list(cnt.most_common(100))
wd = WordCloud(background_color='white').generate_from_frequencies(dict(word_freq))

plt.figure()
plt.imshow(wd, interpolation='bilinear')
plt.axis('off')
plt.show()

## Сформируем сбалансированный датасет с обучающей и тестовой выборкой
Поскольку датасет несбалансирован, применим Undersampling подход

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Features: the cleaned review text; target: the `rating` column.
X = reviews_eng_courses[['text']]
y = reviews_eng_courses['rating']

# Fixed random_state so the undersampled selection is repeatable.
randUS = RandomUnderSampler(random_state=0)
X_balanced, y_balanced = randUS.fit_resample(X, y)

In [None]:
# Collect the resampled texts and ratings in one frame.
# `fit_resample` returns NumPy arrays in older imbalanced-learn releases and
# pandas objects in newer ones; `X_balanced[:, 0]` only works for the former.
# Converting with np.asarray handles both and avoids index alignment surprises.
balanced = pd.DataFrame({
    'text': np.asarray(X_balanced)[:, 0],
    'target': np.asarray(y_balanced),
})
balanced.head()

In [None]:
# Split the balanced dataset into a training part (80%) and a test part (20%)
train, test = train_test_split(balanced, random_state=42, test_size=0.2)

### Построим векторную модель с помощью Doc2Vec


In [None]:
import multiprocessing

# CPU count of this machine; passed to Doc2Vec as the number of worker threads.
cores = multiprocessing.cpu_count()

In [None]:
def _to_tagged_document(row):
    """Wrap one row as a TaggedDocument: its words, tagged with its target."""
    return TaggedDocument(words=row['text'].split(' '), tags=[row.target])


train_tagged = train.apply(_to_tagged_document, axis=1)
test_tagged = test.apply(_to_tagged_document, axis=1)

In [None]:
# Inspect one tagged document (position 30) as a sanity check.
train_tagged.iloc[30]

In [None]:
# Doc2Vec model in DBOW mode (dm=0) with negative sampling.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=2000,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
# Build the vocabulary from the training documents only.
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

In [None]:
# Train the model.
# One train() call with epochs=10 lets gensim decay the learning rate smoothly
# from model.alpha to model.min_alpha over the whole run. The previous loop
# called train(epochs=1) ten times while editing alpha by hand, which gave an
# irregular schedule (a full decay in the first pass, then a jump back up) and
# left alpha == min_alpha == 0.005 on the model, affecting later infer_vector
# calls that fall back to those attributes.
train_corpus = utils.shuffle(list(tqdm(train_tagged.values)))
model_dbow.train(train_corpus, total_examples=len(train_corpus), epochs=10)

In [None]:
# Build the final set of vectors used for training and evaluation
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer a vector for every tagged document and pair it with its label.

    Parameters
    ----------
    model : object with an ``infer_vector(words, epochs=...)`` method
        The trained Doc2Vec model.
    tagged_docs : pandas.Series
        Series whose elements expose ``.words`` (list of tokens) and ``.tags``
        (list whose first item is the label).
    epochs : int, default 20
        Number of inference passes per document. Passed as ``epochs=`` because
        the older ``steps=`` keyword was removed from ``infer_vector`` in
        gensim 4.

    Returns
    -------
    (tuple, tuple)
        ``(targets, regressors)``: the labels and the inferred vectors, in the
        order of ``tagged_docs``. Both are empty tuples for empty input (the
        previous ``zip(*...)`` unpacking raised on empty input).
    """
    docs = tagged_docs.values
    targets = tuple(doc.tags[0] for doc in docs)
    regressors = tuple(model.infer_vector(doc.words, epochs=epochs) for doc in docs)
    return targets, regressors

In [None]:
## Логистическая регрессия

In [None]:
# Labels and inferred document vectors for the training and the test split.
y_train, X_train = vec_for_learning(model=model_dbow, tagged_docs=train_tagged)
y_test, X_test = vec_for_learning(model=model_dbow, tagged_docs=test_tagged)

In [None]:
# Fit a logistic regression on the document vectors (large C = weak
# regularisation) and predict labels for the test split.
logreg = LogisticRegression(C=1e5, n_jobs=1)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [None]:
# Report accuracy and the support-weighted F1 score on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % test_accuracy)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(test_f1))