In [1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from tqdm import tqdm_notebook

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score

import seaborn as sns

import multiprocessing

import re

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

%matplotlib inline

In [2]:
# Combined dataframe: descriptions of English-language courses joined with user reviews.
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=2)

Unnamed: 0.1,Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,average_score,...,reviews_count,skills,syllabus,recommendations,url,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range
0,0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,...,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,16728,5149,324.9,21.0,0.0
1,1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,...,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,16728,5149,324.9,21.0,0.0


In [3]:
# Get rid of the superfluous 'Unnamed: 0' column.
data = data.drop(columns=['Unnamed: 0'])
# Replace NaN in `skills` with an empty string.
data['skills'] = data['skills'].fillna('')
data.head(2)

Unnamed: 0,course_id,reviewer_name,rating,review_text,title,topics,about,instructors,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range
0,2-speed-it,Ravish,5,Very relevant and useful course designed for CIOs,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,16728,5149,324.9,21.0,0.0
1,2-speed-it,Etienne R,2,This course does not say anything about digiti...,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,16728,5149,324.9,21.0,0.0


In [4]:
#data=data.drop(['recommendations'], axis=1)
#data=data.drop(['url'], axis=1)

#### Токенизация и очистка данных

In [5]:
import pymorphy2
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

In [6]:
class MorphProvider:
    """Caching word normalizer built on ``pymorphy2.MorphAnalyzer``.

    Calling an instance with a word lower-cases it and returns the
    ``normal_form`` of the analyzer's first parse. Results are memoised per
    lower-cased word so repeated words are parsed only once.
    """

    def __init__(self):
        # lower-cased word -> normal form
        self.cache = {}
        self.morph = pymorphy2.MorphAnalyzer()

    def __call__(self, w):
        """Return the normal form of ``w``, or ``None`` if parsing fails.

        Args:
            w: a single word (string); it is lower-cased before lookup.

        Returns:
            The cached or freshly computed normal form, or ``None`` when the
            analyzer raises or yields no parse.
        """
        w = w.lower()
        # Membership test (not truthiness) so an empty cached form is still a hit.
        if w in self.cache:
            return self.cache[w]
        try:
            morphed = self.morph.parse(w)[0].normal_form
        except Exception:
            # Best effort: an unparsable word is reported as None, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return None
        self.cache[w] = morphed
        return morphed

    def morph_string(self, s):
        """Tokenize ``s`` and return its normalized words joined by spaces.

        Words for which ``__call__`` returns ``None`` are skipped; previously a
        single such word made ``" ".join`` raise ``TypeError``.
        """
        words = word_tokenize(s)
        normalized = (self(w) for w in words)
        return " ".join(form for form in normalized if form is not None)

In [7]:
# One shared normalizer instance, so its per-word cache is reused on every call.
morph = MorphProvider()
# normalize all texts in all columns

In [8]:
# Normalize each course title; tqdm shows progress over the rows.
data["title_normalized"] = [morph.morph_string(title) for title in tqdm(data["title"])]

100%|██████████| 159253/159253 [00:16<00:00, 9846.92it/s] 


In [9]:
# Normalize each course "about" text; tqdm shows progress over the rows.
data["about_normalized"] = [morph.morph_string(about) for about in tqdm(data["about"])]


100%|██████████| 159253/159253 [03:52<00:00, 683.90it/s] 


In [10]:
# Normalize each row's topics string; tqdm shows progress over the rows.
data["topics_normalized"] = [morph.morph_string(topics) for topics in tqdm(data["topics"])]


100%|██████████| 159253/159253 [00:15<00:00, 10207.87it/s]


In [11]:
# Normalize each syllabus text; tqdm shows progress over the rows.
data["syllabus_normalized"] = [morph.morph_string(syllabus) for syllabus in tqdm(data["syllabus"])]


100%|██████████| 159253/159253 [00:32<00:00, 4845.57it/s]


In [12]:
# Fill missing reviews with '' *before* casting: astype(str) alone would turn
# NaN into the literal text 'nan', which would then be normalized like a real review.
data['review_text'] = data['review_text'].fillna('').astype(str)

data["review_text_normalized"] = [morph.morph_string(review) for review in tqdm(data['review_text'])]


100%|██████████| 159253/159253 [00:45<00:00, 3531.90it/s]


In [13]:
# Make sure every `skills` value is a string before it is tokenized.
data['skills'] = data['skills'].astype(str)

data["skills_normalized"] = [morph.morph_string(skill_text) for skill_text in tqdm(data['skills'])]


100%|██████████| 159253/159253 [00:15<00:00, 10308.21it/s]


In [14]:
# Fill missing recommendations with '' *before* casting: astype(str) alone would
# turn NaN into the literal text 'nan', which would look like a recommended course id.
data['recommendations'] = data['recommendations'].fillna('').astype(str)

# TODO: decide whether to use these as additional features or as the supervision target.
data["recommendations_normalized"] = [morph.morph_string(rec) for rec in tqdm(data['recommendations'])]


100%|██████████| 159253/159253 [00:18<00:00, 8646.28it/s] 


In [15]:
#очистим от лишних символов текст 

import re
# Matches runs of ASCII letters, apostrophes and hyphens.
# Raw string: "\-" inside a regular string literal is an invalid escape sequence.
regex = re.compile(r"['A-Za-z\-]+")

def words_only(text, regex=regex):
    """Keep only the word-like runs of ``text``, joined by single spaces.

    Args:
        text: string to filter.
        regex: compiled pattern whose matches are kept.

    Returns:
        The matches joined with spaces, or "" when ``text`` is not something
        the pattern can be applied to (e.g. ``None`` or a float NaN).
    """
    try:
        return " ".join(regex.findall(text))
    except TypeError:
        # Non-string input (None, NaN, numbers) is treated as empty text.
        return ""
    
# Apply the filter: strip unwanted characters from the normalized titles.
data["title_normalized"] = [words_only(title) for title in tqdm(data["title_normalized"])]


100%|██████████| 159253/159253 [00:00<00:00, 343731.65it/s]


In [16]:
# Strip unwanted characters from the normalized "about" texts.
data["about_normalized"] = [words_only(about) for about in tqdm(data["about_normalized"])]

100%|██████████| 159253/159253 [00:08<00:00, 18694.92it/s]


In [17]:
# Strip unwanted characters from the normalized topics.
data["topics_normalized"] = [words_only(topics) for topics in tqdm(data["topics_normalized"])]

100%|██████████| 159253/159253 [00:00<00:00, 452078.82it/s]


In [18]:
# Strip unwanted characters from the normalized syllabus texts.
data["syllabus_normalized"] = [words_only(syllabus) for syllabus in tqdm(data["syllabus_normalized"])]

100%|██████████| 159253/159253 [00:01<00:00, 131015.93it/s]


In [19]:
# Strip unwanted characters from the normalized review texts.
data["review_text_normalized"] = [words_only(review) for review in tqdm(data["review_text_normalized"])]

100%|██████████| 159253/159253 [00:01<00:00, 124715.52it/s]


In [20]:
# Strip unwanted characters from the normalized skills.
data["skills_normalized"] = [words_only(skill_text) for skill_text in tqdm(data["skills_normalized"])]

100%|██████████| 159253/159253 [00:00<00:00, 397910.41it/s]


In [21]:
# Strip unwanted characters from the normalized recommendations.
data["recommendations_normalized"] = [words_only(rec) for rec in tqdm(data["recommendations_normalized"])]

100%|██████████| 159253/159253 [00:00<00:00, 635329.42it/s]


In [22]:
# Keep the identifier / numeric columns together with the cleaned
# "*_normalized" text columns; the raw text columns stay behind in `data`.
data_normalized = data[[
    # identifiers and per-review fields
    'course_id', 'reviewer_name', 'rating',
    # course-level metadata
    'instructors', 'average_score', 'ratings_count', 'reviews_count',
    'already_enrolled', 'recent_views', 'recent_views_conversion',
    'hours_to_complete', 'level_range',
    # normalized text columns
    'title_normalized', 'about_normalized', 'topics_normalized',
    'syllabus_normalized', 'review_text_normalized', 'skills_normalized',
    'recommendations_normalized',
]]
data_normalized.head()

Unnamed: 0,course_id,reviewer_name,rating,instructors,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range,title_normalized,about_normalized,topics_normalized,syllabus_normalized,review_text_normalized,skills_normalized,recommendations_normalized
0,2-speed-it,Ravish,5,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed it how companies can surf the digita...,transform or disappear the darwinism of it in ...,business business essentials,introduction it and the cio in the digital wor...,very relevant and useful course designed for cios,,fundamentals-of-management entrepreneurial-thi...
1,2-speed-it,Etienne R,2,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed it how companies can surf the digita...,transform or disappear the darwinism of it in ...,business business essentials,introduction it and the cio in the digital wor...,this course does not say anything about digiti...,,fundamentals-of-management entrepreneurial-thi...
2,2-speed-it,Viswas P,4,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed it how companies can surf the digita...,transform or disappear the darwinism of it in ...,business business essentials,introduction it and the cio in the digital wor...,videos that are presented in french could 've ...,,fundamentals-of-management entrepreneurial-thi...
3,2-speed-it,AN L,3,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed it how companies can surf the digita...,transform or disappear the darwinism of it in ...,business business essentials,introduction it and the cio in the digital wor...,the course content is quite good though it cou...,,fundamentals-of-management entrepreneurial-thi...
4,2-speed-it,Konstantin A,5,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed it how companies can surf the digita...,transform or disappear the darwinism of it in ...,business business essentials,introduction it and the cio in the digital wor...,great piece of work i especially liked a few '...,,fundamentals-of-management entrepreneurial-thi...


In [23]:
#Токенизация

In [24]:
import re

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

In [25]:
# NLTK's English stop words, extended with a few contractions, stray
# punctuation tokens and extra frequent words (e.g. 'course', 'really').
mystopwords = stopwords.words('english') + ["i'm", '-', "i've"] + ["\\", "\"", "'", "\'"] + ['many','get','one','way','courses','lot','much','could','really', 'would','also', 'course']
# Raw string: "\-" inside a regular string literal is an invalid escape sequence.
regex = re.compile(r"['A-Za-z\-]+")

def tokenize(text, regex=regex, stopwords=mystopwords):
    """Lower-case ``text``, keep only word-like runs and drop stop words.

    Args:
        text: string to process.
        regex: compiled pattern whose matches are kept as candidate tokens.
        stopwords: collection of lower-case tokens to discard.

    Returns:
        The surviving tokens joined into ONE space-separated string (not a
        list). Input the pattern cannot be applied to (e.g. ``None`` / NaN)
        yields "" so the result type is always ``str``; previously that case
        returned ``[]``.
    """
    try:
        text = " ".join(regex.findall(text)).lower()
    except TypeError:
        # Non-string input is treated as empty text.
        return ""
    return ' '.join(token for token in text.split(' ') if token not in stopwords)

In [26]:
# Start from the non-text columns. The explicit .copy() makes this an
# independent frame, so adding columns below is not an assignment onto a slice
# of `data_normalized` (which raises SettingWithCopyWarning).
data_normalized_tokenize = data_normalized[['course_id','reviewer_name','rating',
                       'instructors','average_score','ratings_count','reviews_count',
                       'already_enrolled','recent_views','recent_views_conversion',
                       'hours_to_complete','level_range']].copy()

# Tokenize every normalized text column into a matching "<name>_tokenize" column.
for column in ('title_normalized', 'about_normalized', 'topics_normalized',
               'syllabus_normalized', 'review_text_normalized', 'skills_normalized',
               'recommendations_normalized'):
    data_normalized_tokenize[column + '_tokenize'] = data_normalized[column].apply(tokenize)

data_normalized_tokenize.head()

Unnamed: 0,course_id,reviewer_name,rating,instructors,average_score,ratings_count,reviews_count,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range,title_normalized_tokenize,about_normalized_tokenize,topics_normalized_tokenize,syllabus_normalized_tokenize,review_text_normalized_tokenize,skills_normalized_tokenize,recommendations_normalized_tokenize
0,2-speed-it,Ravish,5,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed companies surf digital wave bcg pers...,transform disappear darwinism order adapt digi...,business business essentials,introduction cio digital world steer balance s...,relevant useful designed cios,,fundamentals-of-management entrepreneurial-thi...
1,2-speed-it,Etienne R,2,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed companies surf digital wave bcg pers...,transform disappear darwinism order adapt digi...,business business essentials,introduction cio digital world steer balance s...,say anything digitization core subject digital...,,fundamentals-of-management entrepreneurial-thi...
2,2-speed-it,Viswas P,4,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed companies surf digital wave bcg pers...,transform disappear darwinism order adapt digi...,business business essentials,introduction cio digital world steer balance s...,videos presented french 've translated english,,fundamentals-of-management entrepreneurial-thi...
3,2-speed-it,AN L,3,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed companies surf digital wave bcg pers...,transform disappear darwinism order adapt digi...,business business essentials,introduction cio digital world steer balance s...,content quite good though deeper areas peer re...,,fundamentals-of-management entrepreneurial-thi...
4,2-speed-it,Konstantin A,5,Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,16728,5149,324.9,21.0,0.0,two speed companies surf digital wave bcg pers...,transform disappear darwinism order adapt digi...,business business essentials,introduction cio digital world steer balance s...,great piece work especially liked 'lifehacks cio,,fundamentals-of-management entrepreneurial-thi...
