In [10]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from tqdm import tqdm_notebook

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score

import seaborn as sns

import multiprocessing

import re

from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

%matplotlib inline

In [11]:
# Load the course table from CSV, replace missing `skills` values with an
# empty string, and discard the 'Unnamed: 0' column
# (presumably a row index saved by an earlier export -- verify against the file).
courses_eng = (
    pd.read_csv('courses_eng_1.csv')
    .assign(skills=lambda frame: frame.skills.fillna(value=''))
    .drop(columns=['Unnamed: 0'])
)

courses_eng.head(2)

Unnamed: 0,course_id,title,topics,about,instructors,average_score,ratings_count,reviews_count,skills,syllabus,recommendations,url,already_enrolled,recent_views,recent_views_conversion,hours_to_complete,level_range
0,2-speed-it,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,4.4,33,33,,Introduction IT and the CIO in the Digital Wor...,fundamentals-of-management entrepreneurial-thi...,https://www.coursera.org/learn/2-speed-it,16728,5149,324.9,21.0,0.0
1,3d-printing-applications,3D Printing Applications,Business Entrepreneurship,This course will help you understand how 3D pr...,Vishal Sachdev,4.5,92,92,,Course Orientation Module 1: 3D Printing – A N...,mechanics2 beam-bending,https://www.coursera.org/learn/3d-printing-app...,11308,8209,137.8,17.0,0.1


In [12]:
# Print a summary of the loaded frame: columns, non-null counts and dtypes.
courses_eng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 17 columns):
course_id                  1299 non-null object
title                      1299 non-null object
topics                     1299 non-null object
about                      1299 non-null object
instructors                1299 non-null object
average_score              1299 non-null float64
ratings_count              1299 non-null int64
reviews_count              1299 non-null int64
skills                     1299 non-null object
syllabus                   1299 non-null object
recommendations            1259 non-null object
url                        1299 non-null object
already_enrolled           1299 non-null int64
recent_views               1299 non-null int64
recent_views_conversion    1299 non-null float64
hours_to_complete          1298 non-null float64
level_range                1299 non-null float64
dtypes: float64(4), int64(4), object(9)
memory usage: 172.6+ KB


In [13]:
# Remove the rating, popularity and metadata columns in a single call.
# One drop with a list replaces ten chained single-column drops, each of
# which built a new intermediate DataFrame.
courses_eng = courses_eng.drop(
    columns=[
        'average_score',
        'ratings_count',
        'reviews_count',
        'url',
        'already_enrolled',
        'recommendations',
        'recent_views',
        'recent_views_conversion',
        'hours_to_complete',
        'level_range',
    ]
)


In [14]:
# Preview the trimmed frame. A bounded slice is shown instead of the whole
# frame so the saved notebook output stays small and readable.
courses_eng.head(10)

Unnamed: 0,course_id,title,topics,about,instructors,skills,syllabus
0,2-speed-it,Two Speed IT: How Companies Can Surf the Digit...,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I...",Antoine Gourévitch Vanessa Lyon Eric Baudson,,Introduction IT and the CIO in the Digital Wor...
1,3d-printing-applications,3D Printing Applications,Business Entrepreneurship,This course will help you understand how 3D pr...,Vishal Sachdev,,Course Orientation Module 1: 3D Printing – A N...
2,3d-printing-revolution,The 3D Printing Revolution,Business Entrepreneurship,This course will demonstrate how 3D printers w...,Aric Rindfleisch,Materials Product Development New Product Deve...,Course Orientation Module 1: What Is 3D Printi...
3,3d-printing-software,3D Printing Software,Business Entrepreneurship,This course will demonstrate how to use 3D pri...,Jeffrey Smith,,Course Orientation Module 1: Design Sketching ...
4,abdomen-anatomy,Anatomy of the Abdomen and Pelvis; a journey f...,Health Basic Science,In this anatomy course you will explore the or...,Marco De Ruiter PhD Paul Gobée MD Beerend P. H...,,Introduction Mapping the abdomen and pelvis Tr...
5,ableton-live,Introduction to Ableton Live,Arts and Humanities Music and Art,"In the past, Digital Audio Workstations (DAWs)...",Erin Barra,Audio Recording File Management Music mixing A...,Welcome to Introduction to Ableton Live Gettin...
6,aboriginal-education,Aboriginal Worldviews and Education,Social Sciences Education,Intended for both Aboriginal and non-Aborigina...,Jean-Paul Restoule,,Welcome and Orientation to the Course Aborigin...
7,academic-discussion-english,Academic Discussions in English,Language Learning Learning English,This is the third course in the Learn English:...,Tamy Chapman Meg Parker Helen Nam Brad Gilpin,,Introduction to Discussions Elaboration Intera...
8,academic-literacy,Academic Literacy,Language Learning Learning English,This is the first course of the English for Re...,Elena Bazanova,,MODULE I: THE READING–RESEARCH–WRITING CONTINU...
9,academic-writing-capstone,Project: Writing a Research Paper,Language Learning Learning English,Welcome to the capstone project for the Academ...,Tamy Chapman Helen Nam Brad Gilpin,,Getting Started Research and Annotated Bibliog...


In [15]:
# Combine topics & skills into a new feature, key_words.
courses_eng['key_words'] = courses_eng.topics.map(str) + ' ' + courses_eng.skills
# Combine about & syllabus into a new feature, course_descr.
courses_eng['course_descr'] = courses_eng.about.map(str) + ' ' + courses_eng.syllabus
# The four source columns are now redundant; remove them in one call rather
# than four separate single-column drops.
courses_eng = courses_eng.drop(columns=['topics', 'skills', 'about', 'syllabus'])
courses_eng.head(2)

Unnamed: 0,course_id,title,instructors,key_words,course_descr
0,2-speed-it,Two Speed IT: How Companies Can Surf the Digit...,Antoine Gourévitch Vanessa Lyon Eric Baudson,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I..."
1,3d-printing-applications,3D Printing Applications,Vishal Sachdev,Business Entrepreneurship,This course will help you understand how 3D pr...


In [16]:
# Work on an independent deep copy so that later changes to either frame
# do not leak into the other.
courses_eng_descr_text = courses_eng.copy(deep=True)
# Show a bounded preview instead of dumping the entire frame.
courses_eng_descr_text.head(10)

Unnamed: 0,course_id,title,instructors,key_words,course_descr
0,2-speed-it,Two Speed IT: How Companies Can Surf the Digit...,Antoine Gourévitch Vanessa Lyon Eric Baudson,Business Business Essentials,"Transform or disappear, the Darwinism of IT: I..."
1,3d-printing-applications,3D Printing Applications,Vishal Sachdev,Business Entrepreneurship,This course will help you understand how 3D pr...
2,3d-printing-revolution,The 3D Printing Revolution,Aric Rindfleisch,Business Entrepreneurship Materials Product De...,This course will demonstrate how 3D printers w...
3,3d-printing-software,3D Printing Software,Jeffrey Smith,Business Entrepreneurship,This course will demonstrate how to use 3D pri...
4,abdomen-anatomy,Anatomy of the Abdomen and Pelvis; a journey f...,Marco De Ruiter PhD Paul Gobée MD Beerend P. H...,Health Basic Science,In this anatomy course you will explore the or...
5,ableton-live,Introduction to Ableton Live,Erin Barra,Arts and Humanities Music and Art Audio Record...,"In the past, Digital Audio Workstations (DAWs)..."
6,aboriginal-education,Aboriginal Worldviews and Education,Jean-Paul Restoule,Social Sciences Education,Intended for both Aboriginal and non-Aborigina...
7,academic-discussion-english,Academic Discussions in English,Tamy Chapman Meg Parker Helen Nam Brad Gilpin,Language Learning Learning English,This is the third course in the Learn English:...
8,academic-literacy,Academic Literacy,Elena Bazanova,Language Learning Learning English,This is the first course of the English for Re...
9,academic-writing-capstone,Project: Writing a Research Paper,Tamy Chapman Helen Nam Brad Gilpin,Language Learning Learning English,Welcome to the capstone project for the Academ...


In [17]:
# Print a summary of the copied frame: columns, non-null counts and dtypes.
courses_eng_descr_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299 entries, 0 to 1298
Data columns (total 5 columns):
course_id       1299 non-null object
title           1299 non-null object
instructors     1299 non-null object
key_words       1299 non-null object
course_descr    1299 non-null object
dtypes: object(5)
memory usage: 50.8+ KB


In [18]:
# Save the courses' text description to a separate file. The row index is
# written too (to_csv's default, spelled out here), so the file begins with
# an unnamed index column.
courses_eng_descr_text.to_csv('courses_eng_descr_text.csv', index=True)