In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from nltk import word_tokenize
from sklearn.feature_extraction import text
import urllib.request, json 

In [2]:
from sklearn.cluster import KMeans, MiniBatchKMeans, MeanShift
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

In [3]:
from langdetect import detect

In [19]:
from scipy.spatial.distance import cdist
import matplotlib.pylab as plt
from sklearn import metrics
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

## Define helper functions, initialize vectorizer

In [5]:
def list_to_string(df):    
    return ' '.join(df['text'])

In [6]:
def extract_language(df):    
    df['language'] = detect(df['text_concat'])
    return df

In [7]:
def extract_json(df):    
    df['course_id'] = df.values[0]['course_id']
    df['title'] = df.values[0]['title']
    df['text_concat'] = df.values[0]['description'] + ' ' + df.values[0]['subtitle'] + ' ' + df.values[0]['what_you_will_learn']
    df['url'] = df.values[0]['course_about_uri']
    df['language'] = detect(df['text_concat'])
    return df

In [204]:
# define stop words and add them to the standard english list

additional_stop_words = [
    'university',
    'college',
    'partner',
    'closely',
    'readings',
    'reading',
    'requires',
    'require',
    'title',
    'provider',
    'section',
    'video',
    'videos',
    'problem',
    'problems',
    'learners',
    'define',
    'able',
    'exam',
    'duke',
    'week',
    'teach',
    'learn',
    'goes',
    'description',
    'school',
    'illinois',
    'urbana',
    'duke',
    'champaign',
    'mit',
    'stanford',
    'harvard',
    'look',
    'null',
    'li',
    'em',
    'ul',
    'edx',
    'harvardx',
    'span',
    'href',
    '_blank',
    'mitx',
    'likely',
    'use',
    'like',
    'northwestern',
    'mooc',
    'enroll',
    'enrolling',
    'course',
    'certificate',
    'verified',
    'california',
    'san diego',
    'michigan',
    'll',
    'xseries',
    'micromasters',
    'actual',
    'actually',
    'teacher',
    'teaches',
    'edu',
    'educators',
    'work',
    'effort',
    'font',
    'acca',
    'nbsp',
    'br',
    'using',
    'strong',
    'stuff',
    'quot',
    'skills',
    'new',
    'understand'
]

stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)

## edX
* * *

In [15]:
# with urllib.request.urlopen("https://www.edx.org/api/catalog/v2/courses/") as url:
#     edx_df = json.loads(url.read().decode())

edx_df = pd.read_json('../Documents/edx_courses.json')
edx_df = edx_df.apply(extract_json, axis=1)
edx_df = edx_df.set_index('title')

In [16]:
edx_df.head()

Unnamed: 0_level_0,items,course_id,text_concat,url,language
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Introduction to Marketing,"{'cid': 'v2_full_en:16781', 'current_language'...",course-v1:UBCx+Marketing1x+3T2015,<p>Marketing is a crucial function in all busi...,/course/introduction-marketing-ubcx-marketing1x,en
Introduction to Computer Science,"{'cid': 'v2_full_en:1022', 'current_language':...",course-v1:HarvardX+CS50+X,"<p>This is <strong>CS50x</strong>, Harvard Uni...",/course/introduction-computer-science-harvardx...,en
Basic Mandarin Chinese – Level 1,"{'cid': 'v2_full_en:19576', 'current_language'...",course-v1:MandarinX+MX101x+2T2016,<p>Introductory Mandarin is the first in a ser...,/course/basic-mandarin-chinese-level-1-mandari...,en
Accountant in Business,"{'cid': 'v2_full_en:33591', 'current_language'...",course-v1:ACCA+FAB-F1.x+3T2017,<p>Don’t have a business background but want t...,/course/accountant-business-acca-fab-f1-x-6,en
Android App Development for Beginners,"{'cid': 'v2_full_en:36566', 'current_language'...",course-v1:GalileoX+CAAD002X+2T2017,<p>This course is designed for students who ar...,/course/android-app-development-beginners-gali...,en


In [171]:
edx_df_english = edx_df[edx_df['language']=='en']
edx_df_english['provider'] = 'edx'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [172]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [206]:
tf_vectorizer = CountVectorizer(stop_words = stop_words, 
                             ngram_range = (1,3),
                                token_pattern='(?u)\\b[a-zA-z][a-zA-z]+\\b',
                            max_df=0.95, min_df=2)

tfidf_vectorizer = TfidfVectorizer(stop_words = stop_words, 
                             ngram_range = (1,3),
                                   token_pattern='(?u)\\b[a-zA-z][a-zA-z]+\\b',
                            max_df=0.95, min_df=2)

In [207]:
tf = tf_vectorizer.fit_transform(edx_df_english.text_concat)
tfidf = tfidf_vectorizer.fit_transform(edx_df_english.text_concat)

In [208]:

lda = LatentDirichletAllocation(n_topics=50)
lda.fit(tf)

tfidf_vectorizer._validate_vocabulary()
tf_feature_names = tf_vectorizer.get_feature_names()




In [201]:
print_top_words(lda, tf_feature_names, 10)

Topic #0:
web, android, microsoft, application, development, app, mobile, server, azure, windows
Topic #1:
writers, novel, patient, writing, thinking, cancer, disease, creative, art, prevention
Topic #2:
energy, solar, cells, technologies, power, systems, photovoltaic, engineering, solar energy, design
Topic #3:
physics, quantum, mechanics, org, atomic, amp, optical, www, www org, https
Topic #4:
urinary, respiratory, answer questions data, available statistical software, techniques answer, statistical techniques answer, questions data, data widely available, available statistical, basic statistical techniques
Topic #5:
newtonian, newtonian mechanics, assigned, orbits, planetary, unified overview, planetary orbits, multi concept, newtonian mechanics year, year introductory
Topic #6:
business, water, license, design, delft, model, cc, sa, commons, creative
Topic #7:
organic, semiconductors, materials, chemistry, devices, electronic, properties, structure, transport, models
Topic #8:
soc

In [202]:
nmf = NMF(n_components=20, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)


print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 10)


Topics in NMF model:
Topic #0:
design, business, social, development, learning, world, systems, technology, global, research
Topic #1:
link, contact, nofollow target follow, target follow link, target follow, class external link, external link, class external, follow link, nofollow target
Topic #2:
data abstraction, java, abstraction, programming, data, formal solutions, oriented framework, object oriented framework, object oriented, integrated development
Topic #3:
ap, ap physics, physics, prepare ap, ap biology, prepare, biology, advanced placement, placement, placement ap
Topic #4:
data sheets, sheets, sans, text, read discuss write, vocabulary language support, vocabulary language, understanding text, discuss write text, write text
Topic #5:
data, big data, big, analytics, analysis, data analysis, data science, statistical, machine learning, machine
Topic #6:
project, project management, management, projects, organizational, project manager, manager, capstone, closing, initiation


## trying pyldavis

In [198]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [212]:
pyLDAvis.sklearn.prepare(lda, tf ,tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
