In [1]:
# Import python libraries
import numpy  as np
import pandas as pd

# Load dataset to pandas dataframe; the path is relative, so "Coursera.csv"
# is looked up in the current working directory
df = pd.read_csv("Coursera.csv")

In [2]:
# Select the key columns that'll be used while building the model.
# Rows with any missing value are dropped, and an explicit copy is taken so
# that later column assignments modify `courses` itself rather than a slice
# of `df` (this avoids pandas' SettingWithCopyWarning).
courses = df[[
    'Course Name',
    'University',
    'Difficulty Level',
    'Course Rating',
    'Course URL',
    'Course Description',
    'Skills',
]].dropna().copy()

In [3]:
# Function to transform the data to a standard format
def convert(obj):
    """Split a cell value into a list of whitespace-separated tokens.

    Every '|' character is treated as a separator: it is replaced by a
    space before the text is split on whitespace.

    Parameters
    ----------
    obj : str or scalar
        The cell value. Values that are not already strings (for example a
        numeric rating) are converted with ``str()`` first instead of
        raising ``TypeError``.

    Returns
    -------
    list of str
        The tokens in their original order; an empty list when the value
        contains no tokens.
    """
    text = obj if isinstance(obj, str) else str(obj)
    return text.replace('|', ' ').split()

# Tokenise every selected column with convert(), one column at a time
for column_name in ('Course Name', 'University', 'Difficulty Level',
                    'Course Rating', 'Course URL', 'Course Description',
                    'Skills'):
    courses[column_name] = courses[column_name].apply(convert)

# Stem the data to reduce words to their base form
import nltk
from nltk.stem.porter import PorterStemmer
# Single stemmer instance; stem() below reads it as a module-level global
ps = PorterStemmer()

def stem(text):
    """Return `text` with each whitespace-separated word stemmed by `ps`.

    The stemmed words are re-joined with single spaces.
    """
    stemmed_words = [ps.stem(word) for word in text.split()]
    return " ".join(stemmed_words)

In [4]:
# Create a new column that combines all characteristics of the courses.
# Each of these columns holds a list of tokens at this point, so `+`
# concatenates the lists row by row.
courses['tags'] = courses['Course Name'] + courses['University'] + courses['Difficulty Level'] + courses['Course Rating'] + courses['Course Description'] + courses['Skills']

# Create a new dataframe consisting of Course Name, University and tags.
# .copy() makes new_df independent of `courses`, so the assignment below
# does not trigger pandas' SettingWithCopyWarning.
new_df = courses[['Course Name','University','tags']].copy()

# Join each token list into a single string, then apply the stem function
# to every word of it
new_df['tags'] = new_df['tags'].apply(lambda tokens: stem(" ".join(tokens)))

In [5]:
# Vectorise the tags text: English stop words are disregarded and the
# vectorizer is limited to max_features=10000
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=10000)

# fit_transform yields a SciPy sparse matrix; convert it to a NumPy array
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()

In [6]:
# Calculate the cosine similarity between the vectors.
# Only one argument is passed, so the rows of `vectors` are compared with
# each other; each row was built from one entry of new_df['tags'].
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [7]:
# Take the pickle dump of the results for later use
import pickle

# Context managers guarantee each file is flushed and closed, even if
# pickling raises part-way through (the bare open() calls were never closed).
with open('courses.pkl', 'wb') as courses_file:
    pickle.dump(new_df, courses_file)
with open('course_dict.pkl', 'wb') as course_dict_file:
    pickle.dump(new_df.to_dict(), course_dict_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)