## 01 - Preparing Data for Classification

In [None]:
# Load the course descriptions: every line of the text file is one description
with open("Course-Descriptions.txt", 'r') as fh:
    raw_descriptions = fh.read()
descriptions = raw_descriptions.splitlines()

# Quick sanity check: how many descriptions were read, plus the first two
print("Number of descriptions :", len(descriptions))
print("\nSample course descriptions :", descriptions[:2])

Number of descriptions : 20

Sample course descriptions : ['In this practical, hands-on course, learn how to do data preparation, data munging, data visualization, and predictive analytics. ', 'PHP is the most popular server-side language used to build dynamic websites, and though it is not especially difficult to use, nonprogrammers often find it intimidating. ']


In [None]:
# Libraries used for text preparation and TF-IDF vectorization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data used below: the stopword lists and the WordNet
# database that backs the lemmatizer
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared lemmatizer instance used by the custom tokenizer
lemmatizer = WordNetLemmatizer()

# Custom tokenizer that will perform tokenization, stopword removal and lemmatization
def customtokenize(str):
    """Tokenize text, drop English stopwords and lemmatize what remains.

    Used as the ``tokenizer`` callable handed to ``TfidfVectorizer``.

    Args:
        str: Text of a single document. The name shadows the builtin ``str``
            inside this function; it is kept unchanged so existing callers
            that pass it by keyword keep working.

    Returns:
        list: Lemmatized tokens in their original order. Only stopwords are
        removed, so punctuation tokens emitted by the tokenizer are kept.
    """
    tokens = nltk.word_tokenize(str)
    # Fetch the stopword list once per call and hold it in a set. Previously
    # stopwords.words('english') was re-evaluated for every token and then
    # scanned linearly for the membership test.
    stop_set = set(stopwords.words('english'))
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_set]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Tokenizer data needed by nltk.word_tokenize, which customtokenize calls
nltk.download('punkt_tab')

# Build the TF-IDF matrix from the descriptions using the custom tokenizer
vectorizer = TfidfVectorizer(tokenizer=customtokenize)
tfidf = vectorizer.fit_transform(descriptions)

# Peek at the first 25 vocabulary entries and the matrix dimensions
feature_names = vectorizer.get_feature_names_out()
print("\nSample feature names identified : ", feature_names[:25])
print("\nSize of TFIDF matrix : ", tfidf.shape)


Sample feature names identified :  ["'ll" "'re" "'s" '(' ')' ',' '.' '?' 'actively' 'adopting' 'amazon'
 'analysis' 'analytics' 'application' 'applied' 'architect' 'architecture'
 'around' 'aspect' 'associate' 'aws' 'basic' 'become' 'begin' 'big']

Size of TFIDF matrix :  (20, 238)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 02 - Building the model

In [None]:
# Read the pre-assigned class label of each course, one label per line
with open("Course-Classification.txt", 'r') as fh:
    raw_classifications = fh.read()
classifications = raw_classifications.splitlines()

In [None]:
# Fit a label encoder so each distinct class name is assigned an integer id
from sklearn import preprocessing

le = preprocessing.LabelEncoder().fit(classifications)
print("Classes found : ", le.classes_)

Classes found :  ['Cloud-Computing' 'Data-Science' 'Programming']


In [None]:
# Convert classes to integers for use with ML, using the encoder fitted on
# the same labels; both forms are printed so the mapping can be checked by eye
int_classes = le.transform(classifications)
print("\nClasses converted to integers :", int_classes)
print("\nActual classes :", classifications)


Classes converted to integers : [1 2 2 0 1 2 1 2 0 1 1 2 2 0 2 0 0 0 2 2]

Actual classes : ['Data-Science', 'Programming', 'Programming', 'Cloud-Computing', 'Data-Science', 'Programming', 'Data-Science', 'Programming', 'Cloud-Computing', 'Data-Science', 'Data-Science', 'Programming', 'Programming', 'Cloud-Computing', 'Programming', 'Cloud-Computing', 'Cloud-Computing', 'Cloud-Computing', 'Programming', 'Programming']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out part of the corpus for testing; the fixed random_state keeps the
# split identical from run to run
xtrain, xtest, ytrain, ytest = train_test_split(
    tfidf,
    int_classes,
    random_state=0,
)

# Train a multinomial Naive Bayes classifier on the training portion
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)

## 03 - Running Predictions

In [None]:
# Display the shapes of the full TF-IDF matrix, the train/test feature
# matrices and the training labels
tfidf.shape, xtrain.shape, xtest.shape, ytrain.shape

((20, 238), (15, 238), (5, 238), (15,))

In [None]:
from sklearn import metrics


def _show_scores(expected, predicted):
    """Print the confusion matrix and the accuracy of predicted vs. expected labels."""
    print("Confusion Matrix : ")
    print(metrics.confusion_matrix(expected, predicted))
    print("\nPrediction Accuracy : ", metrics.accuracy_score(expected, predicted))


# Score the classifier on the held-out test split
print("Testing with Test Data :\n------------------------")
predictions = classifier.predict(xtest)
_show_scores(ytest, predictions)

# Score it again on every document; note this includes the training documents
print("\nTesting with Full Corpus :\n--------------------------")
predictions = classifier.predict(tfidf)
_show_scores(int_classes, predictions)


Testing with Test Data :
------------------------
Confusion Matrix : 
[[1 0 0]
 [0 0 1]
 [1 0 2]]

Prediction Accuracy :  0.6

Testing with Full Corpus :
--------------------------
Confusion Matrix : 
[[6 0 0]
 [0 4 1]
 [1 0 8]]

Prediction Accuracy :  0.9
