In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, f1_score, auc as sk_auc, roc_curve, precision_score, recall_score
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, StratifiedKFold
import pickle
import random
import re
import keras
import os
import mlflow
from mlflow import log_metric, log_param, log_artifact
import mlflow.sklearn

# Run configuration shared by the cells below.
seed = 42            # random_state handed to train_test_split
USE_IDF = True       # forwarded to TfidfVectorizer(use_idf=...)
NGRAM_RANGE = (1,1)  # forwarded to TfidfVectorizer(ngram_range=...)
C = 4                # forwarded to LogisticRegression(C=...)

# Seed the global RNGs too, so calls that are not given an explicit
# random_state repeat their results on Restart & Run All.
random.seed(seed)
np.random.seed(seed)

Using TensorFlow backend.


In [2]:
# Load the course descriptions, keeping CSV columns 1 through 6 only
# (column 0 is not read).
df = pd.read_csv(
    "data/course_descriptions.csv",
    usecols=list(range(1, 7)),
)

In [3]:
# Preview ten random rows; random_state keeps the preview identical between runs.
df.sample(10, random_state=seed)

Unnamed: 0,description,length,course,ouid,fac,inst
2524,Modelling with solid elements \r\r\nModelling ...,306,TEK2001,1220,IV,IV-IVB
1751,"For further information, see www.ntnu.edu/eit ...",85,BI2097,865,NV,NV-IBI
317,What is design?\r\r\nDesign in the 19th centur...,1726,IMT2342,840,AD,AD-ID
1060,"The course is given every other year, next tim...",3037,FY8203,867,NV,NV-IFY
3684,The course expands on and enhances the analysi...,940,MA1102,828,IE,IE-IMF
1962,Systems of linear equations:\r\r\n- Mtrices\r\...,1365,REA2071F,828,IE,IE-IMF
2643,Optimizing algorithms for both single and mult...,868,TDT4200,827,IE,IE-IDI
2937,- Overview of legal sources and legal methodol...,2597,AJ200115,1138,OK,OK-IIF
1807,The course will cover literary representations...,1172,NORD2312,1080,HF,HF-ISL
3108,This course combines artistic practice and cri...,1508,BK3181,813,AD,AD-KIT


In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [5]:
# Show the full record of a single course as a raw array.
df[df["course"] == "IT3708"].values

array([['The main focus of the course is to build intelligent systems based on two key natural concepts: evolution by natural selection and swarm intelligence.  Such intelligent systems have thousands of useful applications in fields as diverse as control theory, telecommunications, music and art.  This course discusses both methods in great detail along with providing a bit of the biological basis for each.Lecture slides, a textbook (possibly 2).  Textbooks are chosen  at the beginning of the semester.Students will get both theoretical and practical programming experience with two of the best known sub-symbolic AI methods: evolutionary algorithms and swarm intelligence algorithms. ',
        682, 'IT3708', 827, 'IE', 'IE-IDI']], dtype=object)

In [6]:
# Number of course descriptions per faculty code.
df["fac"].value_counts()

IV    913
HF    634
IE    616
SU    504
NV    423
OK    384
MH    296
AD    213
Name: fac, dtype: int64

In [7]:
# Built once: maps every ASCII punctuation mark and the \n, \r, \t characters
# to a single space, instead of rebuilding the lookup string per character.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: " " for char in string.punctuation + "\n\r\t"}
)


def remove_punctuation(document):
    """Return ``document`` with punctuation and line/tab characters blanked out.

    Each character found in ``string.punctuation`` or in ``"\\n\\r\\t"`` is
    replaced by one space, so the result has the same length as the input.
    All other characters are kept unchanged.

    Parameters
    ----------
    document : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return document.translate(_PUNCTUATION_TO_SPACE)

def tokenize(document):
    """Split ``document`` into lower-cased tokens.

    The text is first passed through ``remove_punctuation`` and then split on
    single spaces; empty fragments produced by consecutive spaces are dropped.
    """
    cleaned = remove_punctuation(document)
    tokens = []
    for fragment in cleaned.split(" "):
        if fragment:
            tokens.append(fragment.lower())
    return tokens

In [8]:
# Read the stop words, stripping surrounding whitespace from each line.
# The context manager closes the file, which a bare open(...) call does not.
with open("stopwords.txt", "r") as stopword_file:
    stoplist = [line.strip() for line in stopword_file]

In [9]:
# Display the loaded stop words as a sanity check.
stoplist

['a',
 'able',
 'about',
 'across',
 'after',
 'all',
 'almost',
 'also',
 'am',
 'among',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'but',
 'by',
 'can',
 'cannot',
 'could',
 'dear',
 'did',
 'do',
 'does',
 'either',
 'else',
 'ever',
 'every',
 'for',
 'from',
 'get',
 'got',
 'had',
 'has',
 'have',
 'he',
 'her',
 'hers',
 'him',
 'his',
 'how',
 'however',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 'its',
 'just',
 'least',
 'let',
 'like',
 'likely',
 'may',
 'me',
 'might',
 'most',
 'must',
 'my',
 'neither',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'often',
 'on',
 'only',
 'or',
 'other',
 'our',
 'own',
 'rather',
 'said',
 'say',
 'says',
 'she',
 'should',
 'since',
 'so',
 'some',
 'than',
 'that',
 'the',
 'their',
 'them',
 'then',
 'there',
 'these',
 'they',
 'this',
 'tis',
 'to',
 'too',
 'twas',
 'us',
 'wants',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'would

In [10]:
# Target: the faculty code as a string. The split is stratified on it so both
# parts keep the same class proportions, and seeded so it is repeatable.
y = df["fac"].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    df["description"],
    y,
    stratify=y,
    random_state=seed,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((2987,), (2987,), (996,), (996,))

In [11]:
# Class distribution of the held-out test labels.
y_test.value_counts()

IV    228
HF    159
IE    154
SU    126
NV    106
OK     96
MH     74
AD     53
Name: fac, dtype: int64

In [12]:
# Class distribution of the training labels.
y_train.value_counts()

IV    685
HF    475
IE    462
SU    378
NV    317
OK    288
MH    222
AD    160
Name: fac, dtype: int64

In [13]:
# The vocabulary (and IDF weights, when enabled) are learned from the training
# descriptions only; the test descriptions are projected onto that vocabulary.
vec = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words=stoplist,
    use_idf=USE_IDF,
    ngram_range=NGRAM_RANGE,
)
trn_vec = vec.fit_transform(X_train.values)
test_vec = vec.transform(X_test.values)

In [14]:
# Put one indicator column per faculty next to the description text.
# The text is re-selected from `df` through the split's own index rather than
# taken from X_train / X_test, so re-running this cell rebuilds the same
# 9-column frames instead of appending another set of indicator columns.
X_train = pd.concat(
    [df.loc[y_train.index, "description"], pd.get_dummies(y_train)],
    axis=1,
)
X_test = pd.concat(
    [df.loc[y_test.index, "description"], pd.get_dummies(y_test)],
    axis=1,
)
X_train.shape, X_test.shape

((2987, 9), (996, 9))

In [15]:
# Preview five random training rows; random_state keeps the preview identical between runs.
X_train.sample(5, random_state=seed)

Unnamed: 0,description,AD,HF,IE,IV,MH,NV,OK,SU
2830,NEVR2010 provides a thorough introduction to c...,0,0,0,0,1,0,0,0
529,Hypothesis testing. Simple and multiple linear...,0,0,1,0,0,0,0,0
787,Specialization is offered in a variety of topi...,0,0,0,1,0,0,0,0
564,The subject gives a broad introduction to the ...,0,1,0,0,0,0,0,0
1655,Didactics is the theoretical and practical kno...,0,0,0,0,0,0,0,1


In [16]:
# Faculty codes in order of first appearance in `df`; this order defines the
# column order of `preds`.
faculty_codes = df["fac"].astype(str)
label_cols = faculty_codes.unique().tolist()
print(label_cols)

# One row per test document, one column per label, filled in later.
n_test_rows = len(X_test)
preds = np.zeros((n_test_rows, len(label_cols)))
preds.shape

['IE', 'HF', 'IV', 'AD', 'SU', 'MH', 'NV', 'OK']


(996, 8)

In [17]:
# Shapes of the training features, the test features and the test frame.
tuple(matrix.shape for matrix in (trn_vec, test_vec, X_test))

((2987, 21404), (996, 21404), (996, 9))

In [18]:
def prior(y_i, y, features=None):
    """Smoothed per-feature mean over the rows whose label equals ``y_i``.

    Despite its name this is not a class prior. It sums every feature column
    over the selected rows and divides by the number of selected rows, adding
    one to both the sums and the count as smoothing.

    Parameters
    ----------
    y_i : label value to select (``get_mdl`` passes 1 and 0).
    y : numpy array of labels, one entry per row of ``features``.
    features : matrix indexable by a boolean row mask. Defaults to the
        module-level ``trn_vec`` so existing two-argument calls are unchanged.

    Returns
    -------
    ``(column_sums + 1) / (n_selected + 1)``, one value per feature column.
    """
    if features is None:
        features = trn_vec
    selected = (y == y_i)
    column_sums = features[selected].sum(0)  # per-feature sum over selected rows
    return (column_sums + 1) / (selected.sum() + 1)

In [19]:
def get_mdl(y):
    """Fit a one-vs-rest logistic regression on ratio-scaled training features.

    Parameters
    ----------
    y : pandas Series of 0/1 indicators for one label, aligned with the rows
        of ``trn_vec``.

    Returns
    -------
    (model, r) : the fitted ``LogisticRegression`` and the log-ratio ``r`` that
        the features were multiplied by. The same ``r`` has to be applied to
        any matrix passed to the model later.
    """
    labels = y.values
    # Log of the ratio between the smoothed feature statistics of the
    # positive (1) and negative (0) rows.
    r = np.log(prior(1, labels) / prior(0, labels))
    scaled_features = trn_vec.multiply(r)
    classifier = LogisticRegression(C=C, solver="liblinear")
    return classifier.fit(scaled_features, labels), r

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
# Baseline: multinomial naive Bayes with its default smoothing.
clf = MultinomialNB(alpha=1.0)

In [22]:
# Train the baseline on the TF-IDF training matrix and the faculty labels.
clf.fit(X=trn_vec, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Baseline predictions for the test documents.
p = clf.predict(X=test_vec)

In [24]:
# Number of test documents the baseline labels correctly.
(y_test == p).sum()

627

In [30]:
# Baseline confusion matrix; rows and columns follow the order of label_cols.
confusion_matrix(
    y_true=y_test,
    y_pred=p,
    labels=label_cols,
)

array([[ 83,   1,  70,   0,   0,   0,   0,   0],
       [  2, 144,  10,   0,   3,   0,   0,   0],
       [  2,   0, 225,   0,   0,   0,   1,   0],
       [  2,   7,  42,   1,   1,   0,   0,   0],
       [  0,  11,  21,   0,  94,   0,   0,   0],
       [  1,   1,  34,   0,  10,  27,   1,   0],
       [  1,   1,  76,   0,   0,   0,  28,   0],
       [  1,   4,  64,   0,   2,   0,   0,  25]])

In [31]:
# Label order used for the rows and columns of the confusion matrix.
label_cols

['IE', 'HF', 'IV', 'AD', 'SU', 'MH', 'NV', 'OK']

In [32]:
# Test-set class sizes, for comparison with the confusion-matrix rows.
y_test.value_counts()

IV    228
HF    159
IE    154
SU    126
NV    106
OK     96
MH     74
AD     53
Name: fac, dtype: int64

In [38]:
# TRAIN: fill `preds` with test-set probabilities in the training loop.
# SAVE:  only consulted when TRAIN is False; writes each model and its ratio
#        vector under webapp/.
TRAIN, SAVE = True, False

In [39]:
# One binary model per faculty label; `models` and `rs` keep the fitted
# classifier and its log-ratio vector under the label name.
models = {}
rs = {}
for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(X_train[j])
    if TRAIN:
        # Probability of the positive class, with the test features scaled by
        # the same ratio that was used at fit time.
        preds[:, i] = m.predict_proba(test_vec.multiply(r))[:, 1]
    elif SAVE:
        # Export for the web app. The context managers close both files,
        # which passing a bare open(...) to np.save / pickle.dump did not.
        with open("webapp/r_"+j+".npy", "wb") as ratio_file:
            np.save(ratio_file, r)
        with open("webapp/"+j+"_model.pkl", "wb") as model_file:
            pickle.dump(m, model_file)
    models[j] = m
    rs[j] = r

fit IE
fit HF
fit IV
fit AD
fit SU
fit MH
fit NV
fit OK


In [40]:
df_preds = pd.DataFrame(preds, columns=label_cols)

# Row-wise best probability, computed once from the probability columns only.
# Taking the max inside the loop would also scan the 0/1 "*_predicted" columns
# added by earlier iterations, which is only right while every probability < 1.
best_score = df_preds[label_cols].max(axis=1)
for label in label_cols:
    df_preds[label + "_predicted"] = (df_preds[label] == best_score).astype(int)

# One-hot encoding of the true labels, with columns renamed to "<label>_actual".
oh_y_test = pd.get_dummies(y_test).add_suffix("_actual")

# reset_index() lines the rows up positionally with `preds`; the original
# index is kept as an extra "index" column.
df_preds = pd.concat([df_preds, oh_y_test.reset_index()], axis=1)

# Decode both one-hot groups into positions within label_cols. On a tie
# argmax picks the first label in label_cols order.
actual_idx = df_preds[[label + "_actual" for label in label_cols]].values.argmax(1)
predicted_idx = df_preds[[label + "_predicted" for label in label_cols]].values.argmax(1)

# Passing `labels` keeps the matrix at len(label_cols) x len(label_cols) even
# if some class never occurs among the actual or predicted values.
res_df = pd.DataFrame(
    confusion_matrix(actual_idx, predicted_idx, labels=list(range(len(label_cols)))),
    index=label_cols,
    columns=label_cols,
)

In [41]:
# Confusion matrix of the per-label models: rows are actual labels, columns are predicted labels.
res_df

Unnamed: 0,IE,HF,IV,AD,SU,MH,NV,OK
IE,134,2,12,1,0,0,3,2
HF,1,151,1,0,4,0,0,2
IV,12,0,203,3,0,0,5,5
AD,8,1,5,35,3,0,0,1
SU,4,2,4,0,113,0,1,2
MH,2,0,4,0,5,60,3,0
NV,7,0,6,0,0,5,88,0
OK,4,4,5,0,7,0,0,76


In [42]:
# Micro-averaged precision, computed on the argmax-decoded actual and
# predicted label positions.
precision_score(
    df_preds[[c + "_actual" for c in label_cols]].values.argmax(axis=1),
    df_preds[[c + "_predicted" for c in label_cols]].values.argmax(axis=1),
    average="micro",
)

0.8634538152610441

In [43]:
# Macro-averaged precision (unweighted mean over the labels), computed on the
# argmax-decoded actual and predicted label positions.
precision_score(
    df_preds[[c + "_actual" for c in label_cols]].values.argmax(axis=1),
    df_preds[[c + "_predicted" for c in label_cols]].values.argmax(axis=1),
    average="macro",
)

0.8736078613731231

In [27]:
# Precision for each label separately, in label_cols order.
precision_score(
    df_preds[[c + "_actual" for c in label_cols]].values.argmax(axis=1),
    df_preds[[c + "_predicted" for c in label_cols]].values.argmax(axis=1),
    average=None,
)

array([0.77906977, 0.94375   , 0.84583333, 0.8974359 , 0.85606061,
       0.92307692, 0.88      , 0.86363636])

## Next steps
- Define metrics
- Plot
- Script evolution
- Think about steps

In [3]:


# Record a single parameter as a key-value pair.
log_param("param1", 5)

# A metric can be logged repeatedly within a run; "foo" is logged as 1, 2, then 3.
for foo_value in (1, 2, 3):
    log_metric("foo", foo_value)

# Write a small output file and attach it to the run as an artifact.
with open("output.txt", "w") as artifact_file:
    artifact_file.write("Hello world!")
log_artifact("output.txt")
