In [10]:
# Importing required modules
import os

import keras
import pandas as pd
import tensorflow as tf

from model import Model
from preprocessing import PreProcessor

# Load Data

Dataset from: https://www.kaggle.com/code/anthonysusevski/course-classifier-tf/data

In [2]:
# Read the course-review CSV downloaded from Kaggle.
# The location defaults to the path used when this notebook was written; set
# the COURSE_DATA_CSV environment variable to load the file from elsewhere
# without editing this cell.
DATA_CSV = os.environ.get(
    'COURSE_DATA_CSV',
    '/home/std/Documents/repositories/analysis_wiki/course_data_clean.csv',
)
df = pd.read_csv(DATA_CSV)

# Preprocessing

In [3]:
# Keep just the review text and its rating, discard rows where either value
# is missing, then store the rating as an integer column (0 or 1 according to
# the original note).
df = (
    df[['reviews', 'course_rating_int']]
    .dropna()
    .astype({'course_rating_int': int})
)

In [4]:
# Choosing the preprocessing steps to apply, in the order they will run.
# Entries that are commented out are disabled for this run.
steps = [
    'truncate_text',
    'convert_to_lower_case',
    # 'remove_numbers',
    'remove_non_word_characters',
    'remove_urls',
    # 'correct_spell',
    'lemmatization',
    'remove_stopwords',
]


In [5]:
# Run the selected steps over the raw review text and store the result in a
# new 'processed_reviews' column alongside the original reviews.
pre_processor = PreProcessor(df['reviews'])
processed_reviews = pre_processor.run_pipeline(steps)
df['processed_reviews'] = processed_reviews

Running truncate_text
Running convert_to_lower_case
Running remove_non_word_characters
Running remove_urls
Running lemmatization


100%|████████████████████████████████████| 14608/14608 [01:03<00:00, 231.66it/s]


Running remove_stopwords


100%|████████████████████████████████████| 14608/14608 [01:02<00:00, 235.40it/s]


# Model Training

In [7]:
from parameters import params

In [11]:
# Wrap the preprocessed frame in the project's Model helper. Going by the
# keyword names, x is the input column and y the target column - confirm in
# model.py.
md = Model(
    df,
    x='processed_reviews',
    y='course_rating_int',
)

In [12]:
# Tokenize the text held by `md`. Presumably this is also what sets
# `md.max_length`, which the Embedding layer reads when the network is
# built - TODO confirm in model.py.
md.tokenization()

In [19]:
# Bidirectional-LSTM classifier over the tokenized reviews.
# The container and every layer come from tf.keras so they share a single
# Keras implementation; combining the standalone `keras` package with
# `tf.keras` layers fails when the two installations differ.
EMBEDDING_DIM = 40  # length of each learned token vector
LSTM_UNITS = 256    # per direction; the Bidirectional wrapper concatenates both

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        params['model']['max_tokens'],  # vocabulary size (input_dim)
        EMBEDDING_DIM,
        input_length=md.max_length,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LSTM_UNITS)),
    tf.keras.layers.Dense(400, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # NOTE(review): two sigmoid units produce independent scores that do not
    # sum to 1. If Model.train uses a categorical cross-entropy loss, softmax
    # is the usual choice here - confirm against the loss before changing.
    tf.keras.layers.Dense(2, activation='sigmoid'),
])

In [20]:
# Train the network through the project's Model helper, passing `params`
# along (presumably the training settings - see parameters.py). The cell
# output shows the model summary followed by the Keras History object that
# the call evaluates to.
md.train(model, params)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 17, 40)            20000     
                                                                 
 bidirectional_1 (Bidirectio  (None, 512)              608256    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 400)               205200    
                                                                 
 dropout_2 (Dropout)         (None, 400)               0         
                                                                 
 dense_4 (Dense)             (None, 200)               80200     
                                                                 
 dropout_3 (Dropout)         (None, 200)               0         
                                                      

<keras.callbacks.History at 0x7f818831be50>

# To do

- Class to tune hyperparameters
    - Make it easier to change them and compare the results
- Add functionality to the class to load a saved model and continue training it
- Add a class or functions that use the model to make predictions
- Create an API that serves the predictions as a service