Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also make sure this jupyter notebook file is opened using the following command:

```jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000```

<h1>Import all libraries and reading explored data into Dataframe</h1>

In [6]:
import re, io, gensim, datetime, time, nltk
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for data pre-processing (Log Loss)
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

string = 'true:'
goldtruth = [string+str(i) for i in range(0, 58)]

string = 'pred:'
prediction = [string+str(i) for i in range(0, 58)]


def printModelAccuracy(y_test, y_pred):
    # Find the confusion matrix of the result
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=list(range(0, 58))), \
        index=goldtruth, 
        columns=prediction)
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)
    """
    # Log loss
    score = log_loss(y_test, y_pred)
    print("Log Loss:", score)
    """

<h1>Preprocessing</h1>

In [2]:
# Takes around 7 minutes
all_df = pd.read_csv("train2.csv",header = 0)
corpus = all_df["title"]
labels = all_df["Category"]

start = time.time()
stop_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
def corpus2docs(corpus):
    # corpus is a object returned by load_corpus that represents a corpus.
    docs1 = []

    for title in corpus:
        doc = nltk.word_tokenize(title)
        docs1.append(doc)
    docs2 = [[w.lower() for w in doc] for doc in docs1] #lower case the words
    docs3 = [[w for w in doc if re.search('^[a-z]+$', w)] for doc in docs2] #removing special characters and numbers
    docs4 = [[w for w in doc if w not in stop_list] for doc in docs3] #removing words in stop list
    #changing list into a string
    docs5 = [' '.join([stemmer.stem(w) for w in doc]) for doc in docs4] #changing the words into its root form
    
    return docs5

docs = corpus2docs(corpus)
end = time. time()
print(end - start)
print(docs[0:10])

26.111504316329956
['cuci gudang balm voyag face palett', 'bayar di tempat pewarna ali gel peel tahan lama air alami krim alat makeup', 'puruan pesan ekarang origin balm nude tude palett murah kekinian grosir import', 'grosir cream spl paket normal garansi ori', 'best produk focallur color eyeshadow palett grati ongkir', 'balm hand pallet', 'dijual balm hand limit', 'palet morph warna make colour', 'new product white jade night cream free ongkir', 'terjamin cosrx bha blackhead power liquid']


<h1>Vectorise and TFIDF words</h1>

In [3]:
# Takes around 2 minutes
start = time.time()

def convertToDataframe(listofwords, labels):
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    tablecolumns = []                      
    tablecolumns.append(vectorizer.get_feature_names()) #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    df['Category'] = labels
                      
    return df


df = convertToDataframe(docs, labels.values.tolist())
print(df.head(10))

end = time. time()
print(end - start)

docs = "" #clear memory

    aa  aaa  aai   ab ababa abadi abang abaya  abb abbil   ...     zuk zuki  \
0 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
1 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
2 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
3 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
4 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
5 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
6 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
7 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
8 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   
9 0.00 0.00 0.00 0.00  0.00  0.00  0.00  0.00 0.00  0.00   ...    0.00 0.00   

  zulfa zurich   zv  zvr  zvv zyrex  zze Category  
0  0.00   0.00 0.00 0.00 0.00  0.00 0.00        0  
1  0.00   0.00 0.00 0.00 0

<h1>Train-Test Split</h1>

In [4]:
X = df.loc[:, df.columns != 'Category'] #take everything except Category

y = df[['Category']] #our label is Category

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

df = "" #clear memory

<h1>Baseline Classifier (Decision Tree)</h1>

In [7]:
parameters = {
    'max_depth' : list(range(1, 5))
}

decisionTree = GridSearchCV(DecisionTreeClassifier(), cv=3, param_grid=parameters, scoring='f1_macro')
#Fit the training feature Xs and training label Ys
decisionTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = decisionTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:",decisionTree.best_params_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0      202       0       0       0       0       0       0       0   
true:1        0     171       0       0       0       0       0       0   
true:2        0       0     200       0       0       0       0       0   
true:3        0       0       0     185       0       0       0       0   
true:4        0       0       0       0       0     209       0       0   
true:5        0       0       0       0       0     206       0       0   
true:6        0       0       0       0       0       0     189       0   
true:7        0       0       0       0       0       0       0       0   
true:8        0       0       0       0       0       0       0       0   
true:9        0       0       0       0       0       0       0       0   
true:10       0       0       0       0       0       0       0       0   
true:11       0       0       0       0       0       0       0       0   
true:12

  'precision', 'predicted', average, warn_for)


<h1>AdaBoost (with Decision Tree)</h1>

Refer to the following links on for detail explanation on the implementation:
- [AdaBoost Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)
- [DataCamp Implementation](https://www.datacamp.com/community/tutorials/adaboost-classifier-python)
- [Setting Learning Rate and N Estimators](https://stats.stackexchange.com/questions/82323/shrinkage-parameter-in-adaboost)

*Note that the default AdaBoost implementation in SKLearn is Decision Tree 


In [8]:
#Create the AdaBoost classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
adaboostTree = AdaBoostClassifier(n_estimators=50,learning_rate=1)

#Fit the training feature Xs and training label Ys
adaboostTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = adaboostTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0      202       0       0       0       0       0       0       0   
true:1        0     171       0       0       0       0       0       0   
true:2        0       0       0     200       0       0       0       0   
true:3        0       0       0     185       0       0       0       0   
true:4        0       0       0     209       0       0       0       0   
true:5        0       0       0     206       0       0       0       0   
true:6        0       0       0       0       0       0     189       0   
true:7        0       0       0       0       0       0     206       0   
true:8        0       0       0       0       0       0       0       0   
true:9        0       0       0       0       0       0       0       0   
true:10       0       0       0       0       0       0       0       0   
true:11       0       0       0       0       0       0       0       0   
true:12

  'precision', 'predicted', average, warn_for)


<h1>AdaBoost (with Naive Bayes)</h1>

Refer to the following links on for detail explanation on the implementation:
- [Gaussian Naive Bayes Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)
- [Naive Bayes Classifier video](https://www.youtube.com/watch?v=CPqOCI0ahss)

In [9]:
naivebayes = MultinomialNB()
#Fit the training feature Xs and training label Ys
naivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = naivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0      119       8       2       6      30       1      15       5   
true:1        0      63       2       5       9      45       7      19   
true:2        0       0     104       2       0       2       6       7   
true:3        0       0       0      80       0      16       1       7   
true:4        0       0       0       0       4       8       1       4   
true:5        0       0       0       0       0      68       0       4   
true:6        0       0       0       0       0       0      20       5   
true:7        0       0       0       0       0       0       1      71   
true:8        0       0       0       0       0       0       0       1   
true:9        0       0       0       0       0       0       0       2   
true:10       0       0       0       0       0       0       0       0   
true:11       0       0       0       0       0       0       0       0   
true:12

  'precision', 'predicted', average, warn_for)


In [10]:
nb = MultinomialNB()

adaboostnaivebayes = AdaBoostClassifier(n_estimators=50,learning_rate=1, base_estimator=nb)
#model = BaggingClassifier(n_estimators=50, base_estimator=knn)

#Fit the training feature Xs and training label Ys
adaboostnaivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = adaboostnaivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0        0       0       0       0       0       0       0       0   
true:1        0       0       0       0       0       0       0       0   
true:2        0       0       0       0       0       0       0       0   
true:3        0       0       0       0       0       0       0       0   
true:4        0       0       0       0       0       0       0       0   
true:5        0       0       0       0       0       0       0       0   
true:6        0       0       0       0       0       0       0       0   
true:7        0       0       0       0       0       0       0       0   
true:8        0       0       0       0       0       0       0       0   
true:9        0       0       0       0       0       0       0       0   
true:10       0       0       0       0       0       0       0       0   
true:11       0       0       0       0       0       0       0       0   
true:12

  'precision', 'predicted', average, warn_for)
