Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also make sure this jupyter notebook file is opened using the following command:

```jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000```

<h1>Import all libraries and reading explored data into Dataframe</h1>

In [11]:
import re, io, gensim, datetime, time, nltk
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for data pre-processing (Log Loss)
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

def printModelAccuracy(y_test, y_pred):
    # Find the confusion matrix of the result
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), \
        index=['true:0', 'true:1', 'true:2', 'true:3', 'true:4', 'true:5', 'true:6', 'true:7', 'true:8', 'true:9', 'true:10', 'true:11'], 
        columns=['pred:0', 'pred:1', 'pred:2', 'pred:3', 'pred:4', 'pred:5', 'pred:6', 'pred:7', 'pred:8', 'pred:9', 'pred:10', 'pred:11'])
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)
    """
    # Log loss
    score = log_loss(y_test, y_pred)
    print("Log Loss:", score)
    """

<h1>Preprocessing</h1>

In [2]:
# Takes around 7 minutes
all_df = pd.read_csv("train.csv",header = 0)
corpus = all_df["title"]
labels = all_df["Category"]

start = time.time()
stop_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
def corpus2docs(corpus):
    # corpus is a object returned by load_corpus that represents a corpus.
    docs1 = []

    for title in corpus:
        doc = nltk.word_tokenize(title)
        docs1.append(doc)
    docs2 = [[w.lower() for w in doc] for doc in docs1] #lower case the words
    docs3 = [[w for w in doc if re.search('^[a-z]+$', w)] for doc in docs2] #removing special characters and numbers
    docs4 = [[w for w in doc if w not in stop_list] for doc in docs3] #removing words in stop list
    #changing list into a string
    docs5 = [' '.join([stemmer.stem(w) for w in doc]) for doc in docs4] #changing the words into its root form
    
    return docs5

docs = corpus2docs(corpus)
end = time. time()
print(end - start)
print(docs[0:10])

216.2837860584259
['nyx sex bomb pallet natur palett', 'etud hous preciou miner cushion pearl aura puff', 'milani rose powder blush', 'etud hous babi sweet sugar powder', 'bedak revlon color stay aqua miner make', 'dr pure whiten cream', 'chanel powder blush malic', 'snail white cream origin', 'sunpris proof spf', 'eyebrow powder nyx satuan rp pc']


<h1>Vectorise and TFIDF words</h1>

In [3]:
# Takes around 2 minutes
start = time.time()

def convertToDataframe(listofwords, labels):
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    tablecolumns = []                      
    tablecolumns.append(vectorizer.get_feature_names()) #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    df['Category'] = labels
                      
    return df


df = convertToDataframe(docs[0:50000], labels.values.tolist()[0:50000]) #limit for 50k rows
print(df.head(10))

end = time. time()
print(end - start)

    aa  aaa  aag  aaw   ab abadi abal  abc  abh abil   ...      zs  zsc   zt  \
0 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
1 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
2 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
3 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
4 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
5 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
6 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
7 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
8 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   
9 0.00 0.00 0.00 0.00 0.00  0.00 0.00 0.00 0.00 0.00   ...    0.00 0.00 0.00   

   ztv  zvf zwain zwgb zwitsal zxcv Category  
0 0.00 0.00  0.00 0.00    0.00 0.00        0  
1 0.00 0.00  0.00 0.00   

<h1>Train-Test Split</h1>

In [4]:
X = df.loc[:, df.columns != 'Category'] #take everything except Category

y = df[['Category']] #our label is Category

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

<h1>Baseline Classifier (Decision Tree)</h1>

In [13]:
parameters = {
    'max_depth' : list(range(1, 5))
}

decisionTree = GridSearchCV(DecisionTreeClassifier(), cv=3, param_grid=parameters, scoring='f1_macro')
#Fit the training feature Xs and training label Ys
decisionTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = decisionTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:",decisionTree.best_params_)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0      180       0       0       0       0       0       0       0   
true:1        0    1051       0       0       0       0       0       0   
true:2        0       0     448       0       0       0       0       0   
true:3        0       0       0    3158       0       0       0       0   
true:4        0       0       0       0    1654       0       0       0   
true:5        0       0       0       0       0    2150       0       0   
true:6        0       0       0       0       0       0       0      82   
true:7        0       0       0       0       0       0       0     464   
true:8        0       0       0       0       0       0       0       0   
true:9        0       0       0       0       0       0       0       0   
true:10       0       0       0       0       0       0       0       0   
true:11       0       0       0       0       0       0       0       0   

      

  'precision', 'predicted', average, warn_for)


<h1>AdaBoost (with Decision Tree)</h1>

Refer to the following links on for detail explanation on the implementation:
- [AdaBoost Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)
- [DataCamp Implementation](https://www.datacamp.com/community/tutorials/adaboost-classifier-python)
- [Setting Learning Rate and N Estimators](https://stats.stackexchange.com/questions/82323/shrinkage-parameter-in-adaboost)

*Note that the default AdaBoost implementation in SKLearn is Decision Tree 


In [14]:
#Create the AdaBoost classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
adaboostTree = AdaBoostClassifier(n_estimators=50,learning_rate=1)

#Fit the training feature Xs and training label Ys
adaboostTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = adaboostTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0      180       0       0       0       0       0       0       0   
true:1        0    1051       0       0       0       0       0       0   
true:2        0       0       0     448       0       0       0       0   
true:3        0       0       0    3158       0       0       0       0   
true:4        0       0       0       0    1654       0       0       0   
true:5        0       0       0       0       0    2150       0       0   
true:6        0       0       0       0       0       0      82       0   
true:7        0       0       0       0       0       0       0     464   
true:8        0       0       0       0       0       0       0     239   
true:9        0       0       0       0       0       0       0     361   
true:10       0       0       0       0       0       0       0      47   
true:11       0       0       0       0       0       0       0       0   

      

  'precision', 'predicted', average, warn_for)


<h1>AdaBoost (with Naive Bayes)</h1>

Refer to the following links on for detail explanation on the implementation:
- [Gaussian Naive Bayes Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)
- [Naive Bayes Classifier video](https://www.youtube.com/watch?v=CPqOCI0ahss)

In [15]:
naivebayes = MultinomialNB()
#Fit the training feature Xs and training label Ys
naivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = naivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0       29      78      10      20      28       6       0       1   
true:1        0     585       4     217      55     168       0      17   
true:2        0       3     260     106      29      28       0      10   
true:3        0       3       2    2761     103     277       0       6   
true:4        0       0       0     172    1159     281       0      29   
true:5        0       0       0     122     247    1775       0       6   
true:6        0       0       0      20      10       3       0      44   
true:7        0       0       0      47      64      98       0     252   
true:8        0       0       0      86      31      26       0      56   
true:9        0       0       0      48      85     156       0       7   
true:10       0       0       0       4      24      19       0       0   
true:11       0       0       0      55      33      49       0      27   

      

  'precision', 'predicted', average, warn_for)


In [16]:
nb = MultinomialNB()

adaboostnaivebayes = AdaBoostClassifier(n_estimators=50,learning_rate=1, base_estimator=nb)
#model = BaggingClassifier(n_estimators=50, base_estimator=knn)

#Fit the training feature Xs and training label Ys
adaboostnaivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = adaboostnaivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0        0       0       0       0     180       0       0       0   
true:1        0       0       0       0    1051       0       0       0   
true:2        0       0       0       0     448       0       0       0   
true:3        0       0       0       0       0    3158       0       0   
true:4        0       0       0       0       0    1654       0       0   
true:5        0       0       0       0       0    2150       0       0   
true:6        0       0       0       0       0       0      82       0   
true:7        0       0       0       0       0       0       0       0   
true:8        0       0       0       0       0       0       0       5   
true:9        0       0       0       0       0       0       0     361   
true:10       0       0       0       0       0       0       0       0   
true:11       0       0       0       0       0       0       0       0   

      

  'precision', 'predicted', average, warn_for)
