Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also, make sure this Jupyter notebook is opened using the following command:

```jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000```

<h1>Creating Reduced Datasets</h1>

In [None]:
#General libraries needed
import numpy as np
import pandas as pd

# Number of rows drawn for each category group below
datasize = 20000

# Load the full training set, ordered by its Category label
all_df = pd.read_csv("train.csv").sort_values(by='Category')

# Empty frame that accumulates the sampled rows
df = pd.DataFrame(columns=['itemid', 'title', 'Category', 'image_path'])

# Category ids that make up each group
beautylist = [*range(0, 17)]
fashionlist = [*range(17, 31)]
mobilelist = [*range(31, 58)]

def retrievesample(all_df, list, df, n=None, random_state=None):
    """Append a random sample of rows from the given categories to ``df``.

    Args:
        all_df: DataFrame with a ``Category`` column to draw rows from.
        list: Category values to keep. The name shadows the builtin; it is
            kept unchanged so existing callers are unaffected.
        df: DataFrame the sampled rows are appended to. It is not modified.
        n: Number of rows to sample. Defaults to the module-level
            ``datasize``. Capped at the number of matching rows so a small
            group yields all of its rows instead of raising ValueError.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible draw.

    Returns:
        A new DataFrame holding the rows of ``df`` followed by the sampled
        rows, re-indexed from 0.
    """
    eachdf = all_df.loc[all_df.Category.isin(list)]
    print("Original Count:", eachdf.shape[0])

    if n is None:
        n = datasize
    # Sampling without replacement cannot return more rows than exist
    n = min(n, eachdf.shape[0])
    eachdf = eachdf.sample(n, random_state=random_state)
    print("New Sample Count:", eachdf.shape[0])

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    return pd.concat([df, eachdf], ignore_index=True)
    
# Draw one sample per category group, stacking the results in df
for categories in (mobilelist, fashionlist, beautylist):
    df = retrievesample(all_df, categories, df)

# Persist the reduced dataset
df.to_csv('train2.csv', encoding='utf-8', index=False)

print("Success!")

<h1>Importing all libraries and reading explored data into DataFrame</h1>

In [None]:
import re, io, gensim, datetime, time, nltk, random, pickle
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

#General libraries needed
import numpy as np
import pandas as pd

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

# Row and column captions used for the confusion matrix, one per class id 0..57
goldtruth = list(map('true:{}'.format, range(58)))
prediction = list(map('pred:{}'.format, range(58)))

# Leave the scratch variable bound to the last prefix that was used
string = 'pred:'


def printModelAccuracy(y_test, y_pred, num_classes=58):
    """Print the confusion matrix, accuracy and macro F1 of a set of predictions.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, in the same order as ``y_test``.
        num_classes: Number of classes; the confusion matrix is laid out for
            class ids ``0 .. num_classes - 1``. The default of 58 matches the
            range that was previously hard-coded.

    Returns:
        None. All results are written to stdout.
    """
    # Build the captions from the same ids passed to confusion_matrix so the
    # rows/columns can never drift out of step with the labels.
    class_ids = list(range(num_classes))
    cm = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=class_ids),
        index=['true:' + str(i) for i in class_ids],
        columns=['pred:' + str(i) for i in class_ids],
    )
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)

<h1>Feature Selection</h1>

In [None]:
all_df = pd.read_csv("train2.csv", header=0)

# Flatten every title into a single list of tokens
titles = [token for title in all_df["title"] for token in nltk.word_tokenize(title)]
corpuslist = titles

# Frequency of each lower-cased token; the feature sets below are cut from this ranking
fdist = nltk.FreqDist(w.lower() for w in corpuslist)

totaluniquewords = len(fdist)
print("Total Unique Words:", totaluniquewords)

# Size of each candidate feature set.
# NOTE: this rebinds the same global name that retrievesample() reads as its sample size.
datasize = 5000

# Rank the vocabulary once instead of re-sorting it for every slice
rankedwords = [w for w, _ in fdist.most_common()]

mostcommonwords = rankedwords[:datasize] #top 5k

# Middle band: whatever lies between the top and bottom slices, randomly thinned.
# max() keeps the slice empty (rather than wrapping around) for a small vocabulary,
# and min() stops random.sample raising ValueError when the band holds fewer
# than datasize words.
middlewords = rankedwords[datasize:max(datasize, totaluniquewords - datasize)]
middlewords = random.sample(middlewords, min(datasize, len(middlewords)))

leastcommonwords = rankedwords[-datasize:] #bottom 5k

print("Total Most Common Words:", len(mostcommonwords))
print(mostcommonwords[:10])

print("BREAK")

print("Total Middle Words:", len(middlewords))
print(middlewords[:10])

print("BREAK")

print("Total Least Common Words:", len(leastcommonwords))
print(leastcommonwords[:10])

<h1>Preprocessing</h1>

In [None]:
# Takes around 2 minute
all_df = pd.read_csv("train2.csv", header=0)
corpus, labels = all_df["title"], all_df["Category"]

# Timer covers loading the NLTK resources as well as the preprocessing itself
start = time.time()
stop_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
def corpus2docs(corpus):
    """Turn raw titles into space-joined strings of stemmed feature words.

    Each title is tokenised and lower-cased. A token is kept only if it is in
    the selected feature vocabulary (``mostcommonwords``), consists solely of
    the letters a-z, and is not in ``stop_list``. Kept tokens are stemmed with
    ``stemmer`` and joined with single spaces.

    Reads the module-level names ``mostcommonwords``, ``stop_list`` and
    ``stemmer``.

    Args:
        corpus: Iterable of title strings.

    Returns:
        A list of strings, one per title, in the same order as ``corpus``.
    """
    # Feature vocabulary: top 5k words. Swap in middlewords or leastcommonwords
    # here to compare feature sets, or skip this filter for no feature selection.
    # Sets give O(1) membership tests; the original lists were re-scanned for
    # every single token.
    selected = set(mostcommonwords)
    stopwords = set(stop_list)
    # Compiled once: drops tokens containing special characters or numbers
    is_alpha = re.compile('^[a-z]+$').search

    docs = []
    for title in corpus:
        stems = []
        for token in nltk.word_tokenize(title):
            w = token.lower()
            if w in selected and is_alpha(w) and w not in stopwords:
                stems.append(stemmer.stem(w))  # reduce the word to its root form
        docs.append(' '.join(stems))

    return docs

docs = corpus2docs(corpus)

# Report the elapsed seconds since `start`, then preview the first ten documents
end = time.time()
print(end - start)
print(docs[:10])

<h1>Vectorise and TFIDF words</h1>

In [None]:
# Takes around 10 seconds
start = time.time()

def convertToDataframe(listofwords, labels):
    """Build a TF-IDF feature table with a trailing ``Category`` label column.

    Args:
        listofwords: Iterable of documents, each a single space-joined string.
        labels: Sequence of labels, one per document, in the same order.

    Returns:
        A DataFrame with one row per document and one column per vocabulary
        term holding its TF-IDF weight, plus a ``Category`` column.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on older versions that lack the replacement.
    if hasattr(vectorizer, "get_feature_names_out"):
        tablecolumns = vectorizer.get_feature_names_out().tolist()
    else:
        tablecolumns = vectorizer.get_feature_names()

    # Pass the names as a flat list. Wrapping them in another list makes pandas
    # build a one-level MultiIndex, whose tuple labels never compare equal to
    # the plain string 'Category'.
    # NOTE: toarray() materialises the full dense matrix in memory.
    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    df['Category'] = labels

    return df

df = convertToDataframe(docs, labels.values.tolist())
print(df.head(10))

end = time.time()
print(end - start)

docs = "" #clear memory

<h1>Train-Test Split</h1>

In [None]:
# Features: every column except the label. Drop it by name rather than with a
# boolean mask over df.columns: the mask only excludes the label when the
# column labels compare equal to the plain string 'Category', whereas drop()
# resolves the label through the column index itself.
X = df.drop(columns='Category')

y = df[['Category']] #our label is Category

# 80/20 split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

df = "" #clear memory

<h1>Naive Bayes to compare which features to select</h1>

In [None]:
# Train a multinomial Naive Bayes model on the training split,
# with the label frame flattened to a 1-D array
naivebayes = MultinomialNB()
train_labels = y_train.values.ravel()
naivebayes.fit(X_train, train_labels)

# Predict the held-out split and report confusion matrix, accuracy and F1
y_pred = naivebayes.predict(X_test)
printModelAccuracy(y_test, y_pred)

<h1>Save Dataframe to file</h1>

In [None]:
# Pickle each split under a file named after its variable
for filename, frame in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    frame.to_pickle(filename)