Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also make sure this jupyter notebook file is opened using the following command:

```jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000```

<h1>Creating Reduced Datasets</h1>

In [None]:
#General libraries needed
import numpy as np
import pandas as pd

datasize = 20000

df = pd.DataFrame(columns=['itemid', 'title', 'Category', 'image_path']) #creating dataframe

all_df = pd.read_csv("train.csv")

# sort the dataframe
all_df.sort_values(by='Category', inplace=True)

# get a list of category
mobilelist=list(range(31, 58))
fashionlist=list(range(17,31))
beautylist=list(range(0, 17))

def retrievesample(all_df, list, df):
    eachdf = all_df.loc[all_df.Category.isin(list)]

    count_row = eachdf.shape[0]  # gives number of row count
    print("Original Count:", count_row)
    
    eachdf = eachdf.sample(datasize) #retrieve a sample

    count_row = eachdf.shape[0]  # gives number of row count
    print("New Sample Count:", count_row)
    
    df = df.append(eachdf, ignore_index=True) #append to original dataframe
    
    return df
    
df = retrievesample(all_df, mobilelist, df)
df = retrievesample(all_df, fashionlist, df)
df = retrievesample(all_df, beautylist, df)

df.to_csv('train2.csv', index=False, encoding='utf-8')

print("Success!")

<h1>Import all libraries and reading explored data into Dataframe</h1>

In [1]:
import re, io, gensim, datetime, time, nltk, random, pickle
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#General libraries needed
import numpy as np
import pandas as pd

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

string = 'true:'
goldtruth = [string+str(i) for i in range(0, 58)]

string = 'pred:'
prediction = [string+str(i) for i in range(0, 58)]


def printModelAccuracy(y_test, y_pred):
    # Find the confusion matrix of the result
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=list(range(0, 58))), \
        index=goldtruth, 
        columns=prediction)
    print("Confusion Matrix:")
    print(cm)
    
    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')
    print("Accuracy:", asr)
    print("F1:", f1)



<h1>Feature Selection</h1>

In [2]:
all_df = pd.read_csv("train2.csv",header = 0)
corpuslist = all_df["title"]

titles = []

for title in corpuslist:
    eachwordintitle = nltk.word_tokenize(title)
    titles += eachwordintitle

corpuslist = titles


stop_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
corpuslist = [w.lower() for w in corpuslist] #lower case the words
#corpuslist = [w for w in corpuslist if re.search('^[a-z]+$', w)] #lower case the words
#corpuslist = [w for w in corpuslist if w not in stop_list] #lower case the words
#corpuslist = [stemmer.stem(w) for w in corpuslist] #lower case the words
print(len(corpuslist))

fdist = nltk.FreqDist(w for w in corpuslist)

totaluniquewords = 0
for word in fdist:
    totaluniquewords+=1
print("Total Unique Words:", totaluniquewords)

datasize = 5000
#print(datasize)

mostcommonwords = fdist.most_common()[:datasize] #top 5k
mostcommonwords = [w[0] for w in mostcommonwords]

middlewords = fdist.most_common()[int(totaluniquewords/2-datasize/2):int(totaluniquewords/2+datasize/2)] #middle 5k
middlewords = [w[0] for w in middlewords]
middlewords = random.sample(middlewords, datasize)

leastcommonwords = fdist.most_common()[-datasize:] #bottom 5k
leastcommonwords = [w[0] for w in leastcommonwords]


print("Total Most Common Words:", len(mostcommonwords))
print(mostcommonwords[:10])

print("BREAK")

print("Total Middle Words:", len(middlewords))
print(middlewords[:10])

print("BREAK")

print("Total Least Common Words:", len(leastcommonwords))
print(leastcommonwords[:10])

555044
Total Unique Words: 22923
Total Most Common Words: 5000
['lengan', 'dress', 'wanita', 'untuk', 'neck', 'promo', 'cream', 'panjang', 'powder', 'model']
BREAK
Total Middle Words: 5000
['temprered', '18000mah', '082199992592', '9.0', 'z380', 'e5577', '042cs20869', '4430', 'k270', 'whasapp']
BREAK
Total Least Common Words: 5000
['24632', 'vto', 'i.193', '012', 'khairunnisa', 'butt', 'lucia', 'crg162033', 'varsity', 'fs2666']


<h1>Preprocessing</h1>

In [3]:
# Takes around 2 minute
corpus = all_df["title"]
labels = all_df["Category"]
itemid = all_df["itemid"]

start = time.time()

def corpus2docs(corpus):
    # corpus is a object returned by load_corpus that represents a corpus.
    docs1 = []

    for title in corpus:
        doc = nltk.word_tokenize(title)
        docs1.append(doc)
    docs2 = [[w.lower() for w in doc] for doc in docs1] #lower case the words
    #docs3 = [[w for w in doc if re.search('^[a-z]+$', w)] for doc in docs2] #removing special characters and numbers
    #docs4 = [[w for w in doc if w not in stop_list] for doc in docs3] #removing words in stop list
    #changing list into a string
    #docs5 = [[stemmer.stem(w) for w in doc] for doc in docs4] #changing the words into its root form
    
    docs5=docs2
    #docs2b = docs5 #no feature selection
    docs2b = [[w for w in doc if w in mostcommonwords] for doc in docs5] #selecting top 5k words as our features
    #docs2b = [[w for w in doc if w in middlewords] for doc in docs5] #selecting middle 5k words as our features
    #docs2b = [[w for w in doc if w in leastcommonwords] for doc in docs5] #selecting bottom 5k words as our features
    docs2b =  [' '.join([w for w in doc]) for doc in docs2b]
    return docs2b

docs = corpus2docs(corpus)
end = time. time()
print(end - start)
print(docs[0:10])

18.18136215209961
['iphone 5s 32gb gold', 'cash back 50 promo akhir thn beli 2 gratis 1 samsung galaxy s8 pul set wa', 'khusus hari ini 00 garansi resmi handphone oppo f1 s selfie expert 4gb terbaru', 'big promo cuci gudang dijual xiaomi redmi mi a1 black garansi resmi tam murah', 'vivo v5', 'xiaomi mi 5', 'cuci gudang macbook air 11 2015 128gb i5 wa o83i 3612 2666', 'samsung j1 ace terbaru', 'wa ke vivo y69 second', 'promo vivo v 11 original resmi']


<h1>Vectorise and TFIDF words</h1>

In [16]:
# Takes around 10 seconds
start = time.time()

def convertToDataframe(listofwords, labels,itemid):
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    tablecolumns = []                      
    tablecolumns.append(vectorizer.get_feature_names()) #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns, index=itemid) #creating dataframe

    df['Category'] = labels
                      
    return df

df = convertToDataframe(docs, labels.values.tolist(),itemid)
print(df.head(10))

end = time. time()
print(end - start)

docs = "" #clear memory

                  00  000 0000 000022 00006 00009 0001 00059 00062 0007  \
itemid                                                                    
1587581279  0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
782441790   0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
1332229511  0.394864  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
829234495   0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
134104033   0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
1525331944  0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
1294311382  0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
1543277075  0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
1315074632  0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   
1666658224  0.000000  0.0  0.0    0.0   0.0   0.0  0.0   0.0   0.0  0.0   

             ...    zx550ml zx551ml zyfpgs  zyh zyrex zyx2002   zz zzsykd  \
itemid       ...      

<h1>Vectorise and TF words</h1>

In [4]:
start = time.time()

def convertToDataframe(listofwords, labels,itemid):
    vectorizer = CountVectorizer(analyzer='word') #tf
    words_tfidf = vectorizer.fit_transform(listofwords) #tf

    tablecolumns = []                      
    tablecolumns.append(vectorizer.get_feature_names()) #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns,index=itemid) #creating dataframe

    df['Category'] = labels
                      
    return df

df = convertToDataframe(docs, labels.values.tolist(),itemid)
print(df.head(10))

end = time. time()
print(end - start)

docs = "" #clear memory

           00 000 000mah 001 002 003 006 01 010 0119   ...    zipper zoom  \
itemid                                                 ...                  
1587581279  0   0      0   0   0   0   0  0   0    0   ...         0    0   
782441790   0   0      0   0   0   0   0  0   0    0   ...         0    0   
1332229511  1   0      0   0   0   0   0  0   0    0   ...         0    0   
829234495   0   0      0   0   0   0   0  0   0    0   ...         0    0   
134104033   0   0      0   0   0   0   0  0   0    0   ...         0    0   
1525331944  0   0      0   0   0   0   0  0   0    0   ...         0    0   
1294311382  0   0      0   0   0   0   0  0   0    0   ...         0    0   
1543277075  0   0      0   0   0   0   0  0   0    0   ...         0    0   
1315074632  0   0      0   0   0   0   0  0   0    0   ...         0    0   
1666658224  0   0      0   0   0   0   0  0   0    0   ...         0    0   

           zoya zr zs620kl zte zuk zv zyrex Category  
itemid              

<h1>Train-Test Split</h1>

In [5]:
X = df.loc[:, df.columns != 'Category'] #take everything except Category

y = df[['Category']] #our label is Category

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

df = "" #clear memory

<h1>Naive Bayes to compare which features to select</h1>

In [6]:
naivebayes = MultinomialNB()
#Fit the training feature Xs and training label Ys
naivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = naivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0       15       9       9       3       8       4       1       3   
true:1        0     273       5      43      30      62       1       6   
true:2        0       2     123       8       8       1       0       3   
true:3        0      15       5     934      35      91       0       3   
true:4        0       1       0      51     419     101       0       8   
true:5        0       3       1      23     109     603       0       5   
true:6        0       0       0       4       0       0       0      23   
true:7        0       1       0       4      20      11       0     113   
true:8        0       0       0       7       2       5       0      14   
true:9        0       0       0       8      26      21       0       4   
true:10       0       0       0       1       8       0       0       0   
true:11       0       0       0      12       2       2       0      10   
true:12

  'precision', 'predicted', average, warn_for)


<h1>Save Dataframe to file</h1>

In [7]:
X_train.to_pickle("X_train_best_choice_7")
X_test.to_pickle("X_test_best_choice_7")
y_train.to_pickle("y_train_best_choice_7")
y_test.to_pickle("y_test_best_choice_7")

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [9]:
print(X_train.head())

           00 000 000mah 001 002 003 006 01 010 0119   ...    zipper zoom  \
itemid                                                 ...                  
1677148667  0   0      0   0   0   0   0  0   0    0   ...         0    0   
1066475273  0   0      0   0   0   0   0  0   0    0   ...         0    0   
176845790   0   0      0   0   0   0   0  0   0    0   ...         0    0   
1824729105  0   0      0   0   0   0   0  0   0    0   ...         0    0   
1656487109  0   0      0   0   0   0   0  0   0    0   ...         0    0   

           zoya zr zs620kl zte zuk zv zyrex Category  
itemid                                                
1677148667    0  0       0   0   0  0     0       32  
1066475273    0  0       0   0   0  0     0       35  
176845790     0  0       0   0   0  0     0        7  
1824729105    0  0       0   0   0  0     0       22  
1656487109    0  0       0   0   0  0     0        5  

[5 rows x 4908 columns]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [52]:
print(X_train.head())

           aa aaa aap ab abang abaya abg abh abi ablif   ...    zoya zp zpa  \
itemid                                                   ...                  
1677148667  0   0   0  0     0     0   0   0   0     0   ...       0  0   0   
1066475273  0   0   0  0     0     0   0   0   0     0   ...       0  0   0   
176845790   0   0   0  0     0     0   0   0   0     0   ...       0  0   0   
1824729105  0   0   0  0     0     0   0   0   0     0   ...       0  0   0   
1656487109  0   0   0  0     0     0   0   0   0     0   ...       0  0   0   

           zr zte zuk zv zyh zyrex Category  
itemid                                       
1677148667  0   0   0  0   0     0       32  
1066475273  0   0   0  0   0     0       35  
176845790   0   0   0  0   0     0        7  
1824729105  0   0   0  0   0     0       22  
1656487109  0   0   0  0   0     0        5  

[5 rows x 4983 columns]
