In [9]:
import re, io, gensim, datetime, time, nltk
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for data pre-processing (Log Loss)
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

from sklearn.svm import LinearSVC

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

# Row captions ("true:<k>") and column captions ("pred:<k>") used to label the
# confusion matrix, one per class id 0..57.
string = 'true:'
goldtruth = ['{}{}'.format(string, class_id) for class_id in range(58)]

string = 'pred:'
prediction = ['{}{}'.format(string, class_id) for class_id in range(58)]


def printModelAccuracy(y_test, y_pred, labels=None, csv_path='confusebu.csv'):
    """Print the confusion matrix, accuracy and macro F1 of a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    labels : sequence, optional
        Class ids defining the rows and columns of the confusion matrix, in
        order. Defaults to ``range(0, 58)``, the value that used to be
        hard-coded here.
    csv_path : str or None, optional
        File the confusion matrix is written to as CSV. Defaults to
        ``'confusebu.csv'``; pass ``None`` to skip writing.

    Returns
    -------
    None
        Everything is reported through ``print`` (and the optional CSV file).
    """
    if labels is None:
        labels = range(0, 58)
    labels = list(labels)

    # Build the captions here so the function does not rely on module-level
    # lists whose length has to match the label count.
    row_names = ['true:' + str(label) for label in labels]
    col_names = ['pred:' + str(label) for label in labels]

    # Find the confusion matrix of the result
    cm = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=labels),
        index=row_names,
        columns=col_names)
    print("Confusion Matrix:")
    print(cm)

    if csv_path is not None:
        cm.to_csv(csv_path)

    # Find the accuracy and F1 score of the result. f1_score is called without
    # an explicit label list, exactly as before.
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)

In [3]:
# Takes around 7 minutes
# Read the training data (row 0 of the CSV is the header) and pull out the
# raw titles together with their category labels.
all_df = pd.read_csv("train3.csv", header=0)
corpus, labels = all_df["title"], all_df["Category"]

# The timer starts here, so building the NLTK helpers is part of the measurement.
start = time.time()
stemmer = nltk.stem.porter.PorterStemmer()
stop_list = nltk.corpus.stopwords.words('english')
def corpus2docs(corpus):
    """Turn raw titles into cleaned, stemmed, space-joined strings.

    Each title is tokenised with ``nltk.word_tokenize`` and lower-cased; only
    tokens made up entirely of the letters a-z are kept, tokens found in the
    module-level ``stop_list`` are dropped, and the survivors are stemmed with
    the module-level ``stemmer``.

    Parameters
    ----------
    corpus : iterable of str
        The raw titles.

    Returns
    -------
    list of str
        One string per input title, in input order. A title with no
        surviving tokens yields ``''``.
    """
    # Hoisted out of the loop: a set gives constant-time stop-word lookups and
    # the pattern is parsed once instead of once per token.
    stop_words = set(stop_list)
    is_plain_word = re.compile('^[a-z]+$').search
    stem = stemmer.stem

    cleaned = []
    # One pass per title avoids holding several full copies of the corpus.
    for title in corpus:
        tokens = (w.lower() for w in nltk.word_tokenize(title))
        kept = [stem(w) for w in tokens
                if is_plain_word(w) and w not in stop_words]
        cleaned.append(' '.join(kept))

    return cleaned

# Clean every title, then report the elapsed seconds and the first ten results.
docs = corpus2docs(corpus)
end = time.time()
print(end - start)
print(docs[:10])

5.453259229660034
['cuci gudang balm voyag face palett', 'bayar di tempat pewarna ali gel peel tahan lama air alami krim alat makeup', 'puruan pesan ekarang origin balm nude tude palett murah kekinian grosir import', 'grosir cream spl paket normal garansi ori', 'best produk focallur color eyeshadow palett grati ongkir', 'balm hand pallet', 'dijual balm hand limit', 'palet morph warna make colour', 'new product white jade night cream free ongkir', 'terjamin cosrx bha blackhead power liquid']


In [4]:
# Takes around 2 minutes
# Restart the timer; the elapsed time is printed after the conversion below.
start = time.time()

def convertToDataframe(listofwords, labels):
    """Build a dense TF-IDF feature table with the class label appended.

    Parameters
    ----------
    listofwords : iterable of str
        Documents handed to ``TfidfVectorizer.fit_transform``.
    labels : sequence
        One label per document, in the same order; stored in the
        ``'Category'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term, followed by
        the ``'Category'`` column.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # get_feature_names() no longer exists in newer scikit-learn releases;
    # prefer its replacement and fall back for older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = list(get_names())

    # Pass the names as a flat list. Wrapping them in another list makes pandas
    # build a one-level MultiIndex, so column labels become tuples and a mask
    # such as `df.columns != 'Category'` stops being a plain name comparison.
    df = pd.DataFrame(words_tfidf.toarray(), columns=feature_names) #creating dataframe

    df['Category'] = labels

    return df


# Vectorise the cleaned titles; the labels are handed over as a plain list.
df = convertToDataframe(docs, labels.values.tolist())
print(df.head(n=10))

end = time.time()
print(end - start)

# Rebind the name to an empty string so this reference to the cleaned titles is dropped.
docs = ""

    aa   ab abadi abaya  abc  abh  abi abishopid absolu absolut   ...     \
0 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
1 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
2 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
3 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
4 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
5 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
6 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
7 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
8 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      
9 0.00 0.00  0.00  0.00 0.00 0.00 0.00      0.00   0.00    0.00   ...      

  zipper zncmrr zoya zozu   zr  ztv zurich  zvr  zze Category  
0   0.00   0.00 0.00 0.00 0.00 0.00   0.00 0.00 0.00        0  
1   0.00   0.00 0.00 0.00 0.00 0.00

In [5]:
# Features: every column whose name is not 'Category'.
X = df.loc[:, df.columns != 'Category']

# Target: kept as a one-column DataFrame (note the double brackets).
y = df[['Category']]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Rebind the name to an empty string to drop this reference to the full table.
df = ""

In [10]:
# Linear SVM with every hyper-parameter spelled out explicitly.
clf = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=1e-05,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=0,
    random_state=0,
    max_iter=1000,
)

# y_train is a one-column DataFrame; flatten it to a 1-D array before fitting.
clf.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = clf.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)




Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0      188       2       1       1       0       0       0       0   
true:1        0     147       2      15      17       0       3       4   
true:2        0       2     157       5       3       0       3       1   
true:3        0       8       6     155      11       0       3       2   
true:4        0       1       2      18     154       0       6       8   
true:5        0       0       0       0       0       0       0       0   
true:6        0       0       4       2       2       1     161      12   
true:7        0       0       0       4      12       0      11     161   
true:8        0       0       1       3       5       0      32      10   
true:9        0       0       0       7      11       0       0      11   
true:10       0       0       0       0       2       0       0       0   
true:11       0       0       0       0       0       0       4       4   
true:12

true:0     188
true:1       0
true:2       0
true:3       0
true:4       0
true:5       0
true:6       0
true:7       0
true:8       0
true:9       0
true:10      0
true:11      0
true:12      0
true:13      0
true:14      0
true:15      0
true:16      0
true:17      0
true:18      0
true:19      0
true:20      0
true:21      0
true:22      0
true:23      0
true:24      0
true:25      0
true:26      0
true:27      0
true:28      0
true:29      0
true:30      0
true:31      0
true:32      0
true:33      0
true:34      0
true:35      0
true:36      0
true:37      0
true:38      0
true:39      0
true:40      0
true:41      0
true:42      0
true:43      0
true:44      0
true:45      0
true:46      0
true:47      0
true:48      0
true:49      0
true:50      0
true:51      0
true:52      0
true:53      0
true:54      0
true:55      0
true:56      0
true:57      0
Name: pred:0, dtype: int64
true:0       2
true:1     147
true:2       2
true:3       8
true:4       1
true:5       0
true:6       

  'recall', 'true', average, warn_for)
