## ML text classification

IMDB Dataset: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews#IMDB%20Dataset.csv

### Load data

In [1]:
import re
import csv
from sklearn.metrics import accuracy_score

In [2]:
def load_data(filename, delimiter):
    samples = []
    labels = []
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='"')
        next(reader) #skip csv header
        for row in reader:
            samples += [row[0]]
            labels += [row[1]]

    preprocessed_samples = []
    for sample in samples:
        s = sample.lower()
        s = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", s)
        s = re.sub("\s+", " ", s)
        s = s.strip()
        preprocessed_samples += [s]
    
    tokenized_samples = []
    for sample in preprocessed_samples:
        s = sample.split()
        tokenized_samples += [s]
        
    word2id = {}
    id2word = []

    for sample in tokenized_samples:
        for token in sample:
            if token not in word2id.keys():
                word2id[token] = len(id2word)
                id2word += [token]
    
    digitized_samples = []
    for sample in tokenized_samples:
        s = [word2id[token] for token in sample]
        digitized_samples += [s]
        
    correct_samples = []
    max_len = 128

    for sample in digitized_samples:
        if len(sample) < max_len:
            sample += [0] * (max_len - len(sample))

        correct_samples += [sample[:max_len]]
        
    labels_dict = {"negative": 0, "positive": 1}

    correct_labels = [labels_dict[label] for label in labels]
    
    train_data = correct_samples[:4000]
    train_labels = correct_labels[:4000]
    test_data = correct_samples[4000:]
    test_labels = correct_labels[4000:]
    
    return train_data, train_labels, test_data, test_labels

In [3]:
train_data, train_labels, test_data, test_labels = load_data('data/imdb/imdb_dataset.csv', ',')

### Create model and fit

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
lr_cls = LogisticRegression()
lr_cls.fit(train_data, train_labels)

LogisticRegression()

In [6]:
preds = lr_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.5023333333333333

# Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
dt_cls = DecisionTreeClassifier()
dt_cls.fit(train_data, train_labels)

DecisionTreeClassifier()

In [9]:
preds = dt_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.497

In [10]:
from sklearn import tree
tree.export_graphviz(dt_cls)

'digraph Tree {\nnode [shape=box] ;\n0 [label="X[87] <= 50.5\\ngini = 0.5\\nsamples = 4000\\nvalue = [2027, 1973]"] ;\n1 [label="X[78] <= 48.5\\ngini = 0.497\\nsamples = 1428\\nvalue = [658, 770]"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="X[24] <= 3015.5\\ngini = 0.481\\nsamples = 608\\nvalue = [245, 363]"] ;\n1 -> 2 ;\n3 [label="X[58] <= 65.5\\ngini = 0.492\\nsamples = 494\\nvalue = [216, 278]"] ;\n2 -> 3 ;\n4 [label="X[16] <= 429.0\\ngini = 0.5\\nsamples = 259\\nvalue = [132, 127]"] ;\n3 -> 4 ;\n5 [label="X[20] <= 4.5\\ngini = 0.486\\nsamples = 151\\nvalue = [63, 88]"] ;\n4 -> 5 ;\n6 [label="X[28] <= 1292.5\\ngini = 0.305\\nsamples = 16\\nvalue = [13, 3]"] ;\n5 -> 6 ;\n7 [label="gini = 0.0\\nsamples = 12\\nvalue = [12, 0]"] ;\n6 -> 7 ;\n8 [label="X[6] <= 17.5\\ngini = 0.375\\nsamples = 4\\nvalue = [1, 3]"] ;\n6 -> 8 ;\n9 [label="gini = 0.0\\nsamples = 1\\nvalue = [1, 0]"] ;\n8 -> 9 ;\n10 [label="gini = 0.0\\nsamples = 3\\nvalue = [0, 3]"] ;\n8 -> 10

# Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

In [12]:
nb_cls = GaussianNB()
nb_cls.fit(train_data, train_labels)

GaussianNB()

In [13]:
preds = nb_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.5146666666666667

# k-nearest neighbors

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
knn_cls = KNeighborsClassifier()
knn_cls.fit(train_data, train_labels)

KNeighborsClassifier()

In [16]:
preds = knn_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.503

# Support Vector Machines

In [17]:
from sklearn.svm import LinearSVC

In [18]:
svm_cls = LinearSVC(loss="squared_hinge", C=1.0, dual=False)
svm_cls.fit(train_data, train_labels)

LinearSVC(dual=False)

In [19]:
preds = svm_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.5023333333333333

# Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf_cls = RandomForestClassifier(max_depth=2, random_state=0)
rf_cls.fit(train_data, train_labels)

RandomForestClassifier(max_depth=2, random_state=0)

In [22]:
preds = rf_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.5273333333333333

# Boosting

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [24]:
ab_cls = AdaBoostClassifier(n_estimators=100)
ab_cls.fit(train_data, train_labels)

AdaBoostClassifier(n_estimators=100)

In [25]:
preds = ab_cls.predict(test_data)
accuracy_score(test_labels, preds)

0.5225

# One-Hot

In [31]:
def load_data_onehot(filename, delimiter):
    samples = []
    labels = []
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='"')
        next(reader) #skip csv header
        for row in reader:
            samples += [row[0]]
            labels += [row[1]]

    preprocessed_samples = []
    for sample in samples:
        s = sample.lower()
        s = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", s)
        s = re.sub("\s+", " ", s)
        s = s.strip()
        preprocessed_samples += [s]

        
    labels_dict = {"negative": 0, "positive": 1}

    correct_labels = [labels_dict[label] for label in labels]
    
    train_data = preprocessed_samples[:4000]
    train_labels = correct_labels[:4000]
    test_data = preprocessed_samples[4000:]
    test_labels = correct_labels[4000:]
    
    return train_data, train_labels, test_data, test_labels

In [36]:
oh_train_data, oh_train_labels, oh_test_data, oh_test_labels = load_data_onehot('data/imdb/imdb_dataset.csv', ',')

In [37]:
from sklearn.feature_extraction.text import CountVectorizer  

In [38]:
oh_vectorizer = CountVectorizer(binary=True)
oh_vectorizer = oh_vectorizer.fit(oh_train_data)

oh_train_data = oh_vectorizer.transform(oh_train_data)
oh_test_data = oh_vectorizer.transform(oh_test_data)

In [39]:
svm_oh_cls = LinearSVC(loss="squared_hinge", C=1.0, dual=False)
svm_oh_cls.fit(oh_train_data, oh_train_labels)

LinearSVC(dual=False)

In [40]:
preds = svm_oh_cls.predict(oh_test_data)
accuracy_score(oh_test_labels, preds)

0.8441666666666666

# TF-IDF

![alt text](tf-idf.png "Title")

In [41]:
def load_data_tfidf(filename, delimiter):
    samples = []
    labels = []
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='"')
        next(reader) #skip csv header
        for row in reader:
            samples += [row[0]]
            labels += [row[1]]

    preprocessed_samples = []
    for sample in samples:
        s = sample.lower()
        s = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", s)
        s = re.sub("\s+", " ", s)
        s = s.strip()
        preprocessed_samples += [s]

        
    labels_dict = {"negative": 0, "positive": 1}

    correct_labels = [labels_dict[label] for label in labels]
    
    train_data = preprocessed_samples[:4000]
    train_labels = correct_labels[:4000]
    test_data = preprocessed_samples[4000:]
    test_labels = correct_labels[4000:]
    
    return train_data, train_labels, test_data, test_labels

In [42]:
text_train_data, tf_idf_train_labels, text_test_data, tf_idf_test_labels = load_data_tfidf('data/imdb/imdb_dataset.csv', ',')

In [43]:
text_train_data[0]

'one of the other reviewers has mentioned that after watching just 1 oz episode you ll be hooked they are right as this is exactly what happened with me br br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word br br it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away br br i would say the main appeal of the show is due to the fact that it goes where other shows wouldn t 

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
tfidf = TfidfVectorizer()
tfidf = tfidf.fit(text_train_data)
tf_idf_train_data = tfidf.transform(text_train_data)
tf_idf_test_data = tfidf.transform(text_test_data)

In [46]:
tf_idf_train_data[0]

<1x35155 sparse matrix of type '<class 'numpy.float64'>'
	with 186 stored elements in Compressed Sparse Row format>

In [47]:
svm_tfid_cls = LinearSVC(loss="squared_hinge", C=1.0, dual=False)
svm_tfid_cls.fit(tf_idf_train_data, train_labels)

LinearSVC(dual=False)

In [48]:
preds = svm_tfid_cls.predict(tf_idf_test_data)
accuracy_score(tf_idf_test_labels, preds)

0.868

In [49]:
from sklearn.ensemble import AdaBoostClassifier

In [50]:
ab_cls = AdaBoostClassifier(n_estimators=1000)
ab_cls.fit(tf_idf_train_data, train_labels)

AdaBoostClassifier(n_estimators=1000)

In [51]:
preds = ab_cls.predict(tf_idf_test_data)
accuracy_score(test_labels, preds)

0.8206666666666667