# Домашна задача 3
## Обработка на природните јазици 2018/2019
### Андреј Јанчевски - 151003

In [2]:
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
import random
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer
import sklearn
import pandas
import time

### Задача 1

#### Преземање на податочното множество

In [2]:
nltk.download("movie_reviews")
nltk.download("punkt")
nltk.download('stopwords')
stops = set(stopwords.words('english'))
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\bani5\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bani5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bani5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bani5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews
categories = movie_reviews.categories()
documents = []
for category in categories:
    for file_id in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(file_id), category))
random.seed(57)
random.shuffle(documents)
num_documents=len(documents)
documents

[(['one', 'of', 'the', '90s', "'", 'most', 'unwelcome', ...], 'neg'),
 (['susan', 'granger', "'", 's', 'review', 'of', '"', ...], 'neg'),
 (['allen', ',', 'star', 'of', 'many', 'a', 'brian', ...], 'pos'),
 (['national', 'lampoon', "'", 's', 'animal', 'house', ...], 'pos'),
 (['blade', 'is', 'the', 'movie', 'that', 'shows', ...], 'pos'),
 (['a', 'cop', 'with', 'a', 'troubled', 'personal', ...], 'neg'),
 (['the', 'american', 'action', 'film', 'has', 'been', ...], 'pos'),
 (['good', 'films', 'are', 'hard', 'to', 'find', 'these', ...], 'pos'),
 (['after', 'the', 'recent', 'animated', 'debacles', ...], 'neg'),
 (['working', 'in', 'the', 'motion', 'picture', ...], 'neg'),
 (['the', 'tagline', 'for', 'this', 'film', 'is', ':', ...], 'neg'),
 (['what', 'were', 'they', 'thinking', '?', 'nostalgia', ...], 'neg'),
 (['surrounded', 'by', 'hype', ',', 'high', 'hopes', ',', ...], 'neg'),
 (['bob', 'the', 'happy', 'bastard', "'", 's', 'quickie', ...], 'pos'),
 (['ingredients', ':', 'neophyte', 'lawye

#### Пресметување на карактеристиките за множеството

In [4]:
def get_dataset(num_most_common,
                remove_stop_words=False,
                lemmatize=False,
                stem=False,
                tf_idf=False):
    all_words = [word.lower() for word in movie_reviews.words()]
    if remove_stop_words:
        all_words = [
            word for word in all_words if word not in stops and word.isalnum()
        ]
    lemmatizer = WordNetLemmatizer()
    if lemmatize:
        all_words = [lemmatizer.lemmatize(word, pos="a") for word in all_words]
    ps = PorterStemmer()
    if stem:
        all_words = [ps.stem(word) for word in all_words]
    all_words_freq = nltk.FreqDist(all_words)
    vocabulary = all_words_freq.most_common(50 + num_most_common)
    vocabulary = vocabulary[50:]
    dataset = [[], []]
    if tf_idf:
        collection = nltk.TextCollection(
            [documents[i][0] for i in range(0, num_documents)])
    for document, category in documents:
        if lemmatize:
            document = [
                lemmatizer.lemmatize(word, pos="a") for word in document
            ]
        if stem:
            document = [ps.stem(word) for word in document]
        if not tf_idf:
            document_freq = nltk.FreqDist(document)
            features = [
                document_freq[word] / freq for word, freq in vocabulary
            ]
        else:
            features = [
                collection.tf_idf(word, document) for word, _ in vocabulary
            ]
        dataset[0].append(features)
        dataset[1].append(category)
    return dataset

#### Поделба на податочното множество

In [5]:
def split_dataset(dataset):
    train_set = [dataset[0][:1500], dataset[1][:1500]]
    test_set = [dataset[0][1500:], dataset[1][1500:]]
    return train_set, test_set

#### Тренирање и тестирање на класификаторите

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [17]:
def train_and_test_models(train_set, test_set):
    scores = {}
    for i in range(0, 3):
        if i == 0:
            start_time = time.perf_counter()
            knn = KNeighborsClassifier(
                n_neighbors=20,
                weights="distance",
                algorithm="auto",
                metric="cosine",
                n_jobs=-1)
            knn.fit(train_set[0], train_set[1])
            scores.setdefault("KNN", [])
            scores["KNN"].append((knn.score(test_set[0], test_set[1]),
                                  time.perf_counter() - start_time))

            start_time = time.perf_counter()
            svm = SVC(
                C=1e8,
                kernel='rbf',
                gamma="scale",
                decision_function_shape="ovr",
                class_weight="balanced")
            svm.fit(train_set[0], train_set[1])
            scores.setdefault("SVM", [])
            scores["SVM"].append((svm.score(test_set[0], test_set[1]),
                                  time.perf_counter() - start_time))

            start_time = time.perf_counter()
            logistic_regression = LogisticRegression(
                C=1e8,
                solver='newton-cg',
                max_iter=1000,
                class_weight="balanced",
                n_jobs=-1)
            logistic_regression.fit(train_set[0], train_set[1])
            scores.setdefault("Logistic Regression", [])
            scores["Logistic Regression"].append((logistic_regression.score(
                test_set[0], test_set[1]), time.perf_counter() - start_time))

        start_time = time.perf_counter()
        random_forest = RandomForestClassifier(
            n_estimators=1000,
            criterion="entropy",
            min_samples_leaf=5,
            class_weight="balanced",
            n_jobs=-1)
        random_forest.fit(train_set[0], train_set[1])
        scores.setdefault("Random Forest", [])
        scores["Random Forest"].append((random_forest.score(
            test_set[0], test_set[1]), time.perf_counter() - start_time))

        start_time = time.perf_counter()
        neural_network = MLPClassifier(
            hidden_layer_sizes=(50, ),
            activation="tanh",
            solver="adam",
            learning_rate="adaptive",
            learning_rate_init=0.001,
            max_iter=1000,
            early_stopping=True,
            validation_fraction=0.1)
        neural_network.fit(train_set[0], train_set[1])
        scores.setdefault("Neural Network", [])
        scores["Neural Network"].append((neural_network.score(
            test_set[0], test_set[1]), time.perf_counter() - start_time))

    return scores


results = []

In [18]:
for num_most_common in range(500, 5000, 500):
    dataset = get_dataset(num_most_common)
    train_set, test_set = split_dataset(dataset)
    scores = train_and_test_models(train_set, test_set)
    for classifier, accuracies in scores.items():
        if len(accuracies) == 1:
            results.append([
                1, num_most_common, accuracies[0][1], classifier,
                accuracies[0][0], ""
            ])
            continue
        for i in range(0, 3):
            results.append([
                i + 1, num_most_common, accuracies[i][1], classifier,
                accuracies[i][0], ""
            ])
        averages = np.average(accuracies, axis=0)
        results.append([
            "Average", num_most_common, averages[1], classifier, averages[0],
            ""
        ])
    print(scores)

{'KNN': [(0.694, 2.075459099998625)], 'SVM': [(0.774, 1.4531935999984853)], 'Logistic Regression': [(0.766, 3.4254920999992464)], 'Random Forest': [(0.806, 1.394825800000035), (0.816, 1.404930400000012), (0.822, 1.2846406000007846)], 'Neural Network': [(0.794, 1.5728601999999228), (0.732, 0.6924945000009757), (0.77, 0.7208383999986836)]}
{'KNN': [(0.69, 0.13019170000006852)], 'SVM': [(0.782, 1.7819963000001735)], 'Logistic Regression': [(0.8, 3.041087499999776)], 'Random Forest': [(0.844, 1.756692099999782), (0.852, 1.7504798999998457), (0.854, 1.5270831999987422)], 'Neural Network': [(0.804, 0.6036624999997002), (0.792, 0.8389827999999397), (0.806, 1.17682109999987)]}
{'KNN': [(0.708, 0.21936710000045423)], 'SVM': [(0.812, 2.8707878999994136)], 'Logistic Regression': [(0.814, 1.562001500000406)], 'Random Forest': [(0.852, 1.6245164000010845), (0.832, 1.630780199999208), (0.84, 1.5195043000003352)], 'Neural Network': [(0.802, 0.9722615999999107), (0.812, 1.4308983000009903), (0.814, 0.

### Задача 2

In [19]:
for num_most_common in range(500, 5000, 500):
    dataset = get_dataset(num_most_common, remove_stop_words=True)
    train_set, test_set = split_dataset(dataset)
    scores = train_and_test_models(train_set, test_set)
    for classifier, accuracies in scores.items():
        if len(accuracies) == 1:
            results.append([
                1, num_most_common, accuracies[0][1], classifier,
                accuracies[0][0], "Excluded stopwords and non-word characters"
            ])
            continue
        for i in range(0, 3):
            results.append([
                i + 1, num_most_common, accuracies[i][1], classifier,
                accuracies[i][0], "Excluded stopwords and non-word characters"
            ])
        averages = np.average(accuracies, axis=0)
        results.append([
            "Average", num_most_common, averages[1], classifier, averages[0],
            "Excluded stopwords and non-word characters"
        ])
    print(scores)

{'KNN': [(0.692, 0.08500749999984691)], 'SVM': [(0.734, 2.6644753999989916)], 'Logistic Regression': [(0.722, 6.687809799999741)], 'Random Forest': [(0.78, 1.3858206999993854), (0.786, 1.6880492999989656), (0.786, 1.8798473000006197)], 'Neural Network': [(0.508, 0.3096253000003344), (0.768, 0.8515210999994451), (0.768, 0.955945799998517)]}
{'KNN': [(0.71, 0.14627590000054624)], 'SVM': [(0.75, 1.8547393000008014)], 'Logistic Regression': [(0.768, 1.7247739000013098)], 'Random Forest': [(0.824, 1.7261027000004106), (0.826, 1.5074475999990682), (0.822, 1.6424719999995432)], 'Neural Network': [(0.774, 0.9001575999991474), (0.812, 1.8618181000001641), (0.794, 0.9587705000012647)]}
{'KNN': [(0.72, 0.2259267000008549)], 'SVM': [(0.78, 3.6952806999997847)], 'Logistic Regression': [(0.806, 1.9059530000013183)], 'Random Forest': [(0.822, 1.8438833999989583), (0.838, 1.73754640000152), (0.836, 1.7313615000002756)], 'Neural Network': [(0.808, 1.8134953999997379), (0.814, 1.183265699999538), (0.816

### Задача 3

In [20]:
for num_most_common in range(500, 5000, 500):
    dataset = get_dataset(num_most_common, lemmatize=True)
    train_set, test_set = split_dataset(dataset)
    scores = train_and_test_models(train_set, test_set)
    for classifier, accuracies in scores.items():
        if len(accuracies) == 1:
            results.append([
                1, num_most_common, accuracies[0][1], classifier,
                accuracies[0][0], "Using lemmatization"
            ])
            continue
        for i in range(0, 3):
            results.append([
                i + 1, num_most_common, accuracies[i][1], classifier,
                accuracies[i][0], "Using lemmatization"
            ])
        averages = np.average(accuracies, axis=0)
        results.append([
            "Average", num_most_common, averages[1], classifier, averages[0],
            "Using lemmatization"
        ])
    print(scores)

{'KNN': [(0.7, 0.09187419999943813)], 'SVM': [(0.752, 1.4873363000006066)], 'Logistic Regression': [(0.75, 2.3479930000012246)], 'Random Forest': [(0.808, 1.5012074000005668), (0.816, 1.4315474999984872), (0.822, 1.3892728999999235)], 'Neural Network': [(0.786, 1.7144014999994397), (0.782, 0.789157699999123), (0.508, 0.3011201000008441)]}
{'KNN': [(0.696, 0.14948010000080103)], 'SVM': [(0.79, 1.7595232999992731)], 'Logistic Regression': [(0.792, 1.6268426999995427)], 'Random Forest': [(0.854, 1.5165271000005305), (0.846, 1.7172169000004942), (0.848, 1.615939100000105)], 'Neural Network': [(0.82, 2.1804271999990306), (0.79, 0.9276778000003105), (0.808, 0.7156794000002265)]}
{'KNN': [(0.73, 0.2164811999991798)], 'SVM': [(0.788, 4.574167599999782)], 'Logistic Regression': [(0.8, 1.747228800000812)], 'Random Forest': [(0.854, 1.635209500000201), (0.854, 1.7306091999998898), (0.846, 1.6204329000011057)], 'Neural Network': [(0.8, 0.6400164999995468), (0.812, 0.8336199999994278), (0.81, 1.363

### Задача 4

In [21]:
for num_most_common in range(500, 5000, 500):
    dataset = get_dataset(num_most_common, stem=True)
    train_set, test_set = split_dataset(dataset)
    scores = train_and_test_models(train_set, test_set)
    for classifier, accuracies in scores.items():
        if len(accuracies) == 1:
            results.append([
                1, num_most_common, accuracies[0][1], classifier,
                accuracies[0][0], "Using stemmed words"
            ])
            continue
        for i in range(0, 3):
            results.append([
                i + 1, num_most_common, accuracies[i][1], classifier,
                accuracies[i][0], "Using stemmed words"
            ])
        averages = np.average(accuracies, axis=0)
        results.append([
            "Average", num_most_common, averages[1], classifier, averages[0],
            "Using stemmed words"
        ])
    print(scores)

{'KNN': [(0.72, 0.08786519999921438)], 'SVM': [(0.744, 1.5435631000000285)], 'Logistic Regression': [(0.74, 3.8194909999983793)], 'Random Forest': [(0.816, 1.3886719000001904), (0.808, 1.390149200000451), (0.808, 1.3917762000019138)], 'Neural Network': [(0.758, 1.245292800002062), (0.76, 0.7222456999988935), (0.764, 0.4340044999989914)]}
{'KNN': [(0.72, 0.23687839999911375)], 'SVM': [(0.76, 1.7139036999978998)], 'Logistic Regression': [(0.768, 1.626897399997688)], 'Random Forest': [(0.852, 1.409171699997387), (0.848, 1.5076493000015034), (0.848, 1.4994922000005317)], 'Neural Network': [(0.782, 1.1971032000001287), (0.782, 0.7757199000006949), (0.79, 1.1498155999979645)]}
{'KNN': [(0.684, 0.217906399997446)], 'SVM': [(0.772, 3.612351199997647)], 'Logistic Regression': [(0.764, 1.9863560999983747)], 'Random Forest': [(0.846, 1.6230327999983274), (0.838, 1.8334398999977566), (0.84, 1.859666099997412)], 'Neural Network': [(0.802, 2.94716740000149), (0.764, 0.7060448999982327), (0.8, 2.3178

### Задача 5

In [22]:
for num_most_common in range(500, 5000, 500):
    dataset = get_dataset(
        num_most_common, remove_stop_words=True, lemmatize=True, stem=True)
    train_set, test_set = split_dataset(dataset)
    scores = train_and_test_models(train_set, test_set)
    for classifier, accuracies in scores.items():
        if len(accuracies) == 1:
            results.append([
                1, num_most_common, accuracies[0][1], classifier,
                accuracies[0][0],
                "Excluded stopwords and non-words+lemmatization+stemming"
            ])
            continue
        for i in range(0, 3):
            results.append([
                i + 1, num_most_common, accuracies[i][1], classifier,
                accuracies[i][0],
                "Excluded stopwords and non-words+lemmatization+stemming"
            ])
        averages = np.average(accuracies, axis=0)
        results.append([
            "Average", num_most_common, averages[1], classifier, averages[0],
            "Excluded stopwords and non-words+lemmatization+stemming"
        ])
    print(scores)

{'KNN': [(0.704, 0.08505049999803305)], 'SVM': [(0.73, 2.1081582000006165)], 'Logistic Regression': [(0.722, 5.38599659999818)], 'Random Forest': [(0.794, 1.407507900003111), (0.786, 1.3820592999982182), (0.792, 1.383310100001836)], 'Neural Network': [(0.774, 1.2517707999977574), (0.734, 0.621055299998261), (0.728, 0.5421199000011256)]}
{'KNN': [(0.698, 0.13400309999997262)], 'SVM': [(0.738, 1.7878171000011207)], 'Logistic Regression': [(0.756, 1.524373900003411)], 'Random Forest': [(0.828, 1.3945811000012327), (0.808, 1.4977651000008336), (0.82, 1.499621599999955)], 'Neural Network': [(0.772, 1.9092453000012028), (0.736, 1.3199001000029966), (0.736, 0.9966940999984217)]}
{'KNN': [(0.652, 0.20761880000281963)], 'SVM': [(0.762, 3.561082099997293)], 'Logistic Regression': [(0.768, 2.2367432999999437)], 'Random Forest': [(0.824, 1.627124200000253), (0.83, 1.7439293999996153), (0.828, 1.7325737000028312)], 'Neural Network': [(0.79, 3.34192880000046), (0.774, 0.8196637000000919), (0.782, 1.

### Задача 6

In [23]:
bag_of_words_dataset = get_dataset(2500, lemmatize=True)
train_set, test_set = split_dataset(bag_of_words_dataset)
best_model = random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion="entropy",
    min_samples_leaf=5,
    class_weight="balanced",
    n_jobs=-1)
best_model.fit(train_set[0], train_set[1])
best_model.score(test_set[0], test_set[1])

0.858

In [26]:
tf_idf_dataset = get_dataset(2500, lemmatize=True, tf_idf=True)
train_set, test_set = split_dataset(tf_idf_dataset)
best_model.fit(train_set[0], train_set[1])
best_model.score(test_set[0], test_set[1])

0.84

In [24]:
columns = [
    "Iteration", "Number of Features", "Duration (seconds)", "Classifier",
    "Accuracy", "Comment"
]
results_data_frame = pandas.DataFrame(data=results, columns=columns)
writer = pandas.ExcelWriter('results.xlsx')
results_data_frame.to_excel(writer, 'Results')
writer.save()

In [25]:
results_data_frame

Unnamed: 0,Iteration,Number of Features,Duration (seconds),Classifier,Accuracy,Comment
0,1,500,2.075459,KNN,0.694000,
1,1,500,1.453194,SVM,0.774000,
2,1,500,3.425492,Logistic Regression,0.766000,
3,1,500,1.394826,Random Forest,0.806000,
4,2,500,1.404930,Random Forest,0.816000,
5,3,500,1.284641,Random Forest,0.822000,
6,Average,500,1.361466,Random Forest,0.814667,
7,1,500,1.572860,Neural Network,0.794000,
8,2,500,0.692495,Neural Network,0.732000,
9,3,500,0.720838,Neural Network,0.770000,
