In [7]:
import os
from abc import ABC, abstractmethod

import pandas as pd
import numpy as np
import swifter
import nltk
import string
from typing import List

from nltk.corpus import stopwords
from nltk import PorterStemmer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import tensorflow as tf
import keras


# pre Process

In [3]:
def load_data(path: str) -> pd.DataFrame:
    rows_list = []
    _, authors, _ = next(os.walk(path))
    for author_name in authors:
        curr_row = {"author_name": author_name}
        author_path = os.path.join(path, author_name)
        _, _, books_files = next(os.walk(author_path))
        for book_name in books_files:
            curr_row["book_name"] = book_name
            with open(os.path.join(author_path, book_name), "r") as book:
                curr_row["book_text"] = book.read()
            rows_list.append(curr_row.copy())
    return pd.DataFrame(rows_list)

In [4]:
def chunck(df: pd.DataFrame, chunck_size: int = 500, text_colum: str = 'book_text') -> pd.DataFrame:
    rows = []
    for _, row in df.iterrows():
        num_chuncks = len(row[text_colum]) // chunck_size
        for i in range(num_chuncks - 1):
            tmp_row = row.copy()
            tmp_row[text_colum] = tmp_row[text_colum][i*chunck_size : (i+1)*chunck_size]
            rows.append(tmp_row.copy())
        tmp_row = row.copy()
        tmp_row[text_colum] = tmp_row[text_colum][(num_chuncks - 1)*chunck_size : ]
        rows.append(tmp_row.copy())
    return pd.DataFrame(rows)
        

In [5]:
df = load_data('C50/C50train')

In [6]:
df

Unnamed: 0,author_name,book_name,book_text
0,AaronPressman,106247newsML.txt,The Internet may be overflowing with new techn...
1,AaronPressman,120600newsML.txt,The U.S. Postal Service announced Wednesday a ...
2,AaronPressman,120683newsML.txt,Elementary school students with access to the ...
3,AaronPressman,136958newsML.txt,An influential Internet organisation has backe...
4,AaronPressman,137498newsML.txt,An influential Internet organisation has backe...
...,...,...,...
2495,WilliamKazer,28223newsML.txt,China's central bank chief has said that infla...
2496,WilliamKazer,282935newsML.txt,"China ushered in 1997, a year it has hailed as..."
2497,WilliamKazer,287736newsML.txt,China issued tough new rules on the handling o...
2498,WilliamKazer,289747newsML.txt,China will avoid bold moves in tackling its ai...


In [7]:
df2 = chunck(df)

In [8]:
df2

Unnamed: 0,author_name,book_name,book_text
0,AaronPressman,106247newsML.txt,The Internet may be overflowing with new techn...
0,AaronPressman,106247newsML.txt,"te, which collects reports directly from consu..."
0,AaronPressman,106247newsML.txt,"over $6 million, promising investors they cou..."
0,AaronPressman,106247newsML.txt,"omputer equipment, such as memory chips or sou..."
1,AaronPressman,120600newsML.txt,The U.S. Postal Service announced Wednesday a ...
...,...,...,...
2499,WilliamKazer,304402newsML.txt,prosperous.\nState television estimated 224 mi...
2499,WilliamKazer,304402newsML.txt,turned a backward Stalinist state into an eco...
2499,WilliamKazer,304402newsML.txt,the drama from an otherwise predictable serie...
2499,WilliamKazer,304402newsML.txt,returns Hong Kong to China this year.\nThe la...


In [9]:
df2['author_name'] = pd.factorize(df2['author_name'])[0]

In [10]:
df2

Unnamed: 0,author_name,book_name,book_text
0,0,106247newsML.txt,The Internet may be overflowing with new techn...
0,0,106247newsML.txt,"te, which collects reports directly from consu..."
0,0,106247newsML.txt,"over $6 million, promising investors they cou..."
0,0,106247newsML.txt,"omputer equipment, such as memory chips or sou..."
1,0,120600newsML.txt,The U.S. Postal Service announced Wednesday a ...
...,...,...,...
2499,49,304402newsML.txt,prosperous.\nState television estimated 224 mi...
2499,49,304402newsML.txt,turned a backward Stalinist state into an eco...
2499,49,304402newsML.txt,the drama from an otherwise predictable serie...
2499,49,304402newsML.txt,returns Hong Kong to China this year.\nThe la...


In [11]:
X, Y = pd.DataFrame(df2['book_text']), df2['author_name']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#  Features Extraction

## Dictionary Index

In [13]:
from nltk.corpus import words
nltk.download('words')

[nltk_data] Downloading package words to C:\Users\nir
[nltk_data]     son\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [84]:
 ps = PorterStemmer()
dictionary = dict(zip(words.words(), range(len(words.words()))))
def toIndexVector(text: str, vector_len: int = 150) -> np.array:
    vector = []
    for word in nltk.word_tokenize(text):
        stemed = ps.stem(word.lower())
        try:
            vector.append(dictionary[stemed])
        except:
            vector.append(-1)
        
    if(len(vector) < vector_len):
            vector = vector + [0]*(vector_len - len(vector))
    if(len(vector) > vector_len):
            vector = vector[:vector_len]
            
    return pd.Series(vector)    

In [85]:
x_train_index = x_train.copy(deep = True)
x_test_index = x_test.copy(deep = True)
x_test_index = pd.DataFrame(x_test_index['book_text'].swifter.apply(toIndexVector))
x_train_index = pd.DataFrame(x_train_index['book_text'].swifter.apply(toIndexVector))

  "This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`."


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4244.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9901.0, style=ProgressStyle(descriptio…




## bag of words

In [None]:
def __stem_text(text: str):
    ps = PorterStemmer()
    return (ps.stem(word) for word in word_tokenize(text))

In [68]:
vectorizer = CountVectorizer(min_df=0.003).fit(x_train['book_text'])

In [69]:
x_train_bag = pd.DataFrame(vectorizer.transform(x_train['book_text']).toarray())
x_test_bag = pd.DataFrame(vectorizer.transform(x_test['book_text']).toarray())

## word2vec

In [71]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [74]:
word2vec_output_file = '../glove.6B'+'.word2vec'
glove2word2vec('../glove.6B/glove.6B.50d.txt', word2vec_output_file)

  


(400000, 50)

In [75]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [98]:
 ps = PorterStemmer()
def word2vecDoc(text: str, doc_len: int = 100):
    vec = []
    for word in nltk.word_tokenize(text):
        try:
            we = model.get_vector(ps.stem(word.lower()))
        except:
            we = np.zeros(50)
        
        vec = np.concatenate([vec, we])
    
    if len(vec) < doc_len*50:
        vec = np.concatenate([vec, np.zeros(doc_len*50 - len(vec))])
    if len(vec) > doc_len*50:
        vec = vec[: doc_len*50]
    return pd.Series(vec)

In [99]:
x_train_word2vec = pd.DataFrame(x_train['book_text'].swifter.apply(word2vecDoc))
x_test_word2vec = pd.DataFrame(x_test['book_text'].swifter.apply(word2vecDoc))

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9901.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4244.0, style=ProgressStyle(descriptio…




## doc2vec

In [104]:
from gensim.models import Word2Vec
from gensim import models
word2vec_path = '../GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [107]:
ps = PorterStemmer()
def doc2vec(text: str):
    vec = np.zeros(300)
    for word in nltk.word_tokenize(text):
        try:
            we = word2vec.get_vector(ps.stem(word.lower()))
        except:
            we = np.zeros(300)
        
        vec = vec + we
    return pd.Series(vec)

In [108]:
x_train_doc2vec = pd.DataFrame(x_train['book_text'].swifter.apply(doc2vec))
x_test_doc2vec = pd.DataFrame(x_test['book_text'].swifter.apply(doc2vec))

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9901.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4244.0, style=ProgressStyle(descriptio…




## simple style

In [28]:
nltk.download('punkt')
TEXT_COLUMN_LABEL = 'book_text'
class FeaturesExtraction:
    @staticmethod
    def __stem_text(text: str):
        ps = PorterStemmer()
        return (ps.stem(word) for word in word_tokenize(text))

    @staticmethod
    def avg_word_len(df: pd.DataFrame, text_column_label: str = 'book_text') -> pd.DataFrame:
        avg_word_len = df[text_column_label].astype(str).swifter.apply(
            lambda s: pd.Series(nltk.word_tokenize(s)).map(len).mean()).rename("avg_word_len")
        return pd.DataFrame(avg_word_len)

    @staticmethod
    def avg_sentence_len(df: pd.DataFrame, text_column_label: str = 'book_text') -> pd.DataFrame:
        sentence_count = df[text_column_label].astype(str).swifter.apply(
            lambda text: pd.Series(nltk.sent_tokenize(text)).map(
                lambda sent: len(nltk.word_tokenize(sent))).mean()).rename("avg_sentence_len")

        return pd.DataFrame(sentence_count)

    @staticmethod
    def punctuation_marks(df: pd.DataFrame, text_column_label: str = 'book_text') -> pd.DataFrame:
        to_return = pd.DataFrame()
        i = 1
        for mark in list(string.punctuation):
            to_return[f'punctuation{i}'] = df[text_column_label].astype(str).apply(lambda s: s.count(mark) / len(s))
            i += 1
        return to_return

    @staticmethod
    def stop_words(df: pd.DataFrame, text_column_label: str = 'book_text') -> pd.DataFrame:
        to_return = pd.DataFrame()
        for word in list(stopwords.words('english')):
            to_return[word] = df[text_column_label].astype(str).apply(lambda s: s.count(word) / len(s))
        return to_return

    @staticmethod
    def pos_count(df: pd.DataFrame) -> pd.DataFrame:
        def group_pos(tag):
            groups = {"noun": ['NN', 'NNS', 'NNP', 'NNPS'], "verb": ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
                      "adverb": ['RB', 'RBR', 'RBS'], "adjective": ['JJ', 'JJR', 'JJS']}
            for key, group in groups.items():
                if tag in group:
                    return key
            return None

        features = df[TEXT_COLUMN_LABEL].astype(str).swifter.apply(
            lambda s: pd.Series([x[1] for x in nltk.pos_tag(nltk.word_tokenize(s))]).
                apply(group_pos).value_counts(normalize=True).copy())
        features = features.fillna(0)
        return features

[nltk_data] Downloading package punkt to C:\Users\nir
[nltk_data]     son\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
def simple_style(x) -> pd.DataFrame:
    x = pd.concat(
        [FeaturesExtraction.pos_count(x),
         FeaturesExtraction.stop_words(x),
         FeaturesExtraction.avg_word_len(x),
         FeaturesExtraction.avg_sentence_len(x),
         FeaturesExtraction.punctuation_marks(x)], axis=1)
    return x

In [30]:
x_train_simple_style = simple_style(x_train)
x_test_simple_style = simple_style(x_test)

  "This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`."


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9901.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9901.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9901.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4244.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4244.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=4244.0, style=ProgressStyle(descriptio…




# model

In [25]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [26]:
def pipeline(x_train: pd.DataFrame, x_test: pd.DataFrame):
    model = xgb.XGBClassifier(use_label_encoder=False)
    model.fit(x_train, y_train)
    pre = model.predict(x_test)
    print('accuracy = {}'.format((pre == y_test).mean()))

In [131]:
pipeline(x_train_index, x_test_index)

accuracy = 0.08953817153628653


In [133]:
pipeline(x_train_bag, x_test_bag)

accuracy = 0.6979264844486334


In [134]:
pipeline(x_train_word2vec, x_test_word2vec)

accuracy = 0.18355325164938738


In [132]:
pipeline(x_train_doc2vec, x_test_doc2vec)

accuracy = 0.5351083883129123


In [31]:
pipeline(x_train_simple_style, x_test_simple_style)

accuracy = 0.339066918001885


In [12]:
np.sum([[1,2,3],[1,2,3],[1,2,3]], axis=0)

array([3, 6, 9])