In [None]:
# Input locations: the folder holding the CSV files, plus the names of the
# labelled file and the unlabelled file (going by the file names themselves).
DATA_FOLDER = 'data'
TRAIN_FILE = 'book_review_labelled_data.csv'
PRED_FILE = 'book_review_test_data_unlabelled.csv'

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every bare expression in a cell, not only the last one;
# later cells list several expressions in a row and rely on this setting.
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
import os
import re
import string
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
import spacy.attrs
import pickle 


from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin, clone
from tqdm.notebook import tqdm




In [None]:
# Load the "en_core_web_sm" spaCy pipeline and register the "merge_entities"
# component directly after the "ner" component.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("merge_entities", after="ner")
# Stop-word collection taken from the loaded pipeline's language defaults.
en_stopwords = nlp.Defaults.stop_words

In [None]:
# Number of CPUs to consider, capped at 4. os.cpu_count() returns None when the
# count cannot be determined; `or 4` substitutes the fallback in that case.
cpu_count = min(os.cpu_count() or 4, 4)

In [None]:
# Full paths to the two CSV files, built from the folder and file-name constants.
TRAIN_PATH = os.path.join(DATA_FOLDER, TRAIN_FILE)
PRED_PATH = os.path.join(DATA_FOLDER, PRED_FILE)

In [None]:
# Echo both resolved paths as a quick visual check before loading anything.
TRAIN_PATH
PRED_PATH

In [None]:
# Read the CSV found at TRAIN_PATH into a DataFrame.
data = pd.read_csv(TRAIN_PATH)

In [None]:
def convert_series2docs(nlp_, data_: pd.Series, num_processes: int):
    """Run every entry of ``data_`` through ``nlp_.pipe`` and collect the results.

    Parameters
    ----------
    nlp_
        Object exposing a ``pipe(texts, batch_size=..., n_process=...)`` method.
    data_ : pd.Series
        The texts to process; its length is used as the progress-bar total.
    num_processes : int
        Forwarded unchanged as ``n_process``.

    Returns
    -------
    list
        Whatever ``nlp_.pipe`` yields, in the order it yields it.
    """
    # Texts are handed to the pipeline in batches of 20.
    doc_stream = nlp_.pipe(data_, batch_size=20, n_process=num_processes)
    # Wrap the stream in a progress bar; consuming it drives the processing.
    progress = tqdm(doc_stream, total=len(data_))
    return list(progress)

In [None]:
# Parse every review with spaCy, leaving one of the counted CPUs free.
# The max(1, ...) guard matters on a single-core machine, where cpu_count - 1
# would be 0 and no worker process would be available to do the parsing.
# NOTE(review): nlp.pipe expects strings — confirm 'reviewText' has no missing values.
docs = convert_series2docs(nlp, data['reviewText'], max(1, cpu_count - 1))

In [None]:
def dataframe_analysis(df_: pd.DataFrame):
    """Summarise a DataFrame: structure report, descriptive statistics, value counts.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame to inspect; only read-only pandas calls are made on it.

    Returns
    -------
    tuple
        ``(info_text, describe_df, value_counts)`` where ``info_text`` is the
        text report produced by ``df_.info()``, ``describe_df`` is the result
        of ``df_.describe()``, and ``value_counts`` is a list with one
        ``value_counts()`` Series per column, in column order.
    """
    import io  # local import keeps this cell independent of the import cell

    # DataFrame.info() writes its report to a stream and returns None, so
    # assigning its result directly would always yield None. Capture the text.
    info_buffer = io.StringIO()
    df_.info(buf=info_buffer)
    info_text = info_buffer.getvalue()
    # Still print the report, exactly as a plain df_.info() call would.
    print(info_text, end="")

    describe_df = df_.describe()
    value_counts = [df_[column].value_counts() for column in df_.columns]
    return info_text, describe_df, value_counts

In [None]:
# Summarise the loaded frame; the returned tuple is shown as the cell output.
dataframe_analysis(data)