# Assignment 5 - Text Analysis
An explanation of this assignment can be found in the accompanying .pdf document.


## Materials to review for this assignment
<h4>From Moodle:</h4> 
<h5><u>Review the notebooks regarding the following python topics</u>:</h5>
<div class="alert alert-info">
&#x2714; <b>Working with strings</b> (tutorial notebook)<br/>
&#x2714; <b>Text Analysis</b> (tutorial notebook)<br/>
&#x2714; <b>Hebrew text analysis tools (tokenizer, wordnet)</b> (moodle example)<br/>
&#x2714; <b>(brief review) All previous notebooks</b><br/>
</div> 
<h5><u>Review the presentations regarding the following topics</u>:</h5>
<div class="alert alert-info">
&#x2714; <b>Text Analysis</b> (lecture presentation)<br/>
&#x2714; <b>(brief review) All other presentations</b><br/>
</div>

## Preceding Step - import modules (packages)
This step is necessary in order to use external modules (packages). <br/>

In [None]:
# --------------------------------------
import pandas as pd
import numpy as np
# --------------------------------------


# --------------------------------------
# ------------- visualizations:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# --------------------------------------


# ---------------------------------------
import sklearn
from sklearn import preprocessing, metrics, pipeline, model_selection, feature_extraction 
from sklearn import naive_bayes, linear_model, svm, neural_network, neighbors, tree
from sklearn import decomposition, cluster

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron, SGDClassifier, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# ---------------------------------------


# ----------------- output and visualizations: 
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
# Show the output of every expression in a cell (not only the last one), which lets us condense several results into one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# ---------------------------------------

### Text analysis and String manipulation imports:

In [None]:
# --------------------------------------
# --------- Text analysis and Hebrew text analysis imports:
# vectorizers:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# regular expressions:
import re
# --------------------------------------

### (optional) Hebrew text analysis - WordNet (for Hebrew)
Note: the WordNet is not a must

#### (optional) Only if you didn't install Wordnet (for Hebrew) use:

In [None]:
# word net installation:

# uncomment the lines below if you want to use WordNet and still need to install it
# !pip install wn
# !python -m wn download omw-he:1.4

In [None]:
# word net import:

# uncomment the import below if you want to use WordNet:
# import wn

### (optional) Hebrew text analysis - hebrew_tokenizer (Tokenizer for Hebrew)
Note: the hebrew_tokenizer is not a must

#### (optional) Only if you didn't install hebrew_tokenizer use:

In [None]:
# Hebrew tokenizer installation:

# comment out the next line if hebrew_tokenizer is already installed:
!pip install hebrew_tokenizer

In [None]:
# Hebrew tokenizer import:

# comment out the import below only if you do not use the Hebrew tokenizer (hebrew_text depends on it):
import hebrew_tokenizer as ht

### Reading input files
Reading the input files: the annotated training corpus (raw text data) and the test corpus

In [None]:
# Paths of the two CSV inputs, resolved relative to the notebook's working directory.
train_filename = 'annotated_corpus_for_train.csv'
test_filename  = 'corpus_for_test.csv'
# Load both corpora as UTF-8; index_col=None keeps every CSV column as a regular column.
df_train = pd.read_csv(train_filename, index_col=None, encoding='utf-8')
df_test  = pd.read_csv(test_filename, index_col=None, encoding='utf-8')

In [None]:
# Peek at the first 8 training rows and the (rows, columns) shape.
# Both expressions are displayed because ast_node_interactivity is set to "all" in the setup cell.
df_train.head(8)
df_train.shape

In [None]:
# Peek at the first 3 test rows and the (rows, columns) shape.
df_test.head(3)
df_test.shape

### Your implementation:
Write your code solution in the following code-cells

In [None]:
# Regex-based cleanup applied to every story before vectorization.
def remove_unnecessary_chars(series):
    """Return a cleaned copy of every text in `series`.

    For each text, in this order:
      1. periods, commas and colons are deleted;
      2. every one-character word is deleted;
      3. each run of whitespace is collapsed into a single space.

    Leading/trailing spaces left behind by the deletions are not trimmed.

    Parameters
    ----------
    series : iterable of str
        The texts to clean (e.g. a pandas Series of stories).

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'[.,:]', ''),
        (r'\b\w\b', ''),
        (r'\s+', ' '),
    )

    def clean(text):
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text

    return [clean(text) for text in series]

In [None]:
# Tokenizer callback handed to the vectorizers: keeps only Hebrew tokens.
def hebrew_text(story):
    """Tokenize `story` and return only the tokens classified as Hebrew.

    Parameters
    ----------
    story : str
        The raw text to tokenize.

    Returns
    -------
    list
        The tokens, in order of appearance, whose group reported by
        `ht.tokenize` is exactly 'HEBREW'; every other group is dropped.
    """
    return [
        token
        for grp, token, token_num, (start_index, end_index) in ht.tokenize(story)
        if grp == 'HEBREW'
    ]

In [None]:
# Clean the training stories with the regex helper; X is the resulting list of cleaned texts.
X = remove_unnecessary_chars(df_train.story)

In [None]:
# Convert the gender labels to integer codes so the training target is numeric.
# NOTE(review): LabelEncoder numbers classes in sorted order, so for labels 'f'/'m'
# the mapping would be f -> 0, m -> 1 — confirm with encoder.classes_.
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(df_train.gender)

In [None]:
# Settings for the experiment runs

from enum import Enum

class VectOptions(Enum):
    """Vectorizer families tried in the experiment loop.

    Iterating the enum yields TfidfVectorizer first, then CountVectorizer.
    """
    # Plain integer value: a trailing comma here (`1,`) would silently make
    # the member's value the tuple (1,) instead of the intended 1.
    TfidfVectorizer = 1
    CountVectorizer = 2

class ModelOptions(Enum):
    """Classifier families compared in the experiment loop.

    get_model_option and get_param_grid branch on these members; iteration
    order is the declaration order below.
    """
    DecisionTreeClassifier = 1
    KNeighborsClassifier = 2
    MultinomialNB = 3

# Candidate `ngram_range` values passed to the vectorizers.
ngram_list = [(1, 1), (1, 2), (1, 3)]
# Candidate `min_df` values passed to the vectorizers.
min_df = [3, 5 ,7]
# Candidate `max_features` values passed to the vectorizers.
max_features = [10000, 15000, 20000]
# Macro-averaged F1 scorer shared by cross_val_score and GridSearchCV.
scoring = make_scorer(f1_score, average='macro')
# Value passed as `cv=` to every cross-validation call (10 folds).
cv = 10

In [None]:
# Supported models: DecisionTreeClassifier, KNeighborsClassifier and MultinomialNB

def get_model_option(model_option):
    """Build a fresh classifier, with default hyperparameters, for `model_option`.

    Parameters
    ----------
    model_option : ModelOptions
        The classifier family to instantiate.

    Returns
    -------
    A new DecisionTreeClassifier or KNeighborsClassifier for the matching
    member; any other value yields a new MultinomialNB.
    """
    explicit_choices = (
        (ModelOptions.DecisionTreeClassifier, DecisionTreeClassifier),
        (ModelOptions.KNeighborsClassifier, KNeighborsClassifier),
    )
    for option, factory in explicit_choices:
        if model_option == option:
            return factory()

    # Fallback for every remaining option.
    return MultinomialNB()

In [None]:
# Mean cross-validation score using the estimator's default scorer

def get_cross_validation(model, X, y):
    """Return the mean score of `model` over `cv` cross-validation folds.

    No `scoring` argument is given, so scikit-learn uses the estimator's own
    `score` method (mean accuracy for the classifiers used in this notebook).

    Parameters
    ----------
    model : estimator to evaluate.
    X : feature matrix.
    y : target labels.

    Returns
    -------
    The mean of the per-fold scores.
    """
    fold_scores = cross_val_score(estimator=model, X=X, y=y, cv=cv)
    return fold_scores.mean()

In [None]:
# Mean macro-F1 over the cross-validation folds

def get_f1_average_score(model, X, y):
    """Return the mean of the per-fold scores produced by the shared `scoring` scorer.

    Parameters
    ----------
    model : estimator to evaluate.
    X : feature matrix.
    y : target labels.

    Returns
    -------
    The mean of the `cv` per-fold scores (macro-averaged F1, per `scoring`).
    """
    fold_scores = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=cv)
    return fold_scores.mean()

In [None]:
# Evaluate one model family on one feature matrix

def get_results(model_option, X, y):
    """Evaluate a model family with default and with grid-searched hyperparameters.

    Parameters
    ----------
    model_option : ModelOptions
        The classifier family to evaluate.
    X : feature matrix (already vectorized and normalized).
    y : target labels.

    Returns
    -------
    tuple of five values, in this order:
        1. mean cross-validation score of the default model (default scorer);
        2. mean cross-validation macro-F1 of the default model;
        3. best mean cross-validation macro-F1 found by the grid search;
        4. the hyperparameters that achieved (3);
        5. the estimator configured with (4) and refit on all of `X`, `y`.

    The fifth value is the tuned estimator, so a caller that keeps
    "the best model" keeps the one that actually produced scores (3)/(4).
    """
    clf = get_model_option(model_option)

    # cross_val_score and GridSearchCV fit clones of `clf`, so the template
    # itself does not need to be fitted beforehand.
    accuracy_cross_validation = get_cross_validation(clf, X, y)
    f1_average_score_cross_validation = get_f1_average_score(clf, X, y)

    grid_search = GridSearchCV(
        clf,
        param_grid=get_param_grid(model_option),
        scoring=scoring,
        cv=cv,
    )
    grid_search.fit(X, y)

    # With GridSearchCV's default refit=True, best_estimator_ is already
    # refit on the full data using best_params_.
    return (
        accuracy_cross_validation,
        f1_average_score_cross_validation,
        grid_search.best_score_,
        grid_search.best_params_,
        grid_search.best_estimator_,
    )

In [None]:
# Hyperparameter grid for each model family

def get_param_grid(model_option):
    """Return the GridSearchCV parameter grid for `model_option`.

    Parameters
    ----------
    model_option : ModelOptions
        The classifier family being tuned.

    Returns
    -------
    dict
        A new dict mapping hyperparameter names to candidate values:
        `max_depth` for the decision tree, `n_neighbors` for k-nearest
        neighbours, and `alpha` / `fit_prior` for any other option.
    """
    if model_option == ModelOptions.DecisionTreeClassifier:
        return {'max_depth': [None, 5, 10, 15, 20]}

    if model_option == ModelOptions.KNeighborsClassifier:
        return {'n_neighbors': [3, 5, 7, 9]}

    # Fallback grid (used for MultinomialNB).
    fallback_grid = {}
    fallback_grid['alpha'] = [1, 5, 10, 15]
    fallback_grid['fit_prior'] = [False, True]
    return fallback_grid

In [None]:
# Build an (unfitted) vectorizer for the requested option

def vectorizer(vect_option, max_feature, ngram, min_d):
    """Create an unfitted text vectorizer that tokenizes with `hebrew_text`.

    Parameters
    ----------
    vect_option : VectOptions
        TfidfVectorizer selects a TfidfVectorizer; any other value selects
        a CountVectorizer.
    max_feature : value for the vectorizer's `max_features`.
    ngram : value for the vectorizer's `ngram_range`.
    min_d : value for the vectorizer's `min_df`.

    Returns
    -------
    The configured, not yet fitted, vectorizer.
    """
    # Both vectorizer classes receive exactly the same settings.
    shared_settings = dict(
        tokenizer=hebrew_text,
        analyzer='word',
        max_features=max_feature,
        ngram_range=ngram,
        min_df=min_d,
        decode_error='ignore',
    )

    if vect_option == VectOptions.TfidfVectorizer:
        return TfidfVectorizer(**shared_settings)
    return CountVectorizer(**shared_settings)

In [None]:
# Create the vectorizer, fit it on X and L2-normalize the resulting features

def vectorizer_and_normalize(vect_option, X, max_feature, ngram, min_d):
    """Vectorize `X` and L2-normalize each row of the resulting matrix.

    Parameters
    ----------
    vect_option : VectOptions
        Which vectorizer family to build (see `vectorizer`).
    X : iterable of str
        The texts to learn the vocabulary from and to transform.
    max_feature, ngram, min_d :
        Forwarded to `vectorizer` as max_features / ngram_range / min_df.

    Returns
    -------
    (vect_X, vect)
        The row-wise L2-normalized feature matrix and the fitted vectorizer.
    """
    vect = vectorizer(vect_option, max_feature, ngram, min_d)

    # fit_transform learns the vocabulary and builds the matrix in a single
    # pass; a separate fit() beforehand would tokenize the corpus twice
    # for the same result.
    vect_X = vect.fit_transform(X)
    vect_X = preprocessing.normalize(vect_X, norm='l2')

    return vect_X, vect

In [None]:
# Exhaustive experiment: for every combination of max_features, ngram range,
# min_df and vectorizer type, build the features once and evaluate every model
# family on them, tracking the best scores seen so far.
# NOTE: best_model, best_vect and the other best_* settings are only assigned
# once some grid-search score exceeds the initial 0.
best_score_f1_average = 0
best_score_grid_search = 0

for max_feature in max_features:
    print("##########################################")
    print(f"The max feature is {max_feature}")

    for ngram in ngram_list:
        print(f"The ngram is {ngram}")

        for min_d in min_df:
            print(f"The min dataframe is {min_d}")

            for vect_option in VectOptions:
                print(f"The vect option is {vect_option}")

                # Features depend only on the vectorizer settings, so they are
                # built once here and shared by all models below.
                vect_X, vect = vectorizer_and_normalize(vect_option, X, max_feature, ngram, min_d)

                for model_option in ModelOptions:
                    print(f"The model is {model_option}")

                    accuracy_cross_validation, \
                    f1_average_score_cross_validation, \
                    grid_search_f1_average_score_cross_validation, \
                    grid_search_f1_average_score_cross_validation_parameters, \
                    clf = get_results(model_option, vect_X, y)

                    print(f"Cross validation results accuracy: {accuracy_cross_validation}")
                    print(f"Average F1 score: {f1_average_score_cross_validation}")

                    # Best macro-F1 without hyperparameter tuning (only the score is kept).
                    if f1_average_score_cross_validation > best_score_f1_average:
                        best_score_f1_average = f1_average_score_cross_validation

                    print(f"F1 average macro on grid search best Score for {model_option} is {grid_search_f1_average_score_cross_validation}")

                    # Best grid-search macro-F1: remember everything needed to reuse this setup for prediction.
                    if grid_search_f1_average_score_cross_validation > best_score_grid_search:
                        best_score_grid_search = grid_search_f1_average_score_cross_validation # Saving the best score
                        best_model = clf # Saving the estimator returned by get_results for prediction
                        best_max_feature = max_feature # Saving the best max feature
                        best_ngram = ngram # Saving the best ngram
                        best_min_d = min_d # Saving the best min df
                        best_vect = vect # Saving the fitted vectorizer that produced these features
                        best_parameters = grid_search_f1_average_score_cross_validation_parameters # Saving the best parameters

    # Closing separator for the current max_features value.
    print("##########################################")

In [None]:
# Best macro-F1 reached with default hyperparameters (no grid search) over all configurations.
print(f"The best F1 average score is {best_score_f1_average}")

In [None]:
# Summary of the configuration that achieved the best grid-search macro-F1.
# NOTE: the vectorizer type is not printed here; it is available as best_vect.
print(f"The best grid search score is {best_score_grid_search}, "
      f"the model is {best_model} "
      f"with the ngram = {best_ngram}, "
      f"min_df = {best_min_d} and "
      f"max_feature = {best_max_feature} "
      f"best parameters = {best_parameters}")

In [None]:
# Predict the test corpus with the best configuration found above.
# The test stories must go through exactly the same steps as the training
# stories: regex cleanup -> vectorizer -> row-wise L2 normalization. Skipping
# any of them would feed the model features on a different scale or vocabulary
# footing than the ones it was trained on.
X_test = remove_unnecessary_chars(df_test.story)
vect_X_test = preprocessing.normalize(best_vect.transform(X_test), norm='l2')
predictions = best_model.predict(vect_X_test)

# NOTE(review): `predictions` holds the integer codes produced by `encoder`;
# use encoder.inverse_transform(predictions) if the original gender labels
# are required in the output — confirm against the assignment's expected format.
df_predict = pd.DataFrame({'test_example_id': df_test.test_example_id, 'predictions': predictions})
df_predict.head()
df_predict.tail()