## Encoding
### CountVectorizer

In [0]:
import pandas as pd
import numpy as np
import html
import string
import re
from datasets import load_dataset

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Toy corpus: three short documents used to illustrate bag-of-words counting.
text = [
    "Hi there and hi",
    "today we are talking about text and its treatment by computer",
    "you are all experts of nlp after today",
]
# stop_words="english" removes scikit-learn's built-in English stop words before counting.
coun_vect = CountVectorizer(stop_words="english")
count_matrix = coun_vect.fit_transform(text)
count_array = count_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
df = pd.DataFrame(data=count_array, columns=coun_vect.get_feature_names_out())

In [0]:
print(df)

## Import dataset

Large Movie Review Dataset. 

This is a dataset for binary sentiment classification containing
a set of 25,000 highly polar movie reviews for training, and 25,000 for testing.

In [0]:
# Enable Git LFS, then clone the IMDB dataset repository from the Hugging Face Hub
# into a local ./imdb directory.
# NOTE(review): load_dataset("imdb") in the next cell may then resolve to this local
# directory instead of the Hub — confirm this is intended.
!git lfs install
!git clone https://huggingface.co/datasets/imdb

In [0]:
# Load the IMDB train and test splits as two separate datasets.
dataset_train = load_dataset("imdb", split="train")
dataset_test = load_dataset("imdb", split="test")

In [0]:
# Shuffle the training split with a fixed seed so the row order is the same on every run.
dataset_train = dataset_train.shuffle(seed=42)

In [0]:
# Turn each split into a two-column frame: one row per review, (text, label).
df_train, df_test = (
    pd.DataFrame(list(zip(split["text"], split["label"])), columns=["text", "label"])
    for split in (dataset_train, dataset_test)
)

In [0]:
# Show the first training review, preceded by a header and a blank line.
print("Text example:", "", df_train["text"][0], sep="\n")

In [0]:
# Number of training reviews per sentiment label.
df_train["label"].value_counts()

In [0]:
df_test["label"].value_counts()

## Text processing

In [0]:
def clean_text(text):
    """Normalize one raw text into lowercase, space-separated tokens.

    Steps, in order: drop backslash escape residue, unescape HTML entities,
    flatten line breaks, lowercase, remove URLs, split tokens glued together by
    punctuation, blank out digits, remove punctuation, remove a few hard-coded
    words, collapse whitespace.

    Angle brackets are not part of the punctuation set, so markup such as
    ``<br />`` is reduced to ``<br >`` rather than removed.

    :param text: text, single observation
    :type text: string
    :return: cleaned text
    :rtype: string
    """

    # Formatting: drop a backslash and the non-space run after it, e.g. \\xe2\\x80\\xb
    text = re.sub(r"\\\S+", "", text)
    text = html.unescape(text)  # turn HTML entities (&amp;, &quot;, ...) back into characters

    # replace line breaks with spaces, remove carriage returns
    text = text.replace("\n", " ").replace("\r", "")

    text = text.lower()

    # common links
    text = re.sub(r"http\S*", "", text)

    # Punctuation glued between two non-space characters becomes a space.
    # Lookarounds keep the neighbours out of the match, so every separator in a
    # chain such as "a+b+c" is replaced, not just every other one.
    text = re.sub(r"(?<=\S)[#,.+*](?=\S)", " ", text)

    # no digits
    text = text.translate(str.maketrans(string.digits, " " * len(string.digits)))

    # Remaining punctuation (plus trailing spaces) becomes one space.
    # The square brackets must be escaped: unescaped, the first "]" closed the
    # character class early and the pattern could never match anything.
    text = re.sub(r"[(){}\"'’,:.;@#?!&%«»\[\]$/\\]+ *", " ", text)

    text = text.strip()  # strip leading/trailing spaces

    # remove the standalone words "html", "php" and "too"
    text = re.sub(r"\bhtml\b|\bphp\b|\btoo\b", " ", text)

    text = " ".join(text.split())  # collapse runs of whitespace

    return text

In [0]:
# Clean the review text of both frames in place.
for frame in (df_train, df_test):
    frame["text"] = frame["text"].map(clean_text)

#### Modelling

In [0]:
import nltk
import sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import f1_score
from time import time


In [0]:
from nltk.corpus import stopwords

In [0]:
# Fetch the NLTK stop-word corpus that stopwords.words("english") reads in the next cell.
nltk.download("stopwords")

In [0]:
stopwords_list = stopwords.words("english")
# Shuffled copy of the training frame, seeded so that re-running the cell gives
# the same order. Note that the split below reads df_train (built from the
# dataset already shuffled with seed=42), not shuffle_df.
shuffle_df = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

y_train, t_train = df_train["label"], df_train["text"]
y_test, t_test = df_test["label"], df_test["text"]

# Bag-of-words vectorizer on unigrams: terms found in fewer than 10 documents
# or in more than 80% of documents are dropped, as are the NLTK stop words.
vectorizer = CountVectorizer(
    min_df=10,
    ngram_range=(1, 1),
    max_df=0.8,
    stop_words=stopwords_list,
    lowercase=True,
)
# Fit the vocabulary on the training texts only, then reuse it for the test texts.
X_train = vectorizer.fit_transform(t_train)
X_test = vectorizer.transform(t_test)

In [0]:
def get_data(train_set, test_set):
    """Separate texts from labels in both sets and vectorize the texts.

    The two frames are used as given: no stratification or resampling is done
    here. The vectorizer is fitted on the training texts only and then applied
    to the test texts.

    :param train_set: dataframe with a "text" column and a "label" column
    :param test_set: dataframe with the same two columns
    :return: X_train, y_train, X_test, y_test, vectorizer

    """

    stopwords_list = stopwords.words("english")

    t0 = time()
    y_train, t_train = train_set["label"], train_set["text"]
    y_test, t_test = test_set["label"], test_set["text"]
    print("Time for train/test split = %4.2f seconds" % (time() - t0))

    vectorizer = CountVectorizer(
        min_df=10,
        ngram_range=(1, 1),
        max_df=0.8,
        stop_words=stopwords_list,
        lowercase=True,
    )

    # Restart the clock so the figure below covers vectorization only.
    t0 = time()
    X_train = vectorizer.fit_transform(t_train)
    X_test = vectorizer.transform(t_test)
    print("Time to vectorize texts= %4.2f seconds" % (time() - t0))
    return X_train, y_train, X_test, y_test, vectorizer




In [0]:
# How big is our vocabulary?
X_train

In [0]:
def train_test_model(classifier, X_train, X_test, y_train, y_test):
    """Fit a classifier, score it on the test set and cross-validate it on the train set.

    :param classifier: estimator exposing fit / predict
    :param X_train: train set
    :param X_test: test set
    :param y_train: train labels
    :param y_test: test labels
    :return: list [mean 5-fold cross-validated F1 on the train set,
        F1 on the test set, fit time in seconds, test prediction time in seconds]

    """
    k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

    t0 = time()
    classifier.fit(X_train, y_train)  # train
    time_train = time() - t0

    # Time the test-set prediction only. The timer is read before cross-validation
    # starts, so the reported test time no longer includes the five CV fits.
    t0 = time()
    predicted = classifier.predict(X_test)  # predict
    time_pred = time() - t0

    scores = cross_validate(classifier, X_train, y_train, scoring="f1", cv=k_fold)
    score = np.mean(scores["test_score"])
    f1_test = f1_score(y_test, predicted)

    return [score, f1_test, time_train, time_pred]


def benchmark_classifiers(CLF):
    """Train and evaluate every classifier in ``CLF`` and tabulate the metrics.

    Features and labels are not passed in: the module-level ``X_train``,
    ``X_test``, ``y_train`` and ``y_test`` are read at call time.

    :param CLF: dictionary mapping a display name to a classifier
    :return: dataframe with one row per classifier, values rounded to 2 decimals

    """
    header = ["Classifier", "F1 Score cv", "F1 Score", "Train time", "Test time"]
    rows = [
        [
            label,
            *train_test_model(
                model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
            ),
        ]
        for label, model in CLF.items()
    ]
    return pd.DataFrame(rows, columns=header).round(decimals=2)

In [0]:
# Candidate linear models, all trained with SGD. random_state is fixed on each
# one so that repeated runs of the benchmark give the same scores.
CLF = {}
CLF["SGDl1"] = SGDClassifier(
    penalty="elasticnet",
    n_jobs=-1,
    loss="hinge",
    class_weight="balanced",
    alpha=0.00001,
    random_state=42,
)
CLF["SGDl2"] = SGDClassifier(
    penalty="l2",
    n_jobs=-1,
    loss="modified_huber",
    class_weight="balanced",
    random_state=42,
)
# loss="log" was renamed to "log_loss" in scikit-learn 1.1 and removed in 1.3.
CLF["SGD3"] = SGDClassifier(
    learning_rate="optimal",
    loss="log_loss",
    random_state=42,
)
CLF["SGDl4"] = SGDClassifier(n_jobs=-1, class_weight="balanced", random_state=42)
resDF = benchmark_classifiers(CLF)
print(resDF)

### TfidfVectorizer

### À vous de jouer ! (Your turn!)

In [0]:
#TO DO
#vectorizer = 

In [0]:
# Re-vectorize the texts with whatever `vectorizer` is currently bound to.
# This overwrites the global X_train / X_test that benchmark_classifiers reads.
X_train = vectorizer.fit_transform(t_train)
X_test = vectorizer.transform(t_test)

In [0]:
# Run the same benchmark again on the current global X_train / X_test.
resDF = benchmark_classifiers(CLF)
print(resDF)

### Let's check how it works

In [0]:
import lime
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import Pipeline

In [0]:
# Refit vectorizer + classifier as one pipeline on the training texts.
# The `vectorizer` object is the one used in the cells above, so fitting the
# pipeline refits it in place.
# A logistic loss is used because predict_proba is called on this pipeline later;
# loss="log" was renamed to "log_loss" in scikit-learn 1.1 and removed in 1.3.
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("clf", SGDClassifier(n_jobs=-1, class_weight="balanced", loss="log_loss", random_state=42)),
])
pipeline.fit(df_train["text"], df_train["label"])

### Let's test the model's logic

In [0]:
import textwrap
reviews_test = df_test["text"]
sentiments_test = df_test["label"]

# Pick one review from the test set to inspect.
idx = 210
text_sample = reviews_test[idx]
class_names = ['negative', 'positive']

# Review text, wrapped at 400 characters, between two separator lines.
print(f'Review ID-{idx}:')
print('-' * 50)
print('Review Text:\n', textwrap.fill(text_sample, 400))
print('-' * 50)

# Class probabilities from the pipeline: column 1 is printed as positive, column 0 as negative.
print('Probability(positive) =', pipeline.predict_proba([text_sample])[0, 1])
print('Probability(negative) =', pipeline.predict_proba([text_sample])[0, 0])
print(f'Predicted class: {pipeline.predict([text_sample])}')
print(f'True class: {sentiments_test[idx]}')

In [0]:
import matplotlib
matplotlib.rcParams['figure.dpi']=300
%matplotlib inline


# Build a LIME text explainer whose output classes are labelled with class_names.
explainer = LimeTextExplainer(class_names=class_names)
# Explain the pipeline's prediction for text_sample, using at most 20 features.
explanation = explainer.explain_instance(text_sample, 
                                         pipeline.predict_proba, 
                                         num_features=20)
# Render the explanation inline, including the highlighted review text.
explanation.show_in_notebook(text=True)

In [0]:
# Drop the scikit-learn Pipeline instance; the name `pipeline` is imported
# from transformers in the next cell.
del pipeline

### Transformers

In [0]:
from transformers import pipeline

# Sentiment-analysis pipeline from transformers. No model is named here, so the
# library's default checkpoint for the task is used; truncation=True is passed through.
classifier = pipeline("sentiment-analysis", truncation=True)

In [0]:
classifier("We hope you enjoy this tutorial.")

In [0]:
# Classify the first ten test reviews and map the "POSITIVE" label to 1, anything else to 0.
results = classifier(df_test["text"][:10].tolist())
predicted = [int(result["label"] == "POSITIVE") for result in results]

In [0]:
# Compare the transformer's predictions with the true labels of the first ten test reviews.
print("Predicted", predicted)
print("Actual", df_test["label"][:10])

### À vous de jouer ! (Your turn!)

In [0]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# choose the model from HuggingFace Hub
# follow https://huggingface.co/models?language=en&pipeline_tag=text-classification&sort=downloads

# TODO: replace with the checkpoint you picked on the Hub.
# The notebook's own example is used as the default so that this cell does not
# raise NameError on a fresh kernel.
model_name = "finiteautomata/bertweet-base-sentiment-analysis"

# Load the tokenizer and the classification model that belong to that checkpoint.
tokenizer_loaded = AutoTokenizer.from_pretrained(model_name)
model_loaded = AutoModelForSequenceClassification.from_pretrained(model_name)

# Wrap both in a text-classification pipeline and classify one cleaned sentence.
nlp = pipeline("text-classification", model=model_loaded, tokenizer=tokenizer_loaded)
text = clean_text("We hope you've enjoyed this tutorial.")
results = nlp(text)

In [0]:
print(results)