In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.decomposition import TruncatedSVD

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string
import spacy
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from pandarallel import pandarallel
from tqdm.notebook import tqdm


In [15]:
nltk.download('punkt')
nltk.download('stopwords')

pandarallel.initialize(progress_bar=True)
tqdm.pandas()

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# data

In [16]:
data = pd.read_csv('data/en_data.csv')
data.drop('text', axis=1, inplace=True)
data.rename(columns={'value_en':'text'},inplace=True)
data

Unnamed: 0,label,text
0,Caída de objetos,Take out well keys. Key was removed.
1,Mantenimiento,Cabin fan does not work. Cable was disconnecte...
2,Suministro energía,The elevator started on the 23rd floor and sto...
3,"Golpe, Vandalismo",The elevator is blocked. They opened the hatch...
4,Caída de objetos,Take out well keys. Key was removed. She had f...
...,...,...
37385,Material,ELEVATOR OUT OF SERVICE. The equipment was fou...
37386,Material,ELEVATOR OUT OF SERVICE. An intermittent failu...
37387,Funcionando Bien,LOAD OUT OF SERVICE. Equipment was found worki...
37388,Mantenimiento,ROSE IN CABIN. Equipment was found operating w...


# Tokenization

In [17]:
def token_func(data):
    return nltk.word_tokenize(data)

# Stopwords

In [18]:
stop_words = set(stopwords.words('english'))

def pre_remove_stop_words(data):
    word_tokens = token_func(data)
    data = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(data)

def remove_stop_words(data):
  data['text'] = data['text'].parallel_apply(pre_remove_stop_words)
  return data


# Remove Ponctuation

In [19]:
def ponct_func(data):
  data = data.translate(str.maketrans('', '', string.punctuation))
  return data

def ponctuation_remov_func(data):
  data['text'] = data['text'].parallel_apply(ponct_func)
  return data

# Lowercase convertion

In [20]:
def lowercase_func(data):
    data['text'] = data['text'].str.lower()
    return data

# Stemming

In [21]:
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    word_tokens = word_tokenize(text)
    filtered_text = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(filtered_text)

def stemming_func(data):
    data['text'] = data['text'].parallel_apply(preprocess_text)
    return data

# Lemmatization

In [22]:
nlp = spacy.load('en_core_web_md')

# Function to preprocess and lemmatize text
def preprocess_and_lemmatize(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract lemmatized tokens and join them back into a string
    lemmatized_text = ' '.join([token.lemma_ for token in doc])

    return lemmatized_text

# Example function to apply preprocessing and lemmatization to a dataset column
def lemmatization_func(data):
    data['text'] = data['text'].parallel_apply(preprocess_and_lemmatize)
    return data

# Feature extraction TF_IDF

##### Total number of unique words in all the dataset, this number is 1,513,787 and 20,751 with lowercase
X_lower = lowercase_func(data)
X_lowe = X_lower['text']
token_data = X_lowe.apply(token_func)
flat_list = [token for sublist in token_data for token in sublist]
flat_list
print(len(flat_list))
unique_words = set(flat_list)
print(len(unique_words))

In [23]:
def TF_IDF_func(data, words_rate=0.1):
  token_data = data['text'].apply(token_func)
  flat_list = [token for sublist in token_data for token in sublist]
  unique_words = len(set(flat_list))
  tfidf_vectorizer = TfidfVectorizer(max_features= int(unique_words * words_rate))

  data_tfidf = tfidf_vectorizer.fit_transform(data['text'])
  return data_tfidf, tfidf_vectorizer

# Feature selection LSI

In [24]:
def LSI_func(data,  num_topics = 30):
  tfidf_matrix, tfidf_vectorizer = TF_IDF_func(data)
  lsa = TruncatedSVD(n_components=num_topics)
  data_lsa = lsa.fit_transform(tfidf_matrix)
  return data_lsa, lsa, tfidf_vectorizer

# Models

In [25]:
def model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_jobs=-1)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    rdf_accuracy = accuracy_score(y_test, predictions)
    rdf_f1 = f1_score(y_test, predictions, average='weighted')
    return rdf_accuracy, rdf_f1


# Code

In [26]:
metrics_data = pd.DataFrame({})
metrics_data[['Description','num_features', 'pipe_time', 'f1_rdf', 'acc_rdf']]=""
metrics_data


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf


In [27]:
data.shape

(37390, 2)

In [28]:
def pipe_1(data):
    start_time = time.time()
    
    #preprocesing
    X, _ = TF_IDF_func(data, 1)
    y = data['label']
    description = 'tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_1(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891


In [29]:
def pipe_2(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_2(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635


In [30]:
def pipe_3(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_3(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426


In [31]:
def pipe_4(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_4(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386


In [32]:
def pipe_5(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = lemmatization_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-lemma-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_5(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607


In [33]:
def pipe_6(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = stemming_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-stemm-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_6(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607
5,lower-ponct-stopw-stemm-tfidf,7978,12.423,0.717836,0.734154


In [34]:
def pipe_7(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = lemmatization_func(df)
    df = stemming_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-lemma-stemm-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_7(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607
5,lower-ponct-stopw-stemm-tfidf,7978,12.423,0.717836,0.734154
6,lower-ponct-stopw-lemma-stemm-tfidf,7864,49.424,0.715693,0.732415


In [35]:
def pipe_8(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_8(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607
5,lower-ponct-stopw-stemm-tfidf,7978,12.423,0.717836,0.734154
6,lower-ponct-stopw-lemma-stemm-tfidf,7864,49.424,0.715693,0.732415
7,lower-ponct-tfidf-lsi70,70,8.427,0.61306,0.634795


In [36]:
def pipe_9(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stop-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_9(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607
5,lower-ponct-stopw-stemm-tfidf,7978,12.423,0.717836,0.734154
6,lower-ponct-stopw-lemma-stemm-tfidf,7864,49.424,0.715693,0.732415
7,lower-ponct-tfidf-lsi70,70,8.427,0.61306,0.634795
8,lower-ponct-stop-tfidf-lsi70,70,8.678,0.642178,0.660337


In [37]:
def pipe_10(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    
    df = stemming_func(df)

    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stop-stemm-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_10(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607
5,lower-ponct-stopw-stemm-tfidf,7978,12.423,0.717836,0.734154
6,lower-ponct-stopw-lemma-stemm-tfidf,7864,49.424,0.715693,0.732415
7,lower-ponct-tfidf-lsi70,70,8.427,0.61306,0.634795
8,lower-ponct-stop-tfidf-lsi70,70,8.678,0.642178,0.660337
9,lower-ponct-stop-stemm-tfidf-lsi70,70,10.591,0.661436,0.676518


In [38]:
def pipe_11(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = lemmatization_func(df)
    df = stemming_func(df)

    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stop-lemma-stemm-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_11(data)
metrics_data

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2671), Label(value='0 / 2671'))), …

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,10550,11.541,0.693313,0.712891
1,lower-tfidf,10550,11.589,0.697086,0.716635
2,lower-ponct-tfidf,11241,10.198,0.693648,0.713426
3,lower-ponct-stopw-tfidf,11143,12.109,0.705459,0.722386
4,lower-ponct-stopw-lemma-tfidf,9368,48.515,0.712701,0.729607
5,lower-ponct-stopw-stemm-tfidf,7978,12.423,0.717836,0.734154
6,lower-ponct-stopw-lemma-stemm-tfidf,7864,49.424,0.715693,0.732415
7,lower-ponct-tfidf-lsi70,70,8.427,0.61306,0.634795
8,lower-ponct-stop-tfidf-lsi70,70,8.678,0.642178,0.660337
9,lower-ponct-stop-stemm-tfidf-lsi70,70,10.591,0.661436,0.676518


In [39]:
metrics_data.to_csv('parallel_metric_data_en_tfidf100.csv', index=False)

In [41]:
print(metrics_data.head(10))

In [42]:
metric = pd.read_csv('parallel_metric_data_en_tfidf100.csv')

In [43]:
metric