In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.decomposition import TruncatedSVD

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string
import spacy
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# data

In [3]:
data = pd.read_csv('data/sp_data.csv')
data

Unnamed: 0,label,text
0,Caída de objetos,Sacar llaves de pozo. Se sacó llave.
1,Mantenimiento,Ventilador de cabina no funciona. Se desconect...
2,Suministro energía,El ascensor arrancó en el piso 23 y paró brusc...
3,"Golpe, Vandalismo",El ascensor está bloqueado. Abrieron la escoti...
4,Caída de objetos,Sacar llaves de pozo. Se sacó llave. Había caí...
...,...,...
37385,Material,ASCENSOR FUERA DE SERVICIO. Se encontro equipo...
37386,Material,ASCENSOR FUERA DE SERVICIO. Se encontro falla ...
37387,Funcionando Bien,CARGA FUERA DE SERVICIO. Se encontro equipo fu...
37388,Mantenimiento,ROSE EN CABINA. Se encontro equipo funcionando...


# Tokenization

In [4]:
def token_func(data):
    return nltk.word_tokenize(data)

# Stopwords

In [5]:
stop_words = set(stopwords.words('spanish'))

def pre_remove_stop_words(data):
    word_tokens = token_func(data)
    data = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(data)

def remove_stop_words(data):
  data['text'] = data['text'].apply(pre_remove_stop_words)
  return data


# Remove Ponctuation

In [6]:
def ponct_func(data):
  data = data.translate(str.maketrans('', '', string.punctuation))
  return data

def ponctuation_remov_func(data):
  data['text'] = data['text'].apply(ponct_func)
  return data

# Lowercase convertion

In [7]:
def lowercase_func(data):
    data['text'] = data['text'].str.lower()
    return data

# Stemming

In [8]:
stemmer = SnowballStemmer('spanish')

def preprocess_text(text):
    word_tokens = word_tokenize(text)
    filtered_text = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(filtered_text)

def stemming_func(data):
    data['text'] = data['text'].apply(preprocess_text)
    return data

# Lemmatization

In [9]:
nlp = spacy.load('es_core_news_md')

# Function to preprocess and lemmatize text
def preprocess_and_lemmatize(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract lemmatized tokens and join them back into a string
    lemmatized_text = ' '.join([token.lemma_ for token in doc])

    return lemmatized_text

# Example function to apply preprocessing and lemmatization to a dataset column
def lemmatization_func(data):
    data['text'] = data['text'].apply(preprocess_and_lemmatize)
    return data

# Feature extraction TF_IDF

##### Total number of unique words in all the dataset, this number is 1,513,787 and 20,751 with lowercase
X_lower = lowercase_func(data)
X_lowe = X_lower['text']
token_data = X_lowe.apply(token_func)
flat_list = [token for sublist in token_data for token in sublist]
flat_list
print(len(flat_list))
unique_words = set(flat_list)
print(len(unique_words))

In [10]:
def TF_IDF_func(data, words_rate=0.1):
  words_rate = 0.3
  token_data = data['text'].apply(token_func)
  flat_list = [token for sublist in token_data for token in sublist]
  unique_words = len(set(flat_list))
  tfidf_vectorizer = TfidfVectorizer(max_features= int(unique_words * words_rate))

  data_tfidf = tfidf_vectorizer.fit_transform(data['text'])
  return data_tfidf, tfidf_vectorizer

# Feature selection LSI

In [11]:
def LSI_func(data,  num_topics = 30):
  tfidf_matrix, tfidf_vectorizer = TF_IDF_func(data)
  lsa = TruncatedSVD(n_components=num_topics)
  data_lsa = lsa.fit_transform(tfidf_matrix)
  return data_lsa, lsa, tfidf_vectorizer

# Models

In [12]:
def model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_jobs=-1)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    rdf_accuracy = accuracy_score(y_test, predictions)
    rdf_f1 = f1_score(y_test, predictions, average='weighted')
    return rdf_accuracy, rdf_f1


# Code

In [13]:
metrics_data = pd.DataFrame({})
metrics_data[['Description','num_features', 'pipe_time', 'f1_rdf', 'acc_rdf']]=""
metrics_data


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf


In [14]:
data.shape

(37390, 2)

In [15]:
def pipe_1(data):
    start_time = time.time()
    
    #preprocesing
    X, _ = TF_IDF_func(data, 1)
    y = data['label']
    description = 'tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_1(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339


In [16]:
def pipe_2(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_2(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403


In [17]:
def pipe_3(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_3(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536


In [18]:
def pipe_4(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_4(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249


In [19]:
def pipe_5(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = lemmatization_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-lemma-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_5(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647


In [20]:
def pipe_6(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = stemming_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-stemm-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    

    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_6(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647
5,lower-ponct-stopw-stemm-tfidf,3592,19.444,0.720057,0.736026


In [21]:
def pipe_7(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = lemmatization_func(df)
    df = stemming_func(df)
    X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stopw-lemma-stemm-tfidf'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_7(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647
5,lower-ponct-stopw-stemm-tfidf,3592,19.444,0.720057,0.736026
6,lower-ponct-stopw-lemma-stemm-tfidf,3535,153.683,0.720829,0.737363


In [22]:
def pipe_8(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_8(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647
5,lower-ponct-stopw-stemm-tfidf,3592,19.444,0.720057,0.736026
6,lower-ponct-stopw-lemma-stemm-tfidf,3535,153.683,0.720829,0.737363
7,lower-ponct-tfidf-lsi70,70,7.656,0.611407,0.630516


In [23]:
def pipe_9(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stop-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_9(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647
5,lower-ponct-stopw-stemm-tfidf,3592,19.444,0.720057,0.736026
6,lower-ponct-stopw-lemma-stemm-tfidf,3535,153.683,0.720829,0.737363
7,lower-ponct-tfidf-lsi70,70,7.656,0.611407,0.630516
8,lower-ponct-stop-tfidf-lsi70,70,9.508,0.629616,0.647499


In [24]:
def pipe_10(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    
    df = stemming_func(df)

    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stop-stemm-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_10(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647
5,lower-ponct-stopw-stemm-tfidf,3592,19.444,0.720057,0.736026
6,lower-ponct-stopw-lemma-stemm-tfidf,3535,153.683,0.720829,0.737363
7,lower-ponct-tfidf-lsi70,70,7.656,0.611407,0.630516
8,lower-ponct-stop-tfidf-lsi70,70,9.508,0.629616,0.647499
9,lower-ponct-stop-stemm-tfidf-lsi70,70,18.218,0.660078,0.676919


In [25]:
def pipe_11(data):
    start_time = time.time()
    
    #preprocesing
    df = data.copy()
    df = lowercase_func(df)
    df = ponctuation_remov_func(df)
    df = remove_stop_words(df)
    df = lemmatization_func(df)
    df = stemming_func(df)

    X, nan1, nan2 = LSI_func(df, 70)
    #X, _ = TF_IDF_func(df, 1)
    y = df['label']
    description = 'lower-ponct-stop-lemma-stemm-tfidf-lsi70'
    
    # call the model
    acc_rdf, f1_rdf = model(X, y)
    
    end_time = time.time()

    num_features = X.shape[1]
    pipe_time = end_time - start_time
    pipe_time = format(pipe_time, '.3f')
    
    metrics_data.loc[len(metrics_data.index)] = [description, num_features,  pipe_time,  f1_rdf,  acc_rdf]

pipe_11(data)
metrics_data

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Description,num_features,pipe_time,f1_rdf,acc_rdf
0,tfidf,8598,11.629,0.686943,0.706339
1,lower-tfidf,6225,11.076,0.685206,0.705403
2,lower-ponct-tfidf,6193,8.697,0.686105,0.705536
3,lower-ponct-stopw-tfidf,6142,11.36,0.695123,0.71249
4,lower-ponct-stopw-lemma-tfidf,4946,148.964,0.703538,0.720647
5,lower-ponct-stopw-stemm-tfidf,3592,19.444,0.720057,0.736026
6,lower-ponct-stopw-lemma-stemm-tfidf,3535,153.683,0.720829,0.737363
7,lower-ponct-tfidf-lsi70,70,7.656,0.611407,0.630516
8,lower-ponct-stop-tfidf-lsi70,70,9.508,0.629616,0.647499
9,lower-ponct-stop-stemm-tfidf-lsi70,70,18.218,0.660078,0.676919


In [27]:
metrics_data.to_csv('metric_data_sp_tfidf30.csv', index=False)