In [1]:
import warnings
import sys
import os


# Silence every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# os.getcwd() yields the same directory as the `%pwd` magic, but it is plain
# Python, so this cell also works when the notebook is exported to a script.
current_dir = os.getcwd()

# Two levels above the notebook's working directory; presumably the project
# root that makes the `src.main...` imports in the next cell resolvable.
parent_dir = os.path.abspath(os.path.join(current_dir, '../..'))
# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

#os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
from src.main.pipeline.pipeline import Pipeline
from src.main.pipeline.functions import stop_words_removal, clean_text, remove_contractions, unify_numbers, tfidf_vectorizer
from src.main.utilities.utils import get_dataset
import numpy as np

2024-05-24 17:19:47.133283: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### **Pipeline preprocessing** ###
This section shows sample sentences from the dataset to demonstrate the effects of the preprocessing pipeline. For one example per class, it shows the text after each successive preprocessing function is added to the pipeline: contraction expansion, text cleaning, stop-word removal, and number unification, followed by TF-IDF vectorization. This illustrates how each step in the preprocessing pipeline transforms the text data.


In [3]:

# Load the dataset, then keep only its first 200 entries for this demo.
inputs, targets = get_dataset()
# Inputs become a single column (one text per row), which is the layout
# print_results reads with `inputs[row][0]`; targets stay one-dimensional.
inputs, targets = inputs[:200].reshape(-1, 1), targets[:200]

# Showing one text example for each class from the dataset
def print_results(inputs, targets):
    """Print the first sample belonging to each distinct class.

    For every unique value in ``targets`` (in the sorted order returned by
    ``np.unique``), prints a ``Class: <name>`` header, the first column of
    the earliest row of ``inputs`` carrying that label, and a blank line.

    Args:
        inputs: Row-indexable collection whose rows are themselves indexable;
            only element ``[0]`` of the selected row is printed.
        targets: Array of class labels aligned with the rows of ``inputs``.
            It is compared element-wise with ``==``, so it is expected to be
            a NumPy array.

    Returns:
        None. The function only writes to standard output.
    """
    for label in np.unique(targets):
        # Positions where this label occurs; the leading one is its first row.
        matching_rows = np.nonzero(targets == label)[0]
        first_row = matching_rows[0]
        print(f"Class: {label}")
        print(inputs[first_row][0])
        print()

# Baseline: show the unprocessed text of the first sample of each class, to
# compare against the pipeline outputs in the following cells.
print_results(inputs, targets)

Class: Entertainment
23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23) "Until you have a dog you don't understand what could be eaten."

Class: Life
6 Signs You’re Grinding Your Teeth At Night (And What To Do About It) Beyond toothaches, there are other common red flags that you're dealing with nighttime teeth grinding.

Class: Politics
Biden Says U.S. Forces Would Defend Taiwan If China Invaded President issues vow as tensions with China rise.

Class: Sports
Maury Wills, Base-Stealing Shortstop For Dodgers, Dies At 89 Maury Wills, who helped the Los Angeles Dodgers win three World Series titles with his base-stealing prowess, has died.

Class: Voices
Spirituality Has A New Face — And It’s Queer As Hell Meet three spiritual leaders working hard for queer people to have a safe space in the religious community.



In [5]:
# Step 1: expand contractions only (in the output below, "don't" becomes "do not").
contraction_steps = [remove_contractions]
pipeline = Pipeline(contraction_steps)
results = pipeline.execute(inputs)

# print_results reads element [0] of each row, so pass the output as one column.
# NOTE(review): the Politics sample below shows "U.S." rewritten as "YOU.S.";
# worth checking how remove_contractions treats a standalone "U".
print_results(results.reshape(-1, 1), targets)

Pipeline started


2024-05-24 17:20:01.045709: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 17:20:01.090604: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 17:20:01.105854: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 17:20:01.128834: I tensorflow/core/platform/cpu_featu

Pipeline execution time: 0:00:17.870852
Class: Entertainment
23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23) "Until you have a dog you do not understand what could be eaten."

Class: Life
6 Signs You are Grinding Your Teeth At Night (And What To Do About It) Beyond toothaches, there are other common red flags that you are dealing with nighttime teeth grinding.

Class: Politics
Biden Says YOU.S. Forces Would Defend Taiwan If China Invaded President issues vow as tensions with China rise.

Class: Sports
Maury Wills, Base-Stealing Shortstop For Dodgers, Dies At 89 Maury Wills, who helped the Los Angeles Dodgers win three World Series titles with his base-stealing prowess, has died.

Class: Voices
Spirituality Has A New Face — And It is Queer As Hell Meet three spiritual leaders working hard for queer people to have a safe space in the religious community.



In [7]:
# Step 2: contraction expansion followed by text cleaning.
# Judging by the output below, clean_text lower-cases the text and replaces
# punctuation such as periods, commas, hyphens and quotes with spaces
# (parentheses are left in place).
cleaning_steps = [remove_contractions, clean_text]
pipeline = Pipeline(cleaning_steps)
results = pipeline.execute(inputs)

# print_results reads element [0] of each row, so pass the output as one column.
print_results(results.reshape(-1, 1), targets)


Pipeline started


2024-05-24 17:41:27.043422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 17:41:27.046298: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 17:41:27.082374: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-24 17:41:27.099292: I tensorflow/core/platform/cpu_featu

Pipeline execution time: 0:00:38.175287
Class: Entertainment
23 of the funniest tweets about cats and dogs this week (sept  17 23)  until you have a dog you do not understand what could be eaten  

Class: Life
6 signs you are grinding your teeth at night (and what to do about it) beyond toothaches  there are other common red flags that you are dealing with nighttime teeth grinding 

Class: Politics
biden says you s  forces would defend taiwan if china invaded president issues vow as tensions with china rise 

Class: Sports
maury wills  base stealing shortstop for dodgers  dies at 89 maury wills  who helped the los angeles dodgers win three world series titles with his base stealing prowess  has died 

Class: Voices
spirituality has a new face  and it is queer as hell meet three spiritual leaders working hard for queer people to have a safe space in the religious community 



In [8]:
# Step 3: the previous two steps plus stop-word removal
# (words such as "of", "the", "you" and "have" disappear in the output below).
stopword_steps = [remove_contractions, clean_text, stop_words_removal]
pipeline = Pipeline(stopword_steps)
results = pipeline.execute(inputs)

# print_results reads element [0] of each row, so pass the output as one column.
print_results(results.reshape(-1, 1), targets)

Pipeline started
Pipeline execution time: 0:00:00.109043
Class: Entertainment
23 funniest tweets cats dogs week (sept  17 23)  dog understand could eaten  

Class: Life
6 signs grinding teeth night (and it) beyond toothaches  common red flags dealing nighttime teeth grinding 

Class: Politics
biden says  forces would defend taiwan china invaded president issues vow tensions china rise 

Class: Sports
maury wills  base stealing shortstop dodgers  dies 89 maury wills  helped los angeles dodgers win three world series titles base stealing prowess  died 

Class: Voices
spirituality new face  queer hell meet three spiritual leaders working hard queer people safe space religious community 



In [11]:
# Step 4: the previous steps plus number unification.
# In the output below every digit sequence becomes the token "[NUM]";
# spelled-out numbers such as "three" are left unchanged.
number_steps = [remove_contractions, clean_text, stop_words_removal, unify_numbers]
pipeline = Pipeline(number_steps)
results = pipeline.execute(inputs)

# print_results reads element [0] of each row, so pass the output as one column.
print_results(results.reshape(-1, 1), targets)


Pipeline started
Pipeline execution time: 0:00:00.038857
Class: Entertainment
[NUM] funniest tweets cats dogs week (sept  [NUM] [NUM])  dog understand could eaten  

Class: Life
[NUM] signs grinding teeth night (and it) beyond toothaches  common red flags dealing nighttime teeth grinding 

Class: Politics
biden says  forces would defend taiwan china invaded president issues vow tensions china rise 

Class: Sports
maury wills  base stealing shortstop dodgers  dies [NUM] maury wills  helped los angeles dodgers win three world series titles base stealing prowess  died 

Class: Voices
spirituality new face  queer hell meet three spiritual leaders working hard queer people safe space religious community 



In [14]:
# Step 5: the full text-preprocessing chain followed by TF-IDF vectorization.
tfidf_steps = [
    remove_contractions,
    clean_text,
    stop_words_removal,
    unify_numbers,
    tfidf_vectorizer,
]
pipeline = Pipeline(tfidf_steps)
results = pipeline.execute(inputs)

# The result is no longer text: the output below shows a list holding one
# sparse matrix, so it is printed directly instead of going through print_results.
print(results)


Pipeline started
Pipeline execution time: 0:00:00.088347
[<200x2341 sparse matrix of type '<class 'numpy.float64'>'
 	with 3882 stored elements in Compressed Sparse Row format>]
