In [1]:
#install the packages 
#%pip install nltk
#%pip install polars
#%pip install gensim
#%pip install emoji

In [2]:
#load the packages
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
import polars as pl
from sklearn.metrics import f1_score
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#make sure that this is the main file
import sys
import os
project_root = os.getcwd()
sys.path.append(project_root)

### Training Dataset
Please ensure that the training dataset is properly formatted and contains the necessary columns for sentiment analysis, such as "Response" for text data and "Sentiment" for labels. The dataset should be a located in Training Data folder and must be csv file.

In [4]:
#Do not change the code
#keep you training dataset in the training data folder
#this template uses csv files
#please rename the columns to Response and Sentiment in the csv file. column names can be set in Python but this template does not automatically update the column names at any point

df_train = pl.read_csv("Training Data\Train.csv",encoding='ISO-8859-1') 
#Replace the name "Train.csv" with your file name | alternatively, rename your file name as "Train.csv"

In [5]:
df_train.head()

Response,Sentiment
str,str
"""[NAME] I handicap Disable disa…","""O"""
"""[NAME] I will clean the hospit…","""O"""
"""1. There are no clear instruct…","""N"""
"""2 times i left messages and ne…","""O"""
"""A bit confusing st first, but…","""P"""


In [6]:
# here I have two python script I built to pre_process the data and running the pipeline
# you can find the code in the tools/preprocess.py file
# you can find  the code in the tools/pipeline.py file
# the pre_process function is used to clean the text data, there are various options available, please check the tools/preprocess.py file for details
# the run_pipeline function is used to run the sentimental analysis pipeline, it takes the training data and the vectorizer and machine learning methods as input, and returns the results
import importlib
from tools.preprocess import pre_process
#this function will run the sentimental analysis in the training data and return the results
from tools.pipeline import run_pipeline

tools.preprocess module loaded
Functions available in module: ['WordNetLemmatizer', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'emoji', 'lemmatizer', 'numpy', 'pre_process', 're', 'remove_emojis', 'remove_extra_spaces', 'remove_html_tags', 'remove_numbers', 'remove_punctuation_from_token', 'remove_square_brackets', 'remove_urls_emails', 'simple_tokenizer', 'stop_words', 'stopwords', 'string', 'word_tokenize']


In [7]:
df_train["Response"][0] , pre_process(df_train["Response"][0]) , pre_process(df_train["Response"][0],remove_brackets= True )

("[NAME] I handicap Disable disabled Special I work Aha says it's a [ADDRESS] [PHONE NUMBER]",
 'name handicap disable disabled special work aha say address phone number',
 'handicap disable disabled special work aha say')

In [8]:
# make changes as necessary
# inside the map_elements, add  the parameters [pre_process(x, parameters_to_be_added)] and set it True/False if it differs from the defualt value
df_train = df_train.with_columns(
    pl.col("Response").map_elements(lambda x: pre_process(x, remove_brackets=True)).alias("processed")  #add inside the map_elements
)

df_train.head(10)
#output might show warnings, it usually is not any problem

  df_train = df_train.with_columns(


Response,Sentiment,processed
str,str,str
"""[NAME] I handicap Disable disa…","""O""","""handicap disable disabled spec…"
"""[NAME] I will clean the hospit…","""O""","""clean hospital hide hospital s…"
"""1. There are no clear instruct…","""N""","""1 clear instruction extending …"
"""2 times i left messages and ne…","""O""","""2 time left message never rece…"
"""A bit confusing st first, but…","""P""","""bit confusing st first using t…"
"""A excellent process that helps…","""P""","""excellent process help navigat…"
"""A little confused with checkin…","""O""","""little confused checking thru …"
"""A little hard to navigate""","""N""","""little hard navigate"""
"""A lot of the time the website …","""N""","""lot time website website easy …"
"""A surprisingly easy website to…","""P""","""surprisingly easy website navi…"


In [9]:
#### in this template, there are four text representation / vectorizer methods available 
#### #in the function run_pipeline, we shall make use of this, write the words inside [ ] for the methods you want to use
#### 1. Bag of Words [BOW] 
#### 2. Term Frequency [tf]
#### 3. TF -IDF    [tfidf]
#### 4. Word Embedding using Word2Vec (you can use other packages with slight changes) [wv] 
         # Word Embedding uses defualt 300 values; this will take some time to run

In [10]:
#### in this template, there are also three machine learning methods that can be used
#### 1. Logistic Regression [logit]
#### 2. Random forest (recommended) (rf)
#### 3. XGBoosting  [XGB](word embedding and XGBoost may take long time to complete, combination of both is not recommended in local machine)

#I will keep this repository updated, and I will add more methods in the future

In [18]:
#this is the example of how to use the function
#you can change the vectorizer_name and model_name to the ones you want to use
# for now we will use word embedding and logistic regression

# run_pipeline function will return the dataframe with the vectorized text, vectorizer used  and the model
#it will also print the results of the model, including the accuracy and F1 score
dt,vect,ml_model = run_pipeline(
    vectorizer_name="wv", # BOW, tf, tfidf, wv
    model_name="logit", # logit, rf, XGB .#XGB takes long time, can not recommend using it on normal case
    df=df_train
)


Loading pre-trained word2vec-google-news-300 model (this may take a few minutes)...
Word2Vec model loaded.
   - Starting Logistic Regression training with GridSearchCV for hyperparameter tuning...
Fitting 5 folds for each of 24 candidates, totalling 120 fits





   - Best Hyperparameters found:
{'C': 10.0, 'class_weight': 'balanced', 'max_iter': 500, 'solver': 'liblinear'}
   - Best Cross-Validation Score (F1-weighted): 0.7485
Best model parameters: {'C': 10.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       176
           1       0.53      0.34      0.42        47
           2       0.82      0.88      0.85        69

    accuracy                           0.81       292
   macro avg       0.73      0.71      0.71       292
weighted avg       0.79      0.81      0.80       292

Labels encoded: Original -> ['N' 'O' 'P'], Encoded -> [0 1 2]
True labels distribution: Counter({0: 176, 2: 69, 1: 47})
Predicted label

In [12]:
#View the dataframe after vectorization
pl.DataFrame(dt).head(10) #does not work when vectorizer_name is BOW 
#use the snippet below line

TypeError: DataFrame constructor called with unsupported type 'csr_matrix' for the `data` parameter

In [13]:
#works only for BOW
import numpy
dt_view = dt.toarray()
pl.DataFrame(dt_view)

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_3432,column_3433,column_3434,column_3435,column_3436,column_3437,column_3438,column_3439,column_3440,column_3441,column_3442,column_3443,column_3444,column_3445,column_3446,column_3447,column_3448,column_3449,column_3450,column_3451,column_3452,column_3453,column_3454,column_3455,column_3456,column_3457,column_3458,column_3459,column_3460,column_3461,column_3462,column_3463,column_3464,column_3465,column_3466,column_3467,column_3468
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### New Dataset for prediction
You can use the same format as the training dataset, but ensure that it contains the "Response" column for text data. The "Sentiment" column is optional for prediction datasets, as it will be generated by the model.
Make sure the dataset is saved in the "New Data" folder and is in CSV format.

In [14]:
new_data = pl.read_csv("New Data/test_142.csv",encoding='ISO-8859-1') #keep your file here

In [15]:
# Code to ensure the new data has the correct column name
# Can be changed on map_elements - pre_process() as you want
# Do not change anything else
first_col = new_data.columns[0]
new_data = new_data.rename({first_col: "Response"})
new_data = new_data.with_columns(
    pl.col("Response").map_elements(lambda x: pre_process(x, remove_brackets=True)).alias('processed') # processed text column like before
    #make sure to use the same pre_process function as before, with the same parameters (remove_brackets=True in this case
)
new_data.head(5)

  new_data = new_data.with_columns(


Response,Sentiment,processed
str,str,str
"""[EMPLOYEE'S NAME] was great, v…","""P""","""great personable helped unders…"
"""[EMPLOYEE'S NAME] was so very …","""P""","""helpful called back twice went…"
"""[EMPLOYEE'S NAME] was very hel…","""P""","""helpful nice came visit"""
"""2 claims were filed because yo…","""N""","""2 claim filed employee gave wr…"
"""2 of 3 elevators were out of o…","""N""","""2 3 elevator order edd 6th flo…"


In [26]:
def predict_pipeline(df, vectorizer_func, ml_model):
    """
    Predicts the sentiment of new data using the provided vectorizer and model functions.
    
    Parameters:
    - new_data: DataFrame containing the new data to predict.
    - vectorizer_func: Function to vectorize the text data.
    - model_func: Function to apply the trained model for prediction.
    
    Returns:
    - predictions: Array of predicted sentiments.
    """

        # Prepare data
    X_text = df["processed"].to_list()
    y_raw = df["Sentiment"].to_list()

    #XGBoost need Label Encoder
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_raw)
    
    # Vectorize the new data
    X_new = vectorizer_func(X_text)  # Passes the processed text to the vectorizer function, which returns the vectorized representation
    

    # This is new data, so we do not split it into train and test sets
    # Instead, we directly use the vectorized data for prediction
    # Note, we are not training the model here, and these data should not be used for training to avoid data leakage
    
    # Predict using the model
    predictions = ml_model.predict(X_new)
    new_predictions_original_labels = label_encoder.inverse_transform(predictions)
    
    return  new_predictions_original_labels

In [27]:
# DO NOT CHANGE THE CODE BELOW
from sklearn.preprocessing import LabelEncoder

predict_pipeline(new_data, vect, ml_model)
new_data = new_data.with_columns(
    pl.Series(name="predictions", values=predict_pipeline(new_data, vect, ml_model))
)

new_data.head(10)  # Display the first 10 rows of the DataFrame with predictions
# Add predictions to the DataFrame

Using already loaded Word2Vec model.
Using already loaded Word2Vec model.


Response,Sentiment,processed,predictions
str,str,str,str
"""[EMPLOYEE'S NAME] was great, v…","""P""","""great personable helped unders…","""P"""
"""[EMPLOYEE'S NAME] was so very …","""P""","""helpful called back twice went…","""P"""
"""[EMPLOYEE'S NAME] was very hel…","""P""","""helpful nice came visit""","""P"""
"""2 claims were filed because yo…","""N""","""2 claim filed employee gave wr…","""N"""
"""2 of 3 elevators were out of o…","""N""","""2 3 elevator order edd 6th flo…","""N"""
"""A pitiful, shame that in order…","""N""","""pitiful shame order forward mo…","""N"""
"""Almost impossible to get an ap…","""N""","""almost impossible get appointm…","""N"""
"""Asian representative was not c…","""N""","""asian representative cooperati…","""N"""
"""Cannot reach anyone through th…","""N""","""reach anyone phone system plea…","""N"""
"""clients are taking time away f…","""N""","""client taking time away care c…","""N"""
