In [None]:
#install the packages 
#%pip install nltk
#%pip install polars
#%pip install gensim
#%pip install emoji

#this is required for the first time
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [None]:
#load the packages
import polars as pl
# Add the current working directory to the import path so that the local
# `tools` package can be imported. Launch this notebook from the project root.
import sys
import os
project_root = os.getcwd()
# Only add the path once, so re-running this cell does not pile up duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

### Training Dataset


In [None]:
# Keep your training dataset in the "Training Data" folder.
# This template uses CSV files.
# For the demo, the text and label columns are named "Response" and "Sentiment" in the CSV file.
# Other column names can be used: the demo cells are not updated automatically,
# but run_pipeline lets you pass the names of the text and label columns.

# The path uses a forward slash (works on Windows, macOS and Linux). A backslash
# inside a normal string can be read as an escape sequence (e.g. "\t", "\n")
# once the file name is changed.
df_train = pl.read_csv("Training Data/Train.csv", encoding='ISO-8859-1')
# Replace "Train.csv" with your file name, or rename your file to "Train.csv".

### DEMO

In [4]:
# Preview the first five rows of the training data.
df_train.head(5)

Response,Sentiment
str,str
"""[NAME] I handicap Disable disa…","""O"""
"""[NAME] I will clean the hospit…","""O"""
"""1. There are no clear instruct…","""N"""
"""2 times i left messages and ne…","""O"""
"""A bit confusing st first, but…","""P"""


In [5]:
# These are the Python scripts I built to pre-process the data and run the pipeline:
# the pre-processing code is in the tools/preprocess.py file
# the pipeline code is in the tools/pipeline.py file
# The pre_process function cleans the text data; several options are available, see tools/preprocess.py for details.
# The run_pipeline function runs the sentiment analysis pipeline: it takes the training data plus the vectorizer and machine learning method names as input, and returns the results.
import importlib
from tools.preprocess import pre_process
# this function runs the sentiment analysis on the training data and returns the results
from tools.pipeline import run_pipeline
# this function runs the sentiment analysis on new data and returns the predictions
from tools.predict import predict_pipeline



tools.preprocess module loaded
Functions available in module: ['WordNetLemmatizer', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'emoji', 'lemmatizer', 'numpy', 'pre_process', 're', 'remove_emojis', 'remove_extra_spaces', 'remove_html_tags', 'remove_numbers', 'remove_punctuation_from_token', 'remove_square_brackets', 'remove_urls_emails', 'simple_tokenizer', 'stop_words', 'stopwords', 'string', 'word_tokenize']


In [6]:
# Show one raw response next to its cleaned versions:
# default options first, then with the bracketed placeholders removed as well.
sample_response = df_train["Response"][0]
(
    sample_response,
    pre_process(sample_response),
    pre_process(sample_response, remove_brackets=True),
)

("[NAME] I handicap Disable disabled Special I work Aha says it's a [ADDRESS] [PHONE NUMBER]",
 'name handicap disable disabled special work aha say address phone number',
 'handicap disable disabled special work aha say')

In [7]:
# Clean every response and store the result in a new "processed" column.
# To change the cleaning options, add keyword arguments to the pre_process call
# inside the lambda (set them True/False wherever they differ from the default value).
# return_dtype is stated explicitly so polars does not have to guess the output
# type; recent polars versions emit a warning when it is omitted.
df_train = df_train.with_columns(
    pl.col("Response")
    .map_elements(lambda x: pre_process(x, remove_brackets=True), return_dtype=pl.Utf8)
    .alias("processed")
)
df_train.head(10)

  df_train = df_train.with_columns(


Response,Sentiment,processed
str,str,str
"""[NAME] I handicap Disable disa…","""O""","""handicap disable disabled spec…"
"""[NAME] I will clean the hospit…","""O""","""clean hospital hide hospital s…"
"""1. There are no clear instruct…","""N""","""1 clear instruction extending …"
"""2 times i left messages and ne…","""O""","""2 time left message never rece…"
"""A bit confusing st first, but…","""P""","""bit confusing st first using t…"
"""A excellent process that helps…","""P""","""excellent process help navigat…"
"""A little confused with checkin…","""O""","""little confused checking thru …"
"""A little hard to navigate""","""N""","""little hard navigate"""
"""A lot of the time the website …","""N""","""lot time website website easy …"
"""A surprisingly easy website to…","""P""","""surprisingly easy website navi…"


In [None]:
#### In this template, four text representation / vectorizer methods are available.
#### In the run_pipeline function, select a method by passing the code shown inside [ ].
#### 1. Bag of Words [BOW]
#### 2. Term Frequency [tf]
#### 3. TF-IDF [tfidf]
#### 4. Word Embedding using Word2Vec (other packages can be used with slight changes) [wv]
         # Word embedding uses 300-dimensional vectors by default; this will take some time to run

In [None]:
#### In this template, three machine learning methods are also available:
#### 1. Logistic Regression [logit]
#### 2. Random Forest (recommended) [rf]
#### 3. XGBoost [XGB] (word embedding and XGBoost can each take a long time to run; combining them is not recommended on a local machine)

# I will keep this repository updated and add more methods in the future

In [8]:
# this is the example of how to use the function
# you can change the vectorizer_name and model_name to the ones you want to use
# for now we will use word embedding and logistic regression
# write the names of your columns in text_column_name and sentiment_column_name:
# text_column_name is the column holding the text data, and sentiment_column_name is the column holding the labels

# run_pipeline returns a dictionary containing the fitted model, the vectorizer used, the label encoder,
# the test labels and predictions, the accuracy and the classification report
# it also prints the results of the model, including the accuracy and F1 score
# Run one vectorizer / model combination on the cleaned training text.
dt = run_pipeline(
    df=df_train,
    text_column_name="processed",       # column holding the text data
    sentiment_column_name="Sentiment",  # column holding the label data
    vectorizer_name="wv",               # one of: BOW, tf, tfidf, wv
    model_name="logit",                 # one of: logit, rf, XGB (XGB takes a long time; not recommended for normal use)
)


--- Running Pipeline for Wv + Logit ---
Labels encoded: Original -> ['N' 'O' 'P'], Encoded -> [0 1 2]
1. Vectorizing entire dataset (X)...
Loading pre-trained word2vec-google-news-300 model (this may take a few minutes)...
Word2Vec model loaded.
2. Splitting data into train/test...
3. Training and predicting...
   - Starting Logistic Regression training with GridSearchCV for hyperparameter tuning...
Fitting 5 folds for each of 24 candidates, totalling 120 fits





   - Best Hyperparameters found:
{'C': 1.0, 'class_weight': 'balanced', 'max_iter': 500, 'solver': 'liblinear'}
   - Best Cross-Validation Score (F1-weighted): 0.7444
Best model parameters: {'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
4. Evaluating model...

Classification Report:
              precision    recall  f1-score   support

           N       0.83      0.92      0.87       183
           O       0.56      0.36      0.44        50
           P       0.82      0.80      0.81        59

    accuracy                           0.80       292
   macro avg       0.74      0.69      0.71       292
weighted avg       0.78      0.80      0.78       292

True labels distribution: Counter({0: 183, 2: 59, 1: 50})
Predicted labels distribution: C

In [9]:
## dt is the dictionary returned by run_pipeline; list the entries it holds
print(dt.keys())
# individual results are read through the dictionary keys
for caption, key in (
    ("Vectorizer used: ", "vectorizer_name"),
    ("Model used: ", "model_object"),
    ("Accuracy: ", "accuracy"),
):
    print(caption, dt[key])


dict_keys(['model_object', 'vectorizer_name', 'vectorizer_object', 'label_encoder', 'y_test', 'y_pred', 'accuracy', 'report'])
Vectorizer used:  wv
Model used:  LogisticRegression(class_weight='balanced', max_iter=500, random_state=42,
                   solver='liblinear')
Accuracy:  0.797945205479452


### New Dataset for prediction
You can use the same format as the training dataset, but ensure that it contains the "Response" column for text data. The "Sentiment" column is optional for prediction datasets, as it will be generated by the model.
Make sure the dataset is saved in the "New Data" folder and is in CSV format.

In [None]:
new_data = pl.read_csv("New Data/test_142.csv", encoding='ISO-8859-1')  # keep your file here

# Clean the new responses exactly as the training data was cleaned, so the
# "processed" column exists when the notebook is run from a fresh kernel.
# NOTE(review): predict_pipeline is presumed to read the "processed" column - confirm in tools/predict.py.
new_data = new_data.with_columns(
    pl.col("Response")
    .map_elements(lambda x: pre_process(x, remove_brackets=True), return_dtype=pl.Utf8)
    .alias("processed")
)
new_data.head(5)

  new_data = new_data.with_columns(


Response,Sentiment,processed
str,str,str
"""[EMPLOYEE'S NAME] was great, v…","""P""","""great personable helped unders…"
"""[EMPLOYEE'S NAME] was so very …","""P""","""helpful called back twice went…"
"""[EMPLOYEE'S NAME] was very hel…","""P""","""helpful nice came visit"""
"""2 claims were filed because yo…","""N""","""2 claim filed employee gave wr…"
"""2 of 3 elevators were out of o…","""N""","""2 3 elevator order edd 6th flo…"


In [12]:
# DO NOT CHANGE THE CODE BELOW
from sklearn.preprocessing import LabelEncoder

# Reuse the vectorizer name and the fitted model stored in the run_pipeline results.
vectorizer_func = dt["vectorizer_name"]
ml_model = dt["model_object"]

# Attach the numeric class predictions to the new data.
new_data = new_data.with_columns(
    pl.Series(name="predictions", values=predict_pipeline(new_data, vectorizer_func, ml_model))
)

# Convert the numeric predictions to readable labels.
# The codes follow the label encoding printed by run_pipeline in the demo
# (N -> 0, O -> 1, P -> 2); update label_map if your training labels differ.
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
new_data = new_data.with_columns(
    pl.col("predictions")
    # Unknown codes fall back to their text form so the column stays a string column.
    .map_elements(lambda x: label_map.get(x, str(x)), return_dtype=pl.Utf8)
    .alias("predictions_label")
)

new_data.head(25)  # Display the first 25 rows of the DataFrame with predictions


Using already loaded Word2Vec model.


  new_data = new_data.with_columns(


Response,Sentiment,processed,predictions,predictions_label
str,str,str,i64,str
"""[EMPLOYEE'S NAME] was great, v…","""P""","""great personable helped unders…",2,"""Positive"""
"""[EMPLOYEE'S NAME] was so very …","""P""","""helpful called back twice went…",2,"""Positive"""
"""[EMPLOYEE'S NAME] was very hel…","""P""","""helpful nice came visit""",2,"""Positive"""
"""2 claims were filed because yo…","""N""","""2 claim filed employee gave wr…",0,"""Negative"""
"""2 of 3 elevators were out of o…","""N""","""2 3 elevator order edd 6th flo…",0,"""Negative"""
…,…,…,…,…
"""Horrible customer service, the…","""N""","""horrible customer service whol…",0,"""Negative"""
"""I am to the office I left my p…","""N""","""office left phone number suppo…",0,"""Negative"""
"""I am trying to file a claim. I…","""N""","""trying file claim trying since…",0,"""Negative"""
"""I came to the office to turn i…","""N""","""came office turn document tele…",0,"""Negative"""
