In [None]:

!pip install quick-sentiments
# just in case you want to install the package from the local directory
#pip install .\dist\quick_sentiments-0.2.0-py3-none-any.whl


In [None]:
import polars as pl

# The quick_sentiments package provides the three helpers imported below, which pre-process the data, run the pipeline and make predictions.
# You can find the pre-processing code in the tools/preprocess.py file
# and the pipeline code in the tools/pipeline.py file.
# The pre_process function cleans the text data; various options are available, please check the tools/preprocess.py file for details.
# The run_pipeline function runs the sentiment analysis pipeline; it takes the training data plus the vectorizer and machine learning methods as input, and returns the results.
from quick_sentiments import pre_process
from quick_sentiments import run_pipeline
from quick_sentiments import make_predictions

### Training Dataset


Name your data `train.csv` and place it in the `demo/training_data` folder, or change the path in the code below.


In [2]:
# Put your training CSV in the training data folder (or edit the path below).
# This template reads CSV files.
# The demo does not adapt to different column names automatically; instead,
# the functions used later let you pass the names of the text and label columns.

df_train = pl.read_csv(
    "demo/training_data/train.csv",
    encoding="ISO-8859-1",
)
n_rows, n_cols = df_train.shape
print(f"Dataset shape: {n_rows} rows and {n_cols} columns")


Dataset shape: 162758 rows and 5 columns


### DEMO

In [None]:
# Preview the full training data before down-sampling. A bare `df_train.head()`
# that is not the last line of a cell is evaluated and then thrown away, so the
# preview is rendered explicitly with display().
display(df_train.head())

# Randomly keep only 10% of the rows since the dataset is large (fixed seed).
# RUN ONLY ONCE: this overwrites df_train, so re-running the cell without
# reloading the CSV would sample 10% of the already reduced frame.
df_train = df_train.sample(fraction=0.1, shuffle=True, seed=42)
df_train

The dataset is for training. The sentiments are already labeled. This will allow us to train a model that can predict sentiments on new data.


In [4]:
# Column names used throughout the notebook -- change them to match your data.
response_column = "reviewText"  # column holding the text data
sentiment_column = "sentiment"  # column holding the sentiment labels

# Show one raw review, then the same review after cleaning with pre_process
# (called here with its default options).
sample_review = df_train[response_column][2]
print(sample_review)
cleaned_review = pre_process(sample_review)
print("\n" + cleaned_review)

In a year of many car movies, this one was the best (Living Life Fearless)

year many car movie one best living life fearless


In [5]:
# Clean every review and store the result in a new "processed" column.
# To override a pre_process default, add the parameter inside the lambda,
# e.g. pre_process(x, remove_brackets=True, lemmatize=False).
# Check the tools/preprocess.py file for the parameters and their default values.
# Some of the parameters are remove_brackets, remove_stopwords, remove_punctuation, remove_numbers,
# remove_emojis, remove_urls, remove_html_tags, lemmatize, stem, lowercase.
df_train = df_train.with_columns(
    pl.col(response_column)
    .map_elements(
        lambda x: pre_process(x, remove_brackets=True),  # add parameters here
        # Declare the output dtype so polars does not have to infer it
        # (and does not warn about a missing return_dtype).
        return_dtype=pl.Utf8,
    )
    .alias("processed")
)
df_train.head(10)

movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,processed
str,str,bool,str,str,str
"""john_mcclane_r…","""Deborah Farley…",False,"""..a cocksure, …","""POSITIVE""","""cocksure styli…"
"""tyler_durden_d…","""Margaret Marti…",False,"""A wild ride of…","""POSITIVE""","""wild ride movi…"
"""glimmer_mythic…","""Angel Peters""",False,"""In a year of m…","""POSITIVE""","""year many car …"
"""gandalf_journe…","""Alexandria Wil…",True,"""Festival will …","""POSITIVE""","""festival surel…"
"""vito_corleone_…","""Andrew Blanken…",False,"""The limp&#44; …","""NEGATIVE""","""limp 44 neglig…"
"""zephyr_scarlet…","""Lee Griffin""",False,"""The film does …","""POSITIVE""","""film weak spot…"
"""the_joker_ques…","""Brianna Flores…",True,"""A good-hearted…","""POSITIVE""","""goodhearted en…"
"""surreal_the_ca…","""Shelia Miller""",False,"""At this moment…","""POSITIVE""","""moment 500 day…"
"""dorothy_gale_g…","""Chelsea Martin…",False,"""Beats thrums w…","""POSITIVE""","""beat thrum unb…"
"""phenomenal_val…","""Mike Joseph""",False,"""A sequel that …","""POSITIVE""","""sequel narrati…"


In [None]:
#### In this template, there are four text representation / vectorizer methods available.
#### To choose one, pass the name shown inside [ ] to the run_pipeline function (in the Python cell below).
#### 1. Bag of Words [BOW]
#### 2. Term Frequency [tf]
#### 3. TF-IDF [tfidf]
#### 4. Word Embedding using Word2Vec (you can use other packages with slight changes) [wv]
         # Word Embedding uses the default of 300 values; this will take some time to run

In [None]:
#### In this template, there are also three machine learning methods that can be used:
#### 1. Logistic Regression [logit]
#### 2. Random Forest (recommended) [rf]
#### 3. XGBoost [XGB] (word embedding and XGBoost may each take a long time to complete; combining the two is not recommended on a local machine)

# I will keep this repository updated, and I will add more methods in the future

In [6]:
# This is an example of how to use the run_pipeline function.
# You can change vectorizer_name and model_name to the ones you want to use;
# for now we use a pre-trained word embedding (glove_25) and logistic regression.
# text_column_name is the name of the column holding the text data, and
# sentiment_column_name is the name of the column holding the sentiment labels.

# run_pipeline returns a dictionary holding the fitted model, the vectorizer and the evaluation results.
# It also prints the results of the model, including the accuracy and F1 score.
# Note: even without hyperparameter tuning, the model is getting over 70% accuracy in my test,
# so there may not be a need to perform hyperparameter tuning, but you can set perform_tuning to True if you want to do that.

dt = run_pipeline(
    vectorizer_name="glove_25",  # BOW, tf, tfidf, wv, glove_25, glove_50, glove_100, glove_200
    model_name="logit",  # logit, rf, XGB, nn. XGB takes a long time; not recommended for normal use
    df=df_train,
    text_column_name="processed",  # the pre-processed text column created above
    # Use the label column configured earlier instead of a hard-coded name, so
    # changing sentiment_column in one place is enough.
    sentiment_column_name=sentiment_column,
    perform_tuning=False,  # set to True to perform hyperparameter tuning; it takes longer and
                           # may run out of memory if the dataset is large
)

# missing values in the text data will be removed

--- Running Pipeline for Glove 25 + Logit ---
Labels encoded: Original -> ['NEGATIVE' 'POSITIVE'], Encoded -> [0 1]
1. Splitting data into train/test...
2. Vectorizing  dataset (X)...
Loading pre-trained glove-twitter-25 model (this may take a few minutes)...
Word2Vec model loaded.
Transforming test data using loaded Word2Vec model...
3. Training and predicting...
   - Training Logistic Regression with default parameters (no hyperparameter tuning)...
   - Model trained with default parameters.
Best model parameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
4. Evaluating model...

Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.58      0.25      0.35      1044
    POSITIVE       0.71      0.91      0.

In [7]:
## dt is the dictionary returned by run_pipeline; it holds the model results, including the accuracy
print(dt.keys())

# individual results are looked up through the dictionary keys
for label, key in (
    ("Vectorizer used: ", "vectorizer_name"),
    ("Model used: ", "model_object"),
    ("Accuracy: ", "accuracy"),
):
    print(label, dt[key])



dict_keys(['model_object', 'vectorizer_name', 'vectorizer_object', 'label_encoder', 'y_test', 'y_pred', 'accuracy', 'report'])
Vectorizer used:  glove_25
Model used:  LogisticRegression(random_state=42)
Accuracy:  0.69024


### New Dataset for prediction
You can use the same format as the training dataset, but ensure that it contains the same text column as the training data (`reviewText` in this demo). The sentiment column is optional for prediction datasets, as it will be generated by the model.
Make sure the dataset is saved in the `demo/new_data` folder and is in CSV format.

In [8]:
# Location of the data to score -- keep your file here.
NEW_DATA_PATH = "demo/new_data/test.csv"

new_data = pl.read_csv(NEW_DATA_PATH, encoding="ISO-8859-1")
print(new_data.shape)

# Keep a reproducible 25% random sample of the rows and report the new shape.
new_data = new_data.sample(fraction=0.25, seed=42, shuffle=True)
print(new_data.shape)

(55315, 4)
(13828, 4)


In [9]:
# Clean the new text the same way as the training text and store it in a
# "processed" column.
new_data = new_data.with_columns(
    pl.col(response_column)
    .map_elements(
        lambda x: pre_process(x, remove_brackets=True),  # add parameters here
        # Declare the output dtype so polars does not have to infer it
        # (and does not warn about a missing return_dtype).
        return_dtype=pl.Utf8,
    )
    .alias("processed")
)
new_data.head(10)

movieid,reviewerName,isTopCritic,reviewText,processed
str,str,bool,str,str
"""frodo_baggins_…","""Toni Vaughn""",False,"""The result is …","""result unsettl…"
"""stardust_john_…","""Carol Jennings…",False,"""Think twice ab…","""think twice ge…"
"""hermione_grang…","""Tara Huang""",True,"""A film that's …","""film bloody wo…"
"""astonish_valia…","""Shelley Murill…",False,"""...a decent se…","""decent setup e…"
"""indiana_jones_…","""Daniel Bond""",False,"""Inspiring? Not…","""inspiring lame…"
"""glimmer_hannib…","""Mrs. Nicole Fl…",False,"""This perfectly…","""perfectly exec…"
"""brave_anakin_s…","""Melissa Harrin…",False,"""The Richard Cu…","""richard curtis…"
"""katniss_everde…","""Mckenzie Ortiz…",False,"""A fun-filled a…","""funfilled afte…"
"""secret_magic_j…","""Samantha Ware""",False,"""I'm still not …","""still sure sup…"
"""gandalf_the_gr…","""Seth Downs""",False,"""If the lizard …","""lizard ai nt b…"


In [10]:
# Pull the artefacts out of the run_pipeline result dictionary, then score
# the pre-processed new data with them.
fitted_vectorizer = dt["vectorizer_object"]
fitted_model = dt["model_object"]
fitted_label_encoder = dt["label_encoder"]

make_predictions(
    new_data=new_data,
    text_column_name="processed",
    vectorizer=fitted_vectorizer,
    best_model=fitted_model,
    label_encoder=fitted_label_encoder,
    prediction_column_name="sentiment_predictions",  # optional custom name
)

movieid,reviewerName,isTopCritic,reviewText,processed,sentiment_predictions
str,str,bool,str,str,str
"""frodo_baggins_…","""Toni Vaughn""",false,"""The result is …","""result unsettl…","""POSITIVE"""
"""stardust_john_…","""Carol Jennings…",false,"""Think twice ab…","""think twice ge…","""NEGATIVE"""
"""hermione_grang…","""Tara Huang""",true,"""A film that's …","""film bloody wo…","""POSITIVE"""
"""astonish_valia…","""Shelley Murill…",false,"""...a decent se…","""decent setup e…","""NEGATIVE"""
"""indiana_jones_…","""Daniel Bond""",false,"""Inspiring? Not…","""inspiring lame…","""POSITIVE"""
"""glimmer_hannib…","""Mrs. Nicole Fl…",false,"""This perfectly…","""perfectly exec…","""POSITIVE"""
"""brave_anakin_s…","""Melissa Harrin…",false,"""The Richard Cu…","""richard curtis…","""POSITIVE"""
"""katniss_everde…","""Mckenzie Ortiz…",false,"""A fun-filled a…","""funfilled afte…","""POSITIVE"""
"""secret_magic_j…","""Samantha Ware""",false,"""I'm still not …","""still sure sup…","""NEGATIVE"""
"""gandalf_the_gr…","""Seth Downs""",false,"""If the lizard …","""lizard ai nt b…","""NEGATIVE"""
