In [1]:
#load the packages
import polars as pl
#make sure that this is the main file
import sys
import os
project_root = os.getcwd()
sys.path.insert(0, project_root)

### Training Dataset


Name your data as Train.csv and place it in the Training Data folder. Or you can change the path in the code below.


In [2]:
# keep you training dataset in the training data folder
# this template uses csv files 
# column names can be set in Python but this template does not automatically update the column for the demo 
# however, the function will give you the option to tell column names for the text and label data

df_train = pl.read_csv("Training Data/Train.csv",encoding='ISO-8859-1') 
print(f"Dataset shape: {df_train.shape[0]} rows and {df_train.shape[1]} columns")


Dataset shape: 162758 rows and 5 columns


### DEMO

In [None]:
df_train.head()
# randomly select only 10% of the data since the dataset is large
#RUN ONLY ONCE
df_train = df_train.sample(fraction=0.1, shuffle=True, seed=42) 
df_train

movieid,reviewerName,isFrequentReviewer,reviewText,sentiment
str,str,bool,str,str
"""glorious_starlight_v_dorothy_g…","""Mandy Ponce""",false,"""Superb acting adds weight to s…","""POSITIVE"""
"""forrest_gump_whisper""","""Kelly Yang""",false,"""Rust Creek just wants to tell …","""POSITIVE"""
"""journey_mystique_jason_bourne""","""Shelley Murillo""",false,"""Vertical Limit is about as sil…","""POSITIVE"""
"""james_t._kirk_ellis_redding_au…","""Gwendolyn Guerra""",false,"""The sort of production that co…","""NEGATIVE"""
"""gollum_norman_bates""","""Sharon Foster""",false,"""[I]t might have benefited from…","""NEGATIVE"""
…,…,…,…,…
"""hulk_scarlett_o'hara_don_vito_…","""Alicia Davila""",false,"""Quite simply, Arrival is one o…","""POSITIVE"""
"""epic_mythical_sherlock_holmes_…","""Rebekah Henry""",false,"""No matter how many A-Listers d…","""NEGATIVE"""
"""harry_potter_whirlwind_moonlit…","""Mrs. Brenda Ferguson""",true,"""Despite a lightness of plot, i…","""POSITIVE"""
"""hulk_scarlett_o'hara_don_vito_…","""Fernando Shepherd""",true,"""An outstanding sci-fi film wit…","""POSITIVE"""


The dataset is for training. The sentiments are already labeled. This will allow us to train a model that can predict sentiments on new data.


In [5]:
# here I have three python script I built to pre_process the data and running the pipeline
# you can find the code in the tools/preprocess.py file
# you can find  the code in the tools/pipeline.py file
# the pre_process function is used to clean the text data, there are various options available, please check the tools/preprocess.py file for details
# the run_pipeline function is used to run the sentimental analysis pipeline, it takes the training data and the vectorizer and machine learning methods as input, and returns the results
from tools.preprocess import pre_process
#this function will run the sentimental analysis in the training data and return the results
from tools.pipeline import run_pipeline
# this function will run the sentimental analysis in the new data and return the predictions
from tools.predict import make_predictions



NLTK data 'punkt' already present.
NLTK data 'stopwords' already present.
Downloading NLTK data: wordnet...
NLTK data 'wordnet' downloaded.
Downloading NLTK data: omw-1.4...
NLTK data 'omw-1.4' downloaded.


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# you can use the pre_process function to clean the text data
response_column = "reviewText" # this is the column name for the text data, feel free to change it to your text column name
sentiment_column = "sentiment" # this is the column name for the sentiment data, feel free to change it to your sentiment column name
print(df_train[response_column][2])
print("\n" + pre_process(df_train[response_column][2]) )

Vertical Limit is about as silly as movies come, but the only thing that really counts is whether or not the action sequences are exciting. And they are, in a big way.

vertical limit silly movie come thing really count whether action sequence exciting big way


In [None]:
# make changes as necessary
# inside the map_elements, add  the parameters [pre_process(x, parameters_to_be_added)] and set it True/False if it differs from the defualt value
# check the tools/preprocess.py file for the parameters and their default values
# some of the parameters are remove_brackets, remove_stopwords, remove_punctuation, remove_numbers, remove_emojis, remove_urls, remove_html_tags, lemmatize, stem, lowercase
df_train = df_train.with_columns(
    pl.col(response_column).map_elements(lambda x: pre_process(x, remove_brackets=True)).alias("processed")  #add inside the map_elements
)
df_train.head(10)

  df_train = df_train.with_columns(


movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,processed
str,str,bool,str,str,str
"""glorious_starlight_v_dorothy_g…","""Mandy Ponce""",False,"""Superb acting adds weight to s…","""POSITIVE""","""superb acting add weight scene…"
"""forrest_gump_whisper""","""Kelly Yang""",False,"""Rust Creek just wants to tell …","""POSITIVE""","""rust creek want tell lean mean…"
"""journey_mystique_jason_bourne""","""Shelley Murillo""",False,"""Vertical Limit is about as sil…","""POSITIVE""","""vertical limit silly movie com…"
"""james_t._kirk_ellis_redding_au…","""Gwendolyn Guerra""",False,"""The sort of production that co…","""NEGATIVE""","""sort production could performe…"
"""gollum_norman_bates""","""Sharon Foster""",False,"""[I]t might have benefited from…","""NEGATIVE""","""might benefited little emotion…"
"""fortune_the_terminator_t-800""","""Mallory Chung""",False,"""...Egoyan's ""Memento"" by way o…","""POSITIVE""","""egoyan memento way must place …"
"""rocky_balboa_forrest_gump""","""Jeffrey Garner""",False,"""With Kingdom of Heaven, Ridley…","""POSITIVE""","""kingdom heaven ridley prof goo…"
"""spectacular_dorothy_gale_surre…","""Don Hodges""",True,"""A measured, deathly serious ep…","""POSITIVE""","""measured deathly serious epic"""
"""golden_captain_america_beneath""","""Sarah Gray""",False,"""As much as returning director …","""NEGATIVE""","""much returning director adam r…"
"""heroic_myriad_dracula_willy_wo…","""Larry Greer""",False,"""Reminds us how good romantic c…","""POSITIVE""","""reminds u good romantic comedy…"


In [None]:
#### in this template, there are four text representation / vectorizer methods available 
#### in the function run_pipeline (in python cell below), we shall make use of this, write the words inside [ ] for the methods you want to use
#### 1. Bag of Words [BOW] 
#### 2. Term Frequency [tf]
#### 3. TF -IDF    [tfidf]
#### 4. Word Embedding using Word2Vec (you can use other packages with slight changes) [wv] 
         # Word Embedding uses defualt 300 values; this will take some time to run

In [None]:
#### in this template, there are also three machine learning methods that can be used
#### 1. Logistic Regression [logit]
#### 2. Random forest (recommended) (rf)
#### 3. XGBoosting  [XGB](word embedding and XGBoost may take long time to complete, combination of both is not recommended in local machine)

#I will keep this repository updated, and I will add more methods in the future

In [None]:
# this is the example of how to use the function
# you can change the vectorizer_name and model_name to the ones you want to use
# for now we will use word embedding and logistic regression
# write the name of your columns in the text_column_name and sentiment_column_name
# the text_column_name is the column name of the text data, and sentiment_column_name is

# run_pipeline function will return the dataframe with the vectorized text, vectorizer used  and the model
# it will also print the results of the model, including the accuracy and F1 score
# note, even without hyperparameter tuning, the model is getting over 70% accuracy in my test
# there may not be a need to perform hyperparameter tuning, but you can set perform_tuning to True if you want to do that

dt= run_pipeline(
    vectorizer_name="wv", # BOW, tf, tfidf, wv
    model_name="logit", # logit, rf, XGB .#XGB takes long time, can not recommend using it on normal case
    df=df_train,
    text_column_name="processed",  # this is the column name of the text data, 
    sentiment_column_name = "sentiment",
    perform_tuning = True # make this true if you want to perform hyperparameter tuning, it will take longer time and 
                            # may run out of memory if the dataset is large,
)

# missing values in the text data will be removed

--- Running Pipeline for Tf + Rf ---
Labels encoded: Original -> ['NEGATIVE' 'POSITIVE'], Encoded -> [0 1]
1. Vectorizing entire dataset (X)...
   - Generating TF features...
2. Splitting data into train/test...
3. Training and predicting...
   - Starting Random Forest training with GridSearchCV for hyperparameter tuning...
   - Using default parameter grid for tuning: {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'class_weight': [None, 'balanced']}
Fitting 5 folds for each of 72 candidates, totalling 360 fits


KeyboardInterrupt: 

In [12]:
## the dt is a dictionary that contains the results of the model, including the accuracy and F1 score
print(dt.keys())
# you can access the results using the keys of the dictionary
print("Vectorizer used: ", dt["vectorizer_name"])
print("Model used: ", dt["model_object"])
print("Accuracy: ", dt["accuracy"])



dict_keys(['model_object', 'vectorizer_name', 'vectorizer_object', 'label_encoder', 'y_test', 'y_pred', 'accuracy', 'report'])
Vectorizer used:  tf
Model used:  LogisticRegression(random_state=42)
Accuracy:  0.6768


### New Dataset for prediction
You can use the same format as the training dataset, but ensure that it contains the "Response" column for text data. The "Sentiment" column is optional for prediction datasets, as it will be generated by the model.
Make sure the dataset is saved in the "New Data" folder and is in CSV format.

In [13]:
new_data = pl.read_csv("New Data/test.csv",encoding='ISO-8859-1') #keep your file here
print(new_data.shape)
new_data= new_data.sample(fraction=0.25, shuffle=True, seed=42)
print(new_data.shape)

(55315, 4)
(13828, 4)


In [14]:
new_data = new_data.with_columns(
    pl.col(response_column).map_elements(lambda x: pre_process(x, remove_brackets=True)).alias("processed")  #add inside the map_elements
)
new_data.head(10)

  new_data = new_data.with_columns(


movieid,reviewerName,isTopCritic,reviewText,processed
str,str,bool,str,str
"""frodo_baggins_rocky_balboa_she…","""Toni Vaughn""",False,"""The result is an unsettling ta…","""result unsettling tale human p…"
"""stardust_john_mcclane""","""Carol Jennings""",False,"""Think twice about getting invo…","""think twice getting involved w…"
"""hermione_granger_sherlock_holm…","""Tara Huang""",True,"""A film that's so bloody wonder…","""film bloody wonderful meaning …"
"""astonish_valiant""","""Shelley Murillo""",False,"""...a decent setup that's emplo…","""decent setup employed progress…"
"""indiana_jones_dazzling_dorothy…","""Daniel Bond""",False,"""Inspiring? Not to me. Lamentab…","""inspiring lamentably bland exe…"
"""glimmer_hannibal_lecter_frodo_…","""Mrs. Nicole Fleming""",False,"""This perfectly executed piece …","""perfectly executed piece movie…"
"""brave_anakin_skywalker""","""Melissa Harrington""",False,"""The Richard Curtis script has …","""richard curtis script rooney m…"
"""katniss_everdeen_superman""","""Mckenzie Ortiz""",False,"""A fun-filled afternoon of dino…","""funfilled afternoon dinotastic…"
"""secret_magic_john_wick_legend""","""Samantha Ware""",False,"""I'm still not sure what this i…","""still sure supposed save bunch…"
"""gandalf_the_grey_magic_wandere…","""Seth Downs""",False,"""If the lizard ain't broken, do…","""lizard ai nt broken nt fix"""


In [15]:
make_predictions(
    new_data=new_data,
    text_column_name="processed",
    vectorizer=dt["vectorizer_object"],
    best_model=dt["model_object"],
    label_encoder=dt["label_encoder"],
    prediction_column_name="sentiment_predictions"  # Optional custom name
)

movieid,reviewerName,isTopCritic,reviewText,processed,sentiment_predictions
str,str,bool,str,str,str
"""frodo_baggins_rocky_balboa_she…","""Toni Vaughn""",false,"""The result is an unsettling ta…","""result unsettling tale human p…","""POSITIVE"""
"""stardust_john_mcclane""","""Carol Jennings""",false,"""Think twice about getting invo…","""think twice getting involved w…","""NEGATIVE"""
"""hermione_granger_sherlock_holm…","""Tara Huang""",true,"""A film that's so bloody wonder…","""film bloody wonderful meaning …","""NEGATIVE"""
"""astonish_valiant""","""Shelley Murillo""",false,"""...a decent setup that's emplo…","""decent setup employed progress…","""NEGATIVE"""
"""indiana_jones_dazzling_dorothy…","""Daniel Bond""",false,"""Inspiring? Not to me. Lamentab…","""inspiring lamentably bland exe…","""NEGATIVE"""
…,…,…,…,…,…
"""vivid_intrigue_celestial""","""Luke Reyes""",false,"""This is the film's ultimate me…","""film ultimate message nt think…","""NEGATIVE"""
"""mystery_lost_james_t._kirk_sta…","""Kathy Wade""",false,"""It's the best Sondheim adaptat…","""best sondheim adaptation sayin…","""POSITIVE"""
"""hiccup_terminator_myriad_capta…","""Troy Watson""",false,"""The South Australian actor her…","""south australian actor deliver…","""NEGATIVE"""
"""rocky_balboa_dazzling_phantom""","""Wanda Peterson""",true,"""Too slight to register as anyt…","""slight register anything stunt…","""NEGATIVE"""
