In [1]:
#load the packages
import polars as pl
#make sure that this is the main file
import sys
import os
import nltk
project_root = os.getcwd()
sys.path.append(project_root)

### Training Dataset


In [2]:
# keep you training dataset in the training data folder
# this template uses csv files 
# column names can be set in Python but this template does not automatically update the column for the demo 
# however, the function will give you the option to tell column names for the text and label data

df_train = pl.read_csv("Training Data/Train.csv",encoding='ISO-8859-1') 
print(f"Dataset shape: {df_train.shape[0]} rows and {df_train.shape[1]} columns")


Dataset shape: 162758 rows and 5 columns


### DEMO

In [3]:
df_train.head()
df_train.fill_null("")

df_train

movieid,reviewerName,isFrequentReviewer,reviewText,sentiment
str,str,bool,str,str
"""marvelous_pirate""","""Benjamin Henry""",false,"""Henry Selickâs first movie s…","""POSITIVE"""
"""tony_montana_frodo_baggins_v_r…","""Felicia Lopez""",false,"""With a cast that reads like th…","""NEGATIVE"""
"""darth_vader_katniss_everdeen_s…","""Mr. Charles Burgess""",true,"""Creed II does not give us anyt…","""POSITIVE"""
"""lara_croft_glimmer""","""Ryan Barrett""",false,"""I know what you're thinking, b…","""POSITIVE"""
"""jason_bourne_surreal_the_termi…","""Alexander Glover""",false,"""Director Fernando Meirelles te…","""POSITIVE"""
…,…,…,…,…
"""the_joker_ethereal_captain_jac…","""Danny Mueller""",false,"""A top-notch thriller with genu…","""POSITIVE"""
"""e.t._hannibal_lecter_vito_corl…","""Jennifer Clayton""",true,"""Some people find Derek Zooland…","""NEGATIVE"""
"""infinite_enigma_luke_skywalker""","""Bryan Wilson""",false,"""This fun, gentle comedy focuse…","""POSITIVE"""
"""emerald_oracle_iron_man_wolver…","""Erik Parker""",false,"""The film is rescued by a stron…","""NEGATIVE"""


In [4]:
# here I have three python script I built to pre_process the data and running the pipeline
# you can find the code in the tools/preprocess.py file
# you can find  the code in the tools/pipeline.py file
# the pre_process function is used to clean the text data, there are various options available, please check the tools/preprocess.py file for details
# the run_pipeline function is used to run the sentimental analysis pipeline, it takes the training data and the vectorizer and machine learning methods as input, and returns the results
import importlib
from tools.preprocess import pre_process
#this function will run the sentimental analysis in the training data and return the results
from tools.pipeline import run_pipeline
# this function will run the sentimental analysis in the new data and return the predictions
from tools.predict import predict_pipeline



NLTK data 'punkt' already present.
NLTK data 'stopwords' already present.
Downloading NLTK data: wordnet...
NLTK data 'wordnet' downloaded.
Downloading NLTK data: omw-1.4...
NLTK data 'omw-1.4' downloaded.
NLTK data 'wordnet' downloaded.
Downloading NLTK data: omw-1.4...
NLTK data 'omw-1.4' downloaded.


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# you can use the pre_process function to clean the text data
response_column = "reviewText" # feel free to change the column name to your text column name
sentiment_column = "sentiment" # feel free to change the column name to your label column name
df_train[response_column][0] , pre_process(df_train[response_column][0]) 

('Henry Selickâ\x80\x99s first movie since 2009â\x80\x99s Coraline. His fifth stop-motion masterpiece.',
 'henry selickas first movie since 2009as coraline fifth stopmotion masterpiece',
 'henry selickas first movie since 2009as coraline fifth stopmotion masterpiece')

In [6]:
# make changes as necessary
# inside the map_elements, add  the parameters [pre_process(x, parameters_to_be_added)] and set it True/False if it differs from the defualt value
df_train = df_train.with_columns(
    pl.col(response_column).map_elements(lambda x: pre_process(x, remove_brackets=True)).alias("processed")  #add inside the map_elements
)
df_train.head(10)

  df_train = df_train.with_columns(


movieid,reviewerName,isFrequentReviewer,reviewText,sentiment,processed
str,str,bool,str,str,str
"""marvelous_pirate""","""Benjamin Henry""",False,"""Henry Selickâs first movie s…","""POSITIVE""","""henry selickas first movie sin…"
"""tony_montana_frodo_baggins_v_r…","""Felicia Lopez""",False,"""With a cast that reads like th…","""NEGATIVE""","""cast read like vogue oscar par…"
"""darth_vader_katniss_everdeen_s…","""Mr. Charles Burgess""",True,"""Creed II does not give us anyt…","""POSITIVE""","""creed ii give u anything anoth…"
"""lara_croft_glimmer""","""Ryan Barrett""",False,"""I know what you're thinking, b…","""POSITIVE""","""know thinking limitless bradle…"
"""jason_bourne_surreal_the_termi…","""Alexander Glover""",False,"""Director Fernando Meirelles te…","""POSITIVE""","""director fernando meirelles te…"
"""enigma_mystique_secret""","""Morgan Hurst""",True,"""""Kajillionaire"" is a rich piec…","""POSITIVE""","""kajillionaire rich piece story…"
"""indiana_jones_sherlock_holmes_…","""Kari Wolf""",False,"""A heartfelt story with a lovel…","""POSITIVE""","""heartfelt story lovely perform…"
"""john_mcclane_james_t._kirk_ben…","""Johnny Caldwell""",False,"""If a bit long for a cartoon fe…","""POSITIVE""","""bit long cartoon feature proba…"
"""starlight_travis_bickle_tyler_…","""Michael Chavez""",True,"""Anchored by a charming perform…","""POSITIVE""","""anchored charming performance …"
"""frodo_baggins_quest_darth_vade…","""Catherine Clements""",False,"""It's largely a Hanks solo show…","""POSITIVE""","""largely hank solo show beloved…"


In [None]:
#### in this template, there are four text representation / vectorizer methods available 
#### #in the function run_pipeline, we shall make use of this, write the words inside [ ] for the methods you want to use
#### 1. Bag of Words [BOW] 
#### 2. Term Frequency [tf]
#### 3. TF -IDF    [tfidf]
#### 4. Word Embedding using Word2Vec (you can use other packages with slight changes) [wv] 
         # Word Embedding uses defualt 300 values; this will take some time to run

In [None]:
#### in this template, there are also three machine learning methods that can be used
#### 1. Logistic Regression [logit]
#### 2. Random forest (recommended) (rf)
#### 3. XGBoosting  [XGB](word embedding and XGBoost may take long time to complete, combination of both is not recommended in local machine)

#I will keep this repository updated, and I will add more methods in the future

In [10]:
# this is the example of how to use the function
# you can change the vectorizer_name and model_name to the ones you want to use
# for now we will use word embedding and logistic regression
# write the name of your columns in the text_column_name and sentiment_column_name
# the text_column_name is the column name of the text data, and sentiment_column_name is

# run_pipeline function will return the dataframe with the vectorized text, vectorizer used  and the model
# it will also print the results of the model, including the accuracy and F1 score
dt= run_pipeline(
    vectorizer_name="wv", # BOW, tf, tfidf, wv
    model_name="logit", # logit, rf, XGB .#XGB takes long time, can not recommend using it on normal case
    df=df_train,
    text_column_name="processed",  # this is the column name of the text data, 
    sentiment_column_name = "sentiment"  # this is the column name of the label data,
)


--- Running Pipeline for Wv + Logit ---
Labels encoded: Original -> ['NEGATIVE' 'POSITIVE'], Encoded -> [0 1]
1. Vectorizing entire dataset (X)...
Loading pre-trained word2vec-google-news-300 model (this may take a few minutes)...
Word2Vec model loaded.
2. Splitting data into train/test...
3. Training and predicting...
   - Starting Logistic Regression training with GridSearchCV for hyperparameter tuning...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

   - Best Hyperparameters found:
{'C': 10.0, 'class_weight': None, 'max_iter': 500, 'solver': 'liblinear'}
   - Best Cross-Validation Score (F1-weighted): 0.7592
Best model parameters: {'C': 10.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
4. Evaluating model...

Classification Report:
        

In [12]:
## the dt is a dictionary that contains the results of the model, including the accuracy and F1 score
print(dt.keys())
# you can access the results using the keys of the dictionary
print("Vectorizer used: ", dt["vectorizer_name"])
print("Model used: ", dt["model_object"])
print("Accuracy: ", dt["accuracy"])



dict_keys(['model_object', 'vectorizer_name', 'vectorizer_object', 'label_encoder', 'y_test', 'y_pred', 'accuracy', 'report'])
Vectorizer used:  wv
Model used:  LogisticRegression(C=10.0, max_iter=500, random_state=42, solver='liblinear')
Accuracy:  0.7724146754949941


### New Dataset for prediction
You can use the same format as the training dataset, but ensure that it contains the "Response" column for text data. The "Sentiment" column is optional for prediction datasets, as it will be generated by the model.
Make sure the dataset is saved in the "New Data" folder and is in CSV format.

In [18]:
new_data = pl.read_csv("New Data/test.csv",encoding='ISO-8859-1') #keep your file here
new_data.head(5)

movieid,reviewerName,isTopCritic,reviewText
str,str,bool,str
"""legend_marty_mcfly_oracle""","""John Kim""",False,"""Green slowly cranks up the dre…"
"""terminator_katniss_everdeen_gl…","""Brian Chaney""",False,"""Philip Noyce's direction is el…"
"""james_bond_labyrinth_gollum""","""Danielle Parker""",False,"""It wouldn't do to say what pat…"
"""v_quest_han_solo_wondrous""","""Brittany Lane""",False,"""Pig is not exactly the arthous…"
"""enigma_hulk_surreal_starlight""","""Justin Willis""",False,"""An imaginative no-budget music…"


In [19]:
new_data = new_data.with_columns(
    pl.col("reviewText").map_elements(pre_process).alias("processed")  #add inside the map_elements
)
new_data.head(5)

  new_data = new_data.with_columns(


movieid,reviewerName,isTopCritic,reviewText,processed
str,str,bool,str,str
"""legend_marty_mcfly_oracle""","""John Kim""",False,"""Green slowly cranks up the dre…","""green slowly crank dread style…"
"""terminator_katniss_everdeen_gl…","""Brian Chaney""",False,"""Philip Noyce's direction is el…","""philip noyce direction elegant…"
"""james_bond_labyrinth_gollum""","""Danielle Parker""",False,"""It wouldn't do to say what pat…","""would nt say path maria ultima…"
"""v_quest_han_solo_wondrous""","""Brittany Lane""",False,"""Pig is not exactly the arthous…","""pig exactly arthouse john wick…"
"""enigma_hulk_surreal_starlight""","""Justin Willis""",False,"""An imaginative no-budget music…","""imaginative nobudget musical s…"


In [21]:
# DO NOT CHANGE THE CODE BELOW
from sklearn.preprocessing import LabelEncoder

vectorizer_func = dt["vectorizer_name"] 
ml_model=dt["model_object"]

new_data = new_data.with_columns(
    pl.Series(name="predictions", values=predict_pipeline(
        df = new_data,
        text_column_name = "processed",  # this is the column name of the text data in the new data
        vectorizer_func = vectorizer_func,  # this is the vectorizer function used in the training data
        ml_model = ml_model))
)

# Convert numeric predictions to letter labels
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
new_data = new_data.with_columns(
    pl.col("predictions").map_elements(lambda x: label_map.get(x, x)).alias("predictions_label")
)

new_data.head(25)  # Display the first 10 rows of the DataFrame with predictions


Using already loaded Word2Vec model.
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\meala\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\meala\AppData\Local\Temp\ipykernel_22108\1753209415.py", line 8, in <module>
    pl.Series(name="predictions", values=predict_pipeline(
                                         ^^^^^^^^^^^^^^^^^
  File "c:\Users\meala\Dropbox\Data Work\Text NLP\tools\predict.py", line 32, in predict_pipeline
  File "c:\Users\meala\Dropbox\Data Work\Text NLP\Vect\wv.py", line 33, in vectorize
    tokenized = [sentence.split() for sentence in texts]
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\meala\Dropbox\Data Work\Text NLP\Vect\wv.py", line 33, in <listcomp>
    tokenized = [sentence.split() for sentence in texts]
                 ^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'split'

During handling of the above exception, another exception 