In [1]:
# Load the packages.
import polars as pl
# Put the current working directory at the front of sys.path so that the
# project-local `tools` package imported below can be resolved.
# NOTE(review): this assumes the notebook is started from the project root — confirm.
import sys
import os
project_root = os.getcwd()
sys.path.insert(0, project_root)
# Three helper scripts handle pre-processing, training and prediction:
#   - tools/preprocess.py
#   - tools/pipeline.py
#   - tools/predict.py
# pre_process cleans the text data; several options are available, see tools/preprocess.py for details.
# run_pipeline runs the sentiment-analysis pipeline: it takes the training data plus the names of the
# vectorizer and machine-learning method, and returns the results.
from tools.preprocess import pre_process
# run_pipeline runs the sentiment analysis on the training data and returns the results.
from tools.pipeline import run_pipeline
# make_predictions runs the sentiment analysis on new data and returns the predictions.
from tools.predict import make_predictions


NLTK data 'punkt' already present.
NLTK data 'stopwords' already present.
Downloading NLTK data: wordnet...
NLTK data 'wordnet' downloaded.
Downloading NLTK data: omw-1.4...
NLTK data 'omw-1.4' downloaded.


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meala\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Location of the labelled training set; point this at your own training CSV.
path1 = "Training Data/Train.csv"
# The file is read as a UTF-8 CSV whose first row holds the column names.
df_train = pl.read_csv(
    path1,
    has_header=True,
    encoding="utf8",
)

In [3]:
# Column names used by the cells below; change them to match your own data.
# Name of the column holding the sentiment label.
sentiment_column = "sentiment"
# Name of the column holding the raw review text.
response_column = "reviewText"

In [4]:
# Clean every review with pre_process and store the result in a new "processed" column.
# - pre_process is passed directly; wrapping it in a lambda added nothing.
# - return_dtype is stated explicitly so polars does not have to infer the output
#   type from the data, which it warns about when the argument is omitted.
# NOTE(review): assumes pre_process returns a string for each review — confirm in tools/preprocess.py.
df_train = df_train.with_columns(
    pl.col(response_column)
    .map_elements(pre_process, return_dtype=pl.String)
    .alias("processed")
)

  df_train = df_train.with_columns(


In [15]:
# Train and evaluate a sentiment model on the pre-processed training data.
# The returned object is indexed later by the keys "vectorizer_object",
# "model_object" and "label_encoder" when predicting on new data.
# NOTE(review): the run recorded under this cell stopped at max_iter=100 without
# converging; whether run_pipeline lets you raise that limit is not visible from
# here — check tools/pipeline.py.
dt = run_pipeline(
    vectorizer_name="BOW",  # one of: BOW, tf, tfidf, wv
    model_name="logit",  # one of: logit, rf, XGB (XGB is slow; not recommended for routine runs)
    df=df_train,
    text_column_name="processed",  # column holding the cleaned text
    # Use the label column configured above instead of a second hard-coded
    # "sentiment" literal, so changing the setting actually takes effect.
    sentiment_column_name=sentiment_column,
    # Set to True to run hyperparameter tuning; it takes longer and may run
    # out of memory on large datasets.
    perform_tuning=False,
)

--- Running Pipeline for Bow + Logit ---
Labels encoded: Original -> ['NEGATIVE' 'POSITIVE'], Encoded -> [0 1]
1. Vectorizing entire dataset (X)...
   - Generating Bag-of-Words features...
2. Splitting data into train/test...
3. Training and predicting...
   - Training Logistic Regression with default parameters (no hyperparameter tuning)...
   - Model trained with default parameters.
Best model parameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
4. Evaluating model...

Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.74      0.63      0.68     10319
    POSITIVE       0.83      0.89      0.86     20944

    accuracy                           0.80     31263
   macro avg       0.78      0.76      0.77

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Location of the unlabelled data to score; point this at your own CSV.
path2 = "New Data/Test.csv"
# The file is read as a UTF-8 CSV whose first row holds the column names.
df_test = pl.read_csv(
    path2,
    has_header=True,
    encoding="utf8",
)

In [17]:
# Clean the new reviews exactly the way the training reviews were cleaned.
# The training cell calls pre_process with its default options, so the extra
# remove_brackets=True that used to be passed here is dropped: the fitted
# vectorizer and model should see text prepared the same way at prediction time.
# return_dtype is stated explicitly so polars does not have to infer the output type.
# NOTE(review): assumes pre_process returns a string for each review — confirm in tools/preprocess.py.
new_data = df_test.with_columns(
    pl.col(response_column)
    .map_elements(pre_process, return_dtype=pl.String)
    .alias("processed")
)

  new_data = df_test.with_columns(


In [18]:
# Score the new data with the artefacts produced by the training run above:
# the fitted vectorizer, the trained model and the label encoder stored in `dt`.
sentiments_prediction = make_predictions(
    vectorizer=dt["vectorizer_object"],
    best_model=dt["model_object"],
    label_encoder=dt["label_encoder"],
    new_data=new_data,
    text_column_name="processed",
    # Optional: custom name for the column that will hold the predictions.
    prediction_column_name="sentiment_predictions",
)

In [None]:
# Write the predictions out as a CSV file next to the new data.
# NOTE(review): the original note here says rows with missing text are removed;
# that presumably happens upstream in make_predictions — confirm in tools/predict.py.
sentiments_prediction.write_csv(file="New Data/sentiments_prediction.csv")