In [9]:
from pathlib import Path

import pandas as pd
import yaml

from datapreparator import DataPreparator

# Path() is the kernel's current working directory, so every relative path in
# this cell resolves against the directory the notebook is run from.
root_path = Path()

# Raw reviews in JSON Lines form (lines=True reads one JSON object per line).
# A Path object is passed rather than a bare string so that a missing file is
# reported as a file error instead of being interpreted as literal JSON text.
data_path = root_path / '../../../data/Books_10k.jsonl'
df = pd.read_json(data_path, lines=True)

# Data-preparation settings. yaml.safe_load is used because it only builds
# plain Python objects and cannot execute code embedded in the file.
config_path = root_path / '../config/dataprep_config.yaml'
with open(config_path, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Build the preparator from the YAML settings and run it on the raw frame.
# Per the original note this step cleans the data and creates the label
# column; later cells read 'sentence' and 'sentiment' from the result.
prep = DataPreparator(config)
df_prepared = prep.transform(df)



In [10]:
from transformation import TextTfidfTransformer
from sklearn.model_selection import train_test_split

# Settings for the TF-IDF step, parsed with yaml.safe_load (no code execution).
config_transf_path = root_path / '../config/transformation_config.yaml'
with open(config_transf_path, "r", encoding="utf-8") as transf_file:
    config_transf = yaml.safe_load(transf_file)

# Step 2 — Transform to TF-IDF
tfidf_tr = TextTfidfTransformer(config_transf)

# Columns of df_prepared used as model input and as target.
TEXT_COL = "sentence"
LABEL_COL = "sentiment"

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# partition is repeatable.
# NOTE(review): df_prepared appears to hold one row per sentence; if so,
# sentences from the same review can land on both sides of this row-level
# split — confirm whether a group-aware split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    df_prepared[TEXT_COL],
    df_prepared[LABEL_COL],
    test_size=0.2,
    random_state=42,
    stratify=df_prepared[LABEL_COL],
)

# fit_transform is called on the training texts only; the test texts go through
# transform with the already-fitted transformer.
X = tfidf_tr.fit_transform(X_train)
X_t = tfidf_tr.transform(X_test)

# Short aliases for the label vectors that belong to X and X_t.
y, y_t = y_train, y_test

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline classifier on the TF-IDF features. class_weight="balanced" weights
# classes inversely to their frequency; max_iter is raised above the
# scikit-learn default of 100.
lgr = LogisticRegression(max_iter=1000, class_weight="balanced")
lgr.fit(X, y)

# Evaluate against y_t, the labels that were captured together with X_t.
# A later cell rebinds y_test to a different split, so reading y_test here
# could pair these predictions with the wrong labels when cells are run out of
# order; y_t always matches X_t.
preds = lgr.predict(X_t)
print("Score:", lgr.score(X_t, y_t))
print()
print(classification_report(y_t, preds))


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


0.5925818392973797

              precision    recall  f1-score   support

           0       0.63      0.61      0.62      4744
           1       0.40      0.42      0.41      3116
           2       0.67      0.67      0.67      5917

    accuracy                           0.59     13777
   macro avg       0.57      0.57      0.57     13777
weighted avg       0.60      0.59      0.59     13777



In [36]:
# Reviews whose verified_purchase flag is False.
df.loc[df['verified_purchase'].eq(False)]

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
3,1,Written From a Lens of Fear.,Only read and believe things you want to see c...,[],0929385225,0929385225,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,2021-04-05 01:16:52.328,0,False
15,1,"badly and hastily written, without explanation...",My interest is mostly on deep and reinforcemen...,[],1484228650,1484228650,AEIOIVFNO6QX7OYOJNPRKQEWMPGA,2019-06-19 15:13:27.397,1,False
19,1,worst ever,worst woods book i've read. and i have read th...,[],0399169113,0399169113,AFIFCPYKA56DJ26GZ6TKSN5KR6BQ,2014-08-25 19:49:38.000,3,False
23,1,Another story where women have no interests be...,I just finished this book and was sorely disap...,[],1501196022,1501196022,AHSDSJIJUZVBJXQJIKFJVCAOPGYA,2022-03-07 10:27:55.130,0,False
24,1,"Catchy promises, little delivery.","I breezed through this book really fast, more ...",[],B00NIFXWH2,B00NIFXWH2,AHSDSJIJUZVBJXQJIKFJVCAOPGYA,2018-09-04 17:46:06.617,0,False
...,...,...,...,...,...,...,...,...,...,...
9994,2,Bewildering,"I've heard of the 'Mary Celeste', including va...",[],0385533500,0385533500,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2013-12-30 20:31:58.000,10,False
9996,2,Confusing,The story takes place over the past fifty year...,[],0544077792,0544077792,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2013-03-28 02:06:55.000,1,False
9997,2,The Case of the Missing Plot,I picked this up because the premise sounded i...,[],B0048EL7YW,B0048EL7YW,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-12-05 19:07:48.000,0,False
9998,2,Poorly written biography of a fascinating woman,Calling Julia Child 'remarkable' is an underst...,[],0307272222,0307272222,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-12-01 19:41:44.000,37,False


In [34]:
# All reviews written by one specific user id.
df.loc[df['user_id'].eq('AETE7Y3DZT6BLMWA6U27ADJDZ4LA')]

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
3612,5,Wonderful Read,Great book in this series,[],0553391984,0553391984,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2021-01-18 22:43:08.160,0,True
3613,5,Good Read / Series,Great part of this series.,[],0778369870,0778369870,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2021-01-18 22:29:47.862,0,True
3614,5,Great Read / Author,Great Read,[],B00HTJUN7K,B00HTJUN7K,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2020-12-17 22:09:37.040,0,True
3615,5,Excellent Read,Excellent Read. Great Price,[],0778361012,0778361012,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2020-11-14 20:11:10.878,0,True
3616,5,GREAT READ,Stellar Writing. Great Price from this vendor.,[],0778319024,0778319024,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2020-11-14 20:09:55.465,0,True
3617,5,EDGE OF SEAT READ,Excellent Read . Fantastic Price,[],0778319180,0778319180,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2020-11-14 20:08:12.286,0,True
3618,5,STELLAR READ BEAUTIFULLY ILLUSTRATED,Stellar Read & Explanations of Jewish Holy Day...,[],0983532710,0983532710,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2020-11-04 19:39:58.041,0,True
3619,5,BEAUTIFULLY CREATED * SUMPTUOUS RECIPIES * SOU...,Treated Myself to this Adeena’ s Latest Cookbo...,[],0525533451,0525533451,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2020-08-09 01:43:55.028,3,True
3620,5,Five Stars,100% positive,[],1451660847,1451660847,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2015-01-01 20:23:42.000,0,True
3621,5,Five Stars,100% positive,[],159077034X,159077034X,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2015-01-01 20:23:27.000,0,True


In [33]:
# Rows whose (text, user_id) pair already appeared earlier in the frame;
# duplicated() leaves the first occurrence of each pair unflagged.
# Sorted by text so repeats sit next to each other.
(
    df.loc[df.duplicated(subset=['text', 'user_id'])]
    .sort_values(by='text')
)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
3621,5,Five Stars,100% positive,[],159077034X,159077034X,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2015-01-01 20:23:27.000,0,True
4705,4,Language Arts 2nd Grade Workbook,A Skill Practice workbook for 2nd graders. It...,[],1483841707,1483841707,AG7JSIKUQPSL5AAPRDRKDNWM4U2A,2018-04-11 12:19:19.952,0,False
1906,1,One Star,Book damaged. Unacceptable.,[],1619635186,1619635186,AHMWFC6ZWFFJKSIPMCAOTOWGE7HA,2017-05-25 00:01:55.000,0,True
1905,1,One Star,Book damaged. Unacceptable.,[],0147516145,0147516145,AHMWFC6ZWFFJKSIPMCAOTOWGE7HA,2017-05-25 00:02:14.000,0,True
2270,5,"very, very well received",Bought these for daughter (33 years old) who i...,[],1937994775,1937994775,AFZUK3MTBIBEDQOPAK3OATUOUKLA,2016-01-13 21:44:25.000,0,True
...,...,...,...,...,...,...,...,...,...,...
9232,2,no weaving together==just a bunch of interesti...,so disappointed in this series. the writer is ...,[],0553381695,0553381695,AEGT45MQAI7LJ5IQRHEUFRQMKGIA,2013-08-10 17:54:26.000,0,True
9739,2,terrible,terrible,[],B00H25FJ20,B00H25FJ20,AGYCUB73SSFFXPWLK7MJR6NUGWOA,2016-02-03 20:34:27.000,0,True
3958,5,great,"this is a wonderful book, everything I expected",[],0396081614,0396081614,AGUWL2R2JFLC3K65HLD6AHJV3KBA,2020-09-02 21:19:42.037,0,True
3957,5,great,"this is a wonderful book, everything I expected",[],1101982454,1101982454,AGUWL2R2JFLC3K65HLD6AHJV3KBA,2020-09-02 21:20:57.158,0,True


In [25]:
# All reviews recorded for one specific product (asin).
df.loc[df['asin'].eq('B0BMT2PL6G')]

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
697,1,Deserves ZERO stars,Poorly written. Not much useful information. ...,[],B0BMT2PL6G,B0BMT2PL6G,AFZGIAQOQORI5HDS22LMNLE422OA,2023-02-21 19:38:52.495,0,True
699,1,Deserves ZERO stars,Poorly written. Not much useful information. ...,[],B0BMT2PL6G,B0BMT2PL6G,AFZGIAQOQORI5HDS22LMNLE422OA,2023-02-21 19:38:52.495,0,True


In [27]:
# Rows whose review text already appeared earlier in the frame, regardless of
# who wrote it; the first occurrence of each text is not flagged.
(
    df.loc[df.duplicated(subset=['text'])]
    .sort_values(by='text')
)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
3621,5,Five Stars,100% positive,[],159077034X,159077034X,AETE7Y3DZT6BLMWA6U27ADJDZ4LA,2015-01-01 20:23:27.000,0,True
4705,4,Language Arts 2nd Grade Workbook,A Skill Practice workbook for 2nd graders. It...,[],1483841707,1483841707,AG7JSIKUQPSL5AAPRDRKDNWM4U2A,2018-04-11 12:19:19.952,0,False
1906,1,One Star,Book damaged. Unacceptable.,[],1619635186,1619635186,AHMWFC6ZWFFJKSIPMCAOTOWGE7HA,2017-05-25 00:01:55.000,0,True
1905,1,One Star,Book damaged. Unacceptable.,[],0147516145,0147516145,AHMWFC6ZWFFJKSIPMCAOTOWGE7HA,2017-05-25 00:02:14.000,0,True
9795,2,Plain. Cheap,Boring,[],1780553331,1780553331,AHU2QGPRV4UIPYLVHEHUHB2YHIVQ,2019-01-10 08:29:41.964,0,True
...,...,...,...,...,...,...,...,...,...,...
9233,2,no cohesion or interweaving,so disappointed in this series. the writer is ...,[],055358202X,055358202X,AEGT45MQAI7LJ5IQRHEUFRQMKGIA,2013-08-10 17:53:22.000,3,True
9739,2,terrible,terrible,[],B00H25FJ20,B00H25FJ20,AGYCUB73SSFFXPWLK7MJR6NUGWOA,2016-02-03 20:34:27.000,0,True
3958,5,great,"this is a wonderful book, everything I expected",[],0396081614,0396081614,AGUWL2R2JFLC3K65HLD6AHJV3KBA,2020-09-02 21:19:42.037,0,True
3957,5,great,"this is a wonderful book, everything I expected",[],1101982454,1101982454,AGUWL2R2JFLC3K65HLD6AHJV3KBA,2020-09-02 21:20:57.158,0,True


In [19]:
# Display the de-duplicated frame. In the visible notebook `orig_df` is only
# assigned by the drop_duplicates cell further down, so on a fresh kernel this
# cell works only after that cell has been run.
orig_df

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,sentiment,sentence
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,0,It is definitely not a watercolor book.
22,1,Missing the sketch pad,Missing the sketch pad. Even worse I realized ...,[],1631591290,1631591290,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-08-05 04:28:04.910,0,True,0,Missing the sketch pad.
24,1,Crease down entire side of every page!!!,Every page has a crease running the entire len...,[{'small_image_url': 'https://images-na.ssl-im...,1780671067,1780671067,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-01-26 01:07:03.325,2,True,0,Every page has a crease running the entire len...
25,1,Written From a Lens of Fear.,Only read and believe things you want to see c...,[],0929385225,0929385225,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,2021-04-05 01:16:52.328,0,False,0,Only read and believe things you want to see c...
28,1,Good if your little one is unsure/scared of th...,My little one just likes doctors so I thought ...,[],0593426452,0593426452,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2022-03-18 04:24:46.871,1,True,0,My little one just likes doctors so I thought ...
...,...,...,...,...,...,...,...,...,...,...,...,...
68828,2,Worth every cent I paid,Apparently there are readers who enjoyed this ...,[],B00AAGZ1S0,B00AAGZ1S0,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2013-06-24 00:18:07.000,2,True,0,Apparently there are readers who enjoyed this ...
68832,2,Confusing,The story takes place over the past fifty year...,[],0544077792,0544077792,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2013-03-28 02:06:55.000,1,False,0,The story takes place over the past fifty year...
68840,2,The Case of the Missing Plot,I picked this up because the premise sounded i...,[],B0048EL7YW,B0048EL7YW,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-12-05 19:07:48.000,0,False,0,I picked this up because the premise sounded i...
68849,2,Poorly written biography of a fascinating woman,Calling Julia Child 'remarkable' is an underst...,[],0307272222,0307272222,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-12-01 19:41:44.000,37,False,0,Calling Julia Child 'remarkable' is an underst...


In [37]:
# Switch the model input from the 'sentence' column to the 'text' column.
# NOTE: this rebinds TEXT_COL and the X_train / X_test / y_train / y_test names
# set by the earlier split cell. X, y, X_t and y_t are not updated here and
# still refer to the earlier split.
TEXT_COL = 'text'

# Keep one row per distinct (text, label) pair. The de-dup key is built from
# the same TEXT_COL / LABEL_COL constants the split uses, so the two cannot
# drift apart if a column name changes.
orig_df = df_prepared.drop_duplicates(subset=[TEXT_COL, LABEL_COL])

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    orig_df[TEXT_COL],
    orig_df[LABEL_COL],
    test_size=0.2,
    random_state=42,
    stratify=orig_df[LABEL_COL],
)