# Tokenization

In [1]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [2]:
import keras
import tensorflow as tf

print("Keras version:", keras.__version__)
print("TensorFlow version:", tf.__version__)


Keras version: 3.6.0
TensorFlow version: 2.18.0


In [3]:
pip install tf-keras





In [1]:
import spacy
import pandas as pd

In [13]:
df = pd.read_csv("twitter_training.csv")
df = df.set_axis(["S.no","Game","review","para"],axis = 1)
data = df
data.head()

Unnamed: 0,S.no,Game,review,para
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [11]:
nlp = spacy.load("en_core_web_sm")
val = nlp("hello i am under the water please help me. mr.dwara there two much raining uhhhhh.take this 2$")
for i in val:
    print(i)
  

hello
i
am
under
the
water
please
help
me
.
mr.dwara
there
two
much
raining
uhhhhh.take
this
2
$


In [7]:
tok = val[-1]
tok

$

In [8]:
tok.like_num

False

In [9]:
tok.is_currency

True

In [15]:
o = ' '.join(df["para"].tail())
count = 0
doc =  nlp(o)
for i in doc.sents:
    print(i.text)
    count = count + 1
count

Just realized that the Windows partition of my Mac is like 6 years behind Nvidia drivers and I have no idea how I did not notice Just realized that my Mac window partition is 6 years behind on Nvidia drivers and I have no idea how I didn't notice Just realized the windows partition of my Mac is now 6 years behind on Nvidia drivers and I have no idea how he didn’t notice Just realized between the windows partition of my Mac is like being 6 years behind on Nvidia drivers and cars I have no fucking idea how I ever didn ’ t notice Just like the windows partition of my Mac is like 6 years behind on its drivers
So you have no idea how I didn’t notice


2

In [11]:
df.tail()

Unnamed: 0,2401,Borderlands,Positive,para
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...
74680,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


# pipline

In [12]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [13]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1a8d6f5cef0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1a8d6f5c4d0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1a8d7d2c5f0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1a8d6f6f810>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1a8d6188d50>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1a8d7d2c7b0>)]

In [31]:
val = nlp("ate i am under the water please help me. there two much raining uhhhhh. take this 2$")
for i in val:
    print(i," | ",i.pos_," | ",i.lemma_," | ")

ate  |  VERB  |  eat  | 
i  |  PRON  |  I  | 
am  |  AUX  |  be  | 
under  |  ADP  |  under  | 
the  |  DET  |  the  | 
water  |  NOUN  |  water  | 
please  |  INTJ  |  please  | 
help  |  VERB  |  help  | 
me  |  PRON  |  I  | 
.  |  PUNCT  |  .  | 
there  |  ADV  |  there  | 
two  |  NUM  |  two  | 
much  |  ADJ  |  much  | 
raining  |  VERB  |  rain  | 
uhhhhh  |  ADJ  |  uhhhhh  | 
.  |  PUNCT  |  .  | 
take  |  VERB  |  take  | 
this  |  PRON  |  this  | 
2  |  NUM  |  2  | 
$  |  SYM  |  $  | 


In [15]:
for i in val.ents:
    print(i," | ",i.label_," | ",spacy.explain(i.label_))

two  |  CARDINAL  |  Numerals that do not fall under another type
2$  |  MONEY  |  Monetary values, including unit


# Bag of words

In [16]:
data["Positive"].value_counts()/len(data)*100

Positive
Negative      30.184384
Positive      27.893306
Neutral       24.528327
Irrelevant    17.393982
Name: count, dtype: float64

In [17]:
data.isnull().sum()

2401             0
Borderlands      0
Positive         0
para           686
dtype: int64

In [18]:
data = data.dropna()

In [19]:
data.isnull().sum()

2401           0
Borderlands    0
Positive       0
para           0
dtype: int64

In [20]:
def new_col(x):
    if x == "Positive":
        return 1
    elif x == "Negative":
        return 0
    elif x == "Neutral":
        return 2
    else:
        return 3

In [21]:
data["outcome"] = data["Positive"].apply(lambda x: new_col(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["outcome"] = data["Positive"].apply(lambda x: new_col(x))


In [22]:
from sklearn.model_selection import train_test_split as tts
x_train,x_test,y_train,y_test = tts(data.para,data.outcome,test_size=0.2)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((59196,), (14799,), (59196,), (14799,))

In [23]:
from sklearn.feature_extraction.text import CountVectorizer 
cv = CountVectorizer(stop_words = "english")
x_train_cv = cv.fit_transform(x_train)
x_test_cv = cv.transform(x_test)


In [24]:
cv.get_feature_names_out()[1000:1010]

array(['3kg', '3kxysblxd', '3kxysblxxd', '3ldclwb', '3lghp0e', '3lmexir',
       '3lpelw7', '3lrfp5w', '3lvkba9', '3m'], dtype=object)

In [25]:
cv.vocabulary_

{'johnson': 14427,
 'pic': 19593,
 'year': 29036,
 '1915': 304,
 'armenian': 2888,
 'killed': 14879,
 'ottoman': 18820,
 'civils': 5797,
 'babies': 3421,
 'having': 12398,
 'land': 15208,
 'armenia': 2887,
 'says': 22725,
 'genocide': 11321,
 'devils': 7897,
 'reaction': 21159,
 'twitter': 26800,
 'com': 6142,
 'qbqhilg42t': 20716,
 '2ksupport': 634,
 'nba2k': 17743,
 'small': 23863,
 'boy': 4522,
 'new': 17876,
 'experience': 9671,
 'game': 11099,
 'relies': 21536,
 'teach': 25498,
 'play': 19749,
 'banned': 3570,
 'creating': 6860,
 'ticket': 26041,
 'figure': 10237,
 'got': 11764,
 'reply': 21658,
 'week': 28215,
 'later': 15272,
 'claiming': 5812,
 'boosting': 4428,
 'doesnt': 8348,
 'know': 14993,
 'taught': 25438,
 'rainbow6game': 20972,
 'internet': 13842,
 'disconnected': 8102,
 'unranked': 27211,
 'restart': 21793,
 'restarting': 21795,
 'attempting': 3152,
 'reconnect': 21277,
 'cause': 5294,
 'server': 23119,
 'start': 24542,
 'troubles': 26589,
 'friends': 10844,
 'complain

In [26]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train_cv,y_train)

In [27]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test_cv)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79      4453
           1       0.74      0.80      0.77      4193
           2       0.84      0.67      0.74      3635
           3       0.85      0.65      0.74      2518

    accuracy                           0.76     14799
   macro avg       0.79      0.75      0.76     14799
weighted avg       0.77      0.76      0.76     14799



In [28]:
email = [""]
val = cv.transform(email)
x = model.predict(val)
if x == 1:
    print("Positive") 
elif x == 0:
    print("Negative")
elif x == 2:
    print("Neutral")
else:
    print("Irrelavent")

Negative


# TF - IDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
v = TfidfVectorizer(stop_words="english")
val = v.fit_transform(data["para"])
for i in v.vocabulary_:
    index = v.vocabulary_.get(i)
    print(i," | ",v.idf_[index])


coming  |  6.04634846243986
borders  |  9.646816959209742
kill  |  6.198051160628478
im  |  5.708976270755656
getting  |  5.080067806204369
borderlands  |  4.904564001906541
murder  |  7.949086439629964
spent  |  6.964742244510793
hours  |  5.907317514249298
making  |  5.770819776038359
fun  |  4.883985778249649
don  |  4.442810272132947
know  |  4.779282508754161
huge  |  6.185900342845965
fan  |  6.403623826690836
maya  |  8.915929450666951
favorite  |  5.8470155598193685
characters  |  6.77804431311704
decided  |  6.813603615153527
make  |  4.824676081014523
wallpaper  |  9.50371611556907
pc  |  5.62283983913776
original  |  6.773687007748085
image  |  7.921306875522888
versus  |  9.26732733750484
creation  |  10.132324774991444
enjoy  |  6.082716106610735
pic  |  3.8627551184952775
twitter  |  3.8751362290341334
com  |  3.02602606156333
mlsi5wf9jg  |  10.602328404237179
couple  |  7.149171283644313
doing  |  5.605116130473064
picture  |  7.449592381873524
compared  |  7.88103297638

In [3]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.11.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp312-cp312-win_amd64.whl (98 kB)
   ---------------------------------------- 0.0/98.8 kB ? eta -:--:--
   ----------------------------- ---------- 71.7/98.8 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 98.8/98.8 kB 1.4 MB/s eta 0:00:00
Downloading rapidfuzz-3.11.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.6 MB 6.5 MB/s eta 0:00:01
   -------------------- -----------------

In [4]:
pip install --upgrade scipy gensim

Collecting scipy
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     -------------------------------- ----- 51.2/60.8 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 459.4 kB/s eta 0:00:00
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB 3.8 MB/s eta 0:00:07
    --------------------------------------- 0.3/24.0 MB 3.8 MB/s eta 0:00:07
    --------------------------------------- 0.5/24.0 MB 4.3 MB/s eta 0:00:06
   - -------------------------------------- 0.7/24.0 MB 4.2 MB/s eta 0:00:06
   - -------------------------------------- 0.9/24.0 MB 4.3 MB/s eta 0:00:06
   

In [6]:
pip install --upgrade cython

Collecting cython
  Downloading Cython-3.0.11-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Downloading Cython-3.0.11-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB 330.3 kB/s eta 0:00:09
   -- ------------------------------------- 0.1/2.8 MB 1.1 MB/s eta 0:00:03
   ---- ----------------------------------- 0.3/2.8 MB 2.0 MB/s eta 0:00:02
   -------- ------------------------------- 0.6/2.8 MB 2.9 MB/s eta 0:00:01
   ------------ --------------------------- 0.8/2.8 MB 3.4 MB/s eta 0:00:01
   ------------- -------------------------- 1.0/2.8 MB 3.2 MB/s eta 0:00:01
   ----------------- ---------------------- 1.2/2.8 MB 3.3 MB/s eta 0:00:01
   --------------------- ------------------ 1.5/2.8 MB 3.7 MB/s eta 0:00:01
   -------------------------- ------------- 1.9/2.8 MB 4.1 MB/s eta 0:00:01
   ----------------------

In [4]:
import pandas as pd
import gensim

In [25]:
df = pd.read_csv("reviews_data.csv")
df.head()

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...


In [26]:
df["Review"][0]

'Amber and LaDonna at the Starbucks on Southwest Parkway are always so warm and welcoming. There is always a smile in their voice when they greet you at the drive-thru. And their customer service is always spot-on, they always get my order right and with a smile. I would actually give them more than 5 stars if they were available.'

In [27]:
df = df["Review"]

In [28]:
review_text = df.apply(gensim.utils.simple_preprocess)
review_text

0      [amber, and, ladonna, at, the, starbucks, on, ...
1      [at, the, starbucks, by, the, fire, station, o...
2      [just, wanted, to, go, out, of, my, way, to, r...
3      [me, and, my, friend, were, at, starbucks, and...
4      [on, this, kick, of, drinking, cups, of, warm,...
                             ...                        
845    [ordered, two, venti, frappacino, without, whi...
846                                   [no, review, text]
847    [demanded, tips, from, me, then, made, me, wai...
848                                   [no, review, text]
849                                   [no, review, text]
Name: Review, Length: 850, dtype: object

In [29]:
model = gensim.models.Word2Vec(
    window = 10, #how many works before or after the missing value 
    min_count= 2, #min 2 words in sentences
    workers=4  # how many cpu threads should be used in the system
)

# Model.epochs is an attribute that indicates how many times the training process will go through the entire dataset. The default value for epochs is usually set to 5 in Gensim's Word2Vec implementation.

In [44]:
model.build_vocab(review_text,progress_per=1000)

In [45]:
model.epochs

5

In [46]:
model.corpus_count

850

In [47]:
model.train(review_text,total_examples=model.corpus_count,epochs=model.epochs)

(241402, 343960)

In [50]:
model.wv.most_similar("expensive")

[('appreciate', 0.9994852542877197),
 ('buy', 0.999445378780365),
 ('station', 0.9994432330131531),
 ('as', 0.9994262456893921),
 ('need', 0.9994183778762817),
 ('seem', 0.9994158148765564),
 ('staff', 0.9994157552719116),
 ('product', 0.9994111657142639),
 ('closest', 0.9994096755981445),
 ('better', 0.9994089007377625)]

In [49]:
model.wv.similarity(w1 = "bad",w2 = "good")

0.9987057