In [1]:

## credit to PradipNichite https://github.com/PradipNichite/Youtube-Tutorials/blob/main/Youtube_Text_Classification_using_sentence_embedding_.ipynb
import pandas as pd
import numpy as np
np.random.seed(2024)
     

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
     


  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [39]:
data = pd.read_csv('bigger_ads_dataset.csv')
data.dropna(subset=['sentences'], inplace=True)


In [3]:
import spacy
import string
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

In [40]:
cols = list(data.columns.values)
print(cols)
print(data['sentences'])

['sentences', 'ads']
0        You'll never guess what we're doing to the Bac...
1                        We're putting it on the Baconator
2                               Katherine, you seeing this
3                                      Postures everywhere
4                                                     What
                               ...                        
18166    All the other kids are at something called the...
18167                                       I'm sorry gang
18168                                            I blew up
18169              I hate it when grown-ups call kids gang
18170                            Don't worry about it, Mom
Name: sentences, Length: 18165, dtype: object


In [28]:
def spacy_tokenizer(sentence):
    if pd.isna(sentence):
        return
    else:
        doc = nlp(sentence)
        mytokens = [ word.lemma_.lower().strip() for word in doc ] 
        mytokens = [ word for word in mytokens if word not in punctuations ] 
        sentence = " ".join(mytokens) # return preprocessed list of tokens return sentence
        return sentence

In [41]:
data['tokenize'] = data['sentences'].apply(spacy_tokenizer)

In [42]:
print(len(data['tokenize']))

18165


In [43]:
data['embeddings'] = data['tokenize'].apply(model.encode)

In [44]:
print(data['embeddings'])

0        [-0.043664698, 0.014645079, 0.060612295, -0.00...
1        [-0.06298281, 0.03659228, 0.013885124, -0.0020...
2        [-0.0006743414, -0.012383045, 0.05902087, 0.02...
3        [0.006550763, 0.023429932, -0.004073272, -0.02...
4        [-0.105645, 0.0765896, -0.05577093, -0.0026801...
                               ...                        
18166    [0.003767649, 0.026861528, -0.015800757, -0.10...
18167    [-0.024565972, 0.026038438, 0.0015804166, -0.0...
18168    [-0.034687307, -0.0020050304, 0.005607404, -0....
18169    [-0.06114863, 0.029292548, -0.030165982, -0.04...
18170    [0.03657346, 0.020573333, 0.013142387, -0.0135...
Name: embeddings, Length: 18165, dtype: object


In [45]:
X = data['embeddings'].to_list() 
Y = data['ads'].to_list()

In [11]:
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,stratify=Y)

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm.fit(X_train, Y_train)
LR = LogisticRegression()
LR.fit(X_train,Y_train)
gnb = GaussianNB()
gnb.fit(X_train,Y_train)

In [56]:
from sklearn import metrics
predicted = LR.predict(X_test)
gnb_predicted = gnb.predict(X_test)
svm_predicted = svm.predict(X_test)
print(len(predicted))
print(len(gnb_predicted))
print(len(svm_predicted))
print("Logistic Regression Accuracy:",metrics.accuracy_score(Y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(Y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(Y_test, predicted))
print("Naive Bayes Accuracy:",metrics.accuracy_score(Y_test, gnb_predicted))
print("Naive Bayes Precision:",metrics.precision_score(Y_test, gnb_predicted))
print("Naive Bayes Recall:",metrics.recall_score(Y_test, gnb_predicted))
print("Support Vector Machine Accuracy:",metrics.accuracy_score(Y_test, svm_predicted))
print("Support Vector Machine Precision:",metrics.precision_score(Y_test, svm_predicted))
print("Support Vector Machine Recall:",metrics.recall_score(Y_test, svm_predicted))

3633
3633
3633
Logistic Regression Accuracy: 0.8260390861546931
Logistic Regression Precision: 0.7728937728937729
Logistic Regression Recall: 0.45376344086021503
Naive Bayes Accuracy: 0.7475915221579962
Naive Bayes Precision: 0.5058717253839206
Naive Bayes Recall: 0.6021505376344086
Support Vector Machine Accuracy: 0.8786127167630058
Support Vector Machine Precision: 0.9054726368159204
Support Vector Machine Recall: 0.5870967741935483


In [57]:
from joblib import dump, load

In [58]:
dump(svm, 'SVM_big_dataset.joblib')

['SVM_big_dataset.joblib']

In [177]:
Z = 'I like pineapple'

In [178]:
Z_t = spacy_tokenizer(Z)

In [179]:
print(Z_t)

i like pineapple


In [180]:
Z_e = model.encode(Z_t)

In [160]:
print(Z_e)

[ 6.04108199e-02  7.26522952e-02 -1.13935418e-01 -5.11645805e-03
 -5.16066793e-03  6.46596998e-02  9.37615559e-02  3.18828896e-02
  2.10768376e-02  4.61650640e-02  2.23534759e-02 -1.47739174e-02
 -2.38701422e-02  4.21287753e-02  7.44694918e-02  5.24458196e-03
 -3.75998579e-02  7.47641325e-02 -5.58120497e-02 -5.32653853e-02
 -2.39933878e-02  2.09420342e-02  9.83796641e-03 -1.12856831e-02
  5.99561222e-02 -5.51436096e-02  3.39362733e-02 -1.43808043e-02
  2.98099238e-02 -7.20173717e-02  6.50294572e-02 -9.37862322e-02
  5.47353290e-02  3.10633201e-02 -1.09633781e-01  1.16090458e-02
  1.80180501e-02  2.80515105e-02  2.77114868e-05  1.37383882e-02
  5.15031554e-02 -1.15676232e-01 -5.82098588e-02  1.48304421e-02
 -1.92370657e-02 -1.18252756e-02 -8.97149593e-02 -1.94422193e-02
  4.98235151e-02  5.58403321e-02 -6.50079921e-02  6.78175390e-02
 -5.20174131e-02  3.69712971e-02  5.95720932e-02  8.02378133e-02
  1.22813936e-02  4.32213135e-02  5.85606843e-02 -8.45031738e-02
 -1.24439248e-03 -3.24971

In [67]:
print(X_test[0])

[-5.07641174e-02 -2.34419778e-02 -6.27336502e-02  6.63300753e-02
 -8.74017328e-02  2.07629558e-02  1.25100568e-01 -2.24401075e-02
 -5.38121723e-03  1.22526137e-03  2.17367485e-02 -1.02170497e-01
  3.03389337e-02 -8.33508819e-02  4.19152156e-02 -4.08439711e-02
  7.51090571e-02 -4.84239422e-02 -4.35671099e-02 -2.99307704e-02
 -1.18160278e-01 -6.63108975e-02 -5.00919372e-02  9.16730799e-03
 -7.40199210e-03  5.05779870e-03 -4.00118355e-04  4.60644886e-02
 -6.96590170e-02 -4.16165814e-02  6.35357923e-04 -1.29817901e-02
  1.40212232e-03 -4.23484202e-03  3.39565463e-02  2.59733852e-02
 -4.44669127e-02  6.21782988e-03 -5.46718985e-02  3.26824412e-02
 -4.54000980e-02 -7.00153485e-02  5.37706874e-02  5.23056611e-02
 -3.17069478e-02  1.20585278e-01 -8.25774111e-03  2.90915500e-02
  4.47845012e-02 -5.32956682e-02 -1.00350389e-02 -4.47632894e-02
  8.88143405e-02  1.16144782e-02  6.93624886e-03  8.08993354e-02
 -1.63557064e-02  2.03879010e-02  2.57349033e-02 -5.88994361e-02
  1.66438869e-03 -2.68531

In [1]:
X_test_reshaped = Z_e.reshape(1, -1)  # or reshape(-1, 1) if you want a column vector

# Predict the class label for the reshaped input
predicted_label = LR.predict(X_test_reshaped)
print(predicted_label)

NameError: name 'Z_e' is not defined