### Label Encoder

In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

categories = ['teacher', 'nurse', 'police','doctor']
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(categories)
df = pd.DataFrame({'Meslek': categories, 'Etiket':encoded_labels})

df.head()

Unnamed: 0,Meslek,Etiket
0,teacher,3
1,nurse,1
2,police,2
3,doctor,0


### One-hot Encoder

In [38]:
from sklearn.preprocessing import OneHotEncoder

categories = ['teacher', 'nurse', 'police','doctor']
data = pd.DataFrame({'Meslek': categories})
encoder = OneHotEncoder(sparse_output= False,dtype =int)
encoded_data = encoder.fit_transform(data)
encoded_df   = pd.DataFrame(encoded_data, columns = categories)

encoded_df.head()

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,teacher,nurse,police,doctor
0,0,0,0,1
1,0,1,0,0
2,0,0,1,0
3,1,0,0,0


### TF-IDF

In [31]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

documents = ['Bu ilk belgedir',
            'İkinci belge budur',
            'Ve ücüncü belgemiz',
            'İlk belge hangisidir']

data = pd.DataFrame({'Text': documents})
vectorizer = TfidfVectorizer()
tfidf_vector = vectorizer.fit_transform(data['Text'])

tfidf_vector = pd.DataFrame(tfidf_vector.toarray(), columns= vectorizer.get_feature_names_out())

tfidf_vector.head()

import nltk

nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

from nltk.tokenize import word_tokenize

from sklearn.decomposition import NMF, PCA

from sklearn.manifold import TSNE

import spacy

text = "Doğal dil işleme bilgisayar bilimi altalanıdır" \
        "yapay zeka ve hasaplamalı bilmidir" \
        "Bilgisayar ve insan dili kesişimidir"

tokens = word_tokenize(text)

print(len(text),tokens[:50])

# TF-IDF
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform([text])
print('TF-IDF:')
print(tfidf_vec.get_feature_names_out()[:50])
print(X_tfidf.toarray()[0][:50])

116 ['Doğal', 'dil', 'işleme', 'bilgisayar', 'bilimi', 'altalanıdıryapay', 'zeka', 've', 'hasaplamalı', 'bilmidirBilgisayar', 've', 'insan', 'dili', 'kesişimidir']
TF-IDF:
['altalanıdıryapay' 'bilgisayar' 'bilimi' 'bilmidirbilgisayar' 'dil'
 'dili' 'doğal' 'hasaplamalı' 'insan' 'işleme' 'kesişimidir' 've' 'zeka']
[0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.5  0.25]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emirr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Bag of words

In [32]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

documents = ['Bu ilk belgedir',
            'İkinci belge budur',
            'Ve ücüncü belgemiz',
            'İlk belge hangisidir']

data = pd.DataFrame({'Text':documents})
vectorizer = CountVectorizer()
bow_vectors = vectorizer.fit_transform(data['Text'])

bow_df = pd.DataFrame(bow_vectors.toarray(), columns = vectorizer.get_feature_names_out())

bow_df.head()

Unnamed: 0,belge,belgedir,belgemiz,bu,budur,hangisidir,ilk,kinci,lk,ve,ücüncü
0,0,1,0,1,0,0,1,0,0,0,0
1,1,0,0,0,1,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,1,1
3,1,0,0,0,0,1,0,0,1,0,0


### Cbow

In [33]:
import pandas as pd
from gensim.models import Word2Vec

sentences = [["Ben", "severim", "elmaları"],
            ["Ben", "yerim", "meyve"],
            ['elmalar', "lezzetlidir"],
            ['meyveler','sağlar','vitamin']]

cbow = Word2Vec(sentences,min_count=1 , vector_size=300,sg=0)
vectors = cbow.wv

vector_df = pd.DataFrame(vectors.vectors, index = vectors.index_to_key)

vector_df.head(5)




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Ben,-0.000179,7.9e-05,0.001701,0.003003,-0.003101,-0.002372,0.002153,0.002991,-0.001672,-0.001254,...,-0.001503,0.001901,0.00306,-0.001367,0.002655,0.001792,0.00196,0.000171,0.002738,-0.00234
vitamin,-0.002748,0.0031,-6.6e-05,-0.000656,0.001535,-0.001365,0.000914,0.002313,0.002022,-0.002504,...,0.003024,0.002979,-0.002736,-0.001004,0.003296,0.001701,-0.000529,-0.002897,0.000987,-0.002225
sağlar,0.002711,-0.001486,-0.000356,0.000335,-6.4e-05,0.000383,0.002038,-7e-06,-0.001082,-0.000504,...,-0.001695,0.000377,0.000961,-0.000512,0.003311,0.002783,0.000805,0.002373,0.001964,-0.00186
meyveler,-0.001719,-0.002223,-0.002592,0.00277,-0.000661,-0.002285,-0.001385,0.001715,-0.000956,-0.00125,...,0.001591,-0.001087,-0.003089,0.001262,0.002387,-0.001878,-0.002622,-0.000991,-0.001644,-0.000772
lezzetlidir,-0.000648,-0.001756,0.003149,-0.0031,0.001501,0.001801,-0.00047,0.003002,0.003295,-0.001825,...,0.002365,0.000634,0.001733,0.002127,0.000637,-0.002043,-2e-06,0.002756,-0.002033,0.003146


### Skip gram

In [34]:
import pandas as pd
from gensim.models import Word2Vec

sentences = [["Ben", "severim", "elmaları"],
            ["Ben", "yerim", "meyve"],
            ['elmalar', "lezzetlidir"],
            ['meyveler','sağlar','vitamin']]

skip_gram = Word2Vec(sentences,min_count=1 , vector_size=300,sg=1)
vectors = skip_gram.wv

vector_df = pd.DataFrame(vectors.vectors, index = vectors.index_to_key)

vector_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Ben,-0.000179,7.9e-05,0.001701,0.003003,-0.003101,-0.002372,0.002153,0.002991,-0.001672,-0.001254,...,-0.001503,0.001901,0.00306,-0.001367,0.002655,0.001792,0.00196,0.000171,0.002738,-0.00234
vitamin,-0.002748,0.0031,-6.6e-05,-0.000656,0.001535,-0.001365,0.000914,0.002313,0.002022,-0.002504,...,0.003024,0.002979,-0.002736,-0.001004,0.003296,0.001701,-0.000529,-0.002897,0.000987,-0.002225
sağlar,0.002711,-0.001486,-0.000356,0.000335,-6.4e-05,0.000383,0.002038,-7e-06,-0.001082,-0.000504,...,-0.001695,0.000377,0.000961,-0.000512,0.003311,0.002783,0.000805,0.002373,0.001964,-0.00186
meyveler,-0.001719,-0.002223,-0.002592,0.00277,-0.000661,-0.002285,-0.001385,0.001715,-0.000956,-0.00125,...,0.001591,-0.001087,-0.003089,0.001262,0.002387,-0.001878,-0.002622,-0.000991,-0.001644,-0.000772
lezzetlidir,-0.000648,-0.001756,0.003149,-0.0031,0.001501,0.001801,-0.00047,0.003002,0.003295,-0.001825,...,0.002365,0.000634,0.001733,0.002127,0.000637,-0.002043,-2e-06,0.002756,-0.002033,0.003146


### N-gram

In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

documents = ["Bu ilk belgedir.",
             "İkinci belge budur.",
             "Ve üçüncü belgemiz.",
             "İlk belge hangisidir?"]

data = pd.DataFrame({'Text': documents})
ngram = CountVectorizer(ngram_range=(2,3))
ngram_vectors = ngram.fit_transform(data['Text'])
ngram_df = pd.DataFrame(ngram_vectors.toarray(), columns=ngram.get_feature_names_out())

ngram_df.head()

Unnamed: 0,belge budur,belge hangisidir,bu ilk,bu ilk belgedir,ilk belgedir,kinci belge,kinci belge budur,lk belge,lk belge hangisidir,ve üçüncü,ve üçüncü belgemiz,üçüncü belgemiz
0,0,0,1,1,1,0,0,0,0,0,0,0
1,1,0,0,0,0,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,1,1
3,0,1,0,0,0,0,0,1,1,0,0,0
