## Pandas

In [1]:
import pandas as pd

## Stopwords

In [2]:
!pip install nltk



In [3]:
import nltk

In [4]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords

## Tokenization

In [6]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
!pip install textblob



In [8]:
import textblob

In [9]:
from textblob import TextBlob

## Stemming

In [10]:
from nltk.stem import PorterStemmer
st = PorterStemmer()

## Lemmatization

In [11]:
from textblob import Word
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# NLP Uygulamaları

## Part of speech tagging (POS)

In [12]:
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Named Entity Recognition

In [13]:
from nltk import word_tokenize, pos_tag, ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\berki\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

# Sentiment Analizi ve Sınıflandırma Modelleri

In [14]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers


from warnings import filterwarnings
filterwarnings('ignore')

In [15]:
import pandas as pd 
data = pd.read_csv("train.tsv",sep = "\t")

In [16]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [17]:
data["Sentiment"].replace(0, value = "negatif", inplace = True)
data["Sentiment"].replace(1, value = "negatif", inplace = True)

In [18]:
data["Sentiment"].replace(3, value = "pozitif", inplace = True)
data["Sentiment"].replace(4, value = "pozitif", inplace = True)

In [19]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negatif
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [20]:
data = data[(data.Sentiment == "negatif") | (data.Sentiment == "pozitif")]

In [21]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,negatif
21,22,1,good for the goose,pozitif
22,23,1,good,pozitif
33,34,1,"the gander , some of which occasionally amuses...",negatif
46,47,1,amuses,pozitif


In [22]:
data.groupby("Sentiment").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negatif,34345,34345,34345
pozitif,42133,42133,42133


In [23]:
df = pd.DataFrame()
df["text"] = data["Phrase"]
df["label"] = data["Sentiment"]

In [24]:
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,negatif
21,good for the goose,pozitif
22,good,pozitif
33,"the gander , some of which occasionally amuses...",negatif
46,amuses,pozitif


## Metin Ön İşleme

In [25]:
#buyuk-kucuk donusumu
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
#noktalama işaretleri
df['text'] = df['text'].str.replace('[^\w\s]','')
#sayılar
df['text'] = df['text'].str.replace('\d','')
#stopwords
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
#seyreklerin silinmesi
sil = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
#lemmi
from textblob import Word
#nltk.download('wordnet')
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

## Değişken Mühendisliği

* Count Vectors
* TF-IDF Vectors (words, characters, n-grams)
* Word Embeddings

TF(t) = (Bir t teriminin bir dökümanda gözlenme frekansı) / (dökümandaki toplam terim sayısı) 

IDF(t) = log_e(Toplam döküman sayısı / içinde t terimi olan belge sayısı)


In [26]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,negatif
21,good goose,pozitif
22,good,pozitif
33,gander occasionally amuses none amount much story,negatif
46,amuses,pozitif


In [27]:
df.iloc[0]

text     series demonstrating adage good goose also goo...
label                                              negatif
Name: 0, dtype: object

## Test-Train

In [28]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["text"],
                                                                   df["label"], 
                                                                    random_state = 1)

In [29]:
train_y[0:5]

118788    pozitif
89514     negatif
86857     pozitif
140626    negatif
153243    pozitif
Name: label, dtype: object

In [30]:
encoder = preprocessing.LabelEncoder()

In [31]:
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [32]:
train_y[0:5]

array([1, 0, 1, 0, 1])

In [33]:
test_y[0:5]

array([1, 0, 1, 0, 0])

### Count Vectors

In [34]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)

In [35]:
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)

In [36]:
#x_train_count.head()

In [37]:
#feature_names = vectorizer.get_feature_names()
#print(feature_names[:5])

In [38]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### TF-IDF

In [39]:
#wordlevel

In [40]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

In [41]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

In [42]:
#tf_idf_word_vectorizer.get_feature_names()[0:5]

In [43]:
x_train_tf_idf_word.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
# ngram level tf-idf

In [45]:
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range = (2,3))
tf_idf_ngram_vectorizer.fit(train_x)

In [46]:
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

In [47]:
# characters level tf-idf

In [48]:
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer = "char", ngram_range = (2,3))
tf_idf_chars_vectorizer.fit(train_x)

In [49]:
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

# Makine Öğrenmesi ile Sentiment Sınıflandırması

## Lojistik Regresyon

In [50]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_count, 
                                           test_y, 
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.8368200836820083


In [51]:
loj_model

In [52]:
#loj_model.predict("yes i like this film")

In [57]:
positive_new_comment = pd.Series("this film is very nice and good i like it")

negative_new_comment = pd.Series("no not good look at that shit very bad")

In [58]:
v = CountVectorizer()
v.fit(train_x)
positive_new_comment = v.transform(positive_new_comment)
negative_new_comment = v.transform(negative_new_comment)

In [59]:
loj_model.predict(positive_new_comment)

array([1])

In [60]:
loj_model.predict(negative_new_comment)

array([0])

## User App

In [64]:
#You can give a comment and test yourself.

In [None]:
new_comment = input("Please enter your comment.")
new_comment = pd.Series(comment)
new_comment = v.transform(new_comment)
loj_model.predict(new_comment)