# Sentiment Analizi ve Sınıflandırma Modelleri

In [None]:
!pip install keras

In [None]:
!pip install tensorflow

In [None]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn.preprocessing import LabelEncoder

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
import pandas as pd 
data = pd.read_csv("train.tsv",sep = "\t")

In [None]:
data.head()

In [None]:
data["Sentiment"].replace([0,1], value = "negatif", inplace = True)

In [None]:
data["Sentiment"].replace([3,4], value = "pozitif", inplace = True)

In [None]:
data.head()

In [None]:
data = data[(data.Sentiment == "negatif") | (data.Sentiment == "pozitif")]

In [None]:
data.head()

In [None]:
data.groupby("Sentiment").count()

In [None]:
df = pd.DataFrame()
df["text"] = data["Phrase"]
df["label"] = data["Sentiment"]

In [None]:
df.head()

## Metin Ön İşleme

In [None]:
#buyuk-kucuk donusumu
df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
#noktalama işaretleri
df['text'] = df['text'].str.replace('[^\w\s]','')
#sayılar
df['text'] = df['text'].str.replace('\d','')
#stopwords
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
#seyreklerin silinmesi
sil = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:]
df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
#lemmi
from textblob import Word
#nltk.download('wordnet')
df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

## Değişken Mühendisliği

* Count Vectors
* TF-IDF Vectors (words, characters, n-grams)
* Word Embeddings

TF(t) = (Bir t teriminin bir dökümanda gözlenme frekansı) / (dökümandaki toplam terim sayısı)

IDF(t) = log_e(Toplam döküman sayısı / içinde t terimi olan belge sayısı)


In [None]:
df.head()

In [None]:
df.iloc[0]

## Test-Train

In [None]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["text"],
                                                                   df["label"],
                                                                    random_state = 1)

In [None]:
encoder=LabelEncoder()
train_y=encoder.fit_transform(train_y)
test_y=encoder.fit_transform(test_y)

In [None]:
train_y[0:5]

In [None]:
test_y[0:5]

## Count Vectors

In [None]:
vectorizer=CountVectorizer()
vectorizer.fit(train_x)


In [None]:
x_train_count=vectorizer.transform(train_x)
x_test_count=vectorizer.transform(test_x)

In [None]:
x_train_count.head()

In [None]:
vectorizer.get_feature_names()[0:5]

In [None]:
x_train_count.toarray()

### TF-IDF

#### Word Level

In [None]:
tf_idf_word_vectorizer=TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

In [None]:
x_train_tf_idf_word=tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word=tf_idf_word_vectorizer.transform(test_x)

In [None]:
x_train_count.toarray()

In [None]:
x_train_tf_idf_word.toarray()

### Ngram level tf-idf

In [None]:
tf_idf_word_vectorizer=TfidfVectorizer(ngram_range=(2,3))
tf_idf_word_vectorizer.fit(train_x)

In [None]:
x_train_tf_idf_ngram=tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_ngram=tf_idf_word_vectorizer.transform(test_x)

#### Characters Level tf-idf

In [None]:
tf_idf_chars_vectorizer=TfidfVectorizer(analyzer="char",ngram_range=(2,3))
tf_idf_chars_vectorizer.fit(train_x)

In [None]:
x_train_tf_idf_chars=tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars=tf_idf_chars_vectorizer.transform(test_x)

## Makine öğrenmesi ile Sentiment Sınıflandırılması

## Lojistik Regresyon

In [54]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                           x_test_count,
                                           test_y,
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.8368200836820083


In [55]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.8331589958158995


In [56]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.748326359832636


In [57]:
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(loj_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.7811715481171548


## Naive Bayes

In [58]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_count,
                                           test_y,
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.8332112970711296


In [59]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.835041841004184


In [60]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.7685146443514643


In [61]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(nb_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.7557008368200837


## Random Forests

In [62]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_count,
                                           test_y,
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.8232740585774059


In [63]:
rf = ensemble.RandomForestClassifier()
rf_model = rf.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.8234832635983264


In [64]:
rf = ensemble.RandomForestClassifier()
rf_model = loj.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.748326359832636


In [65]:
rf = ensemble.RandomForestClassifier()
rf_model = loj.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.7811715481171548


## XGBoost

In [66]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_count,
                                           test_y,
                                           cv = 10).mean()

print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.7153242677824267


In [67]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_tf_idf_word,
                                           test_y,
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.7080020920502091


In [68]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_ngram,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_tf_idf_ngram,
                                           test_y,
                                           cv = 10).mean()

print("N-GRAM TF-IDF Doğruluk Oranı:", accuracy)

N-GRAM TF-IDF Doğruluk Oranı: 0.5827928870292888


In [69]:
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_tf_idf_chars,train_y)
accuracy = model_selection.cross_val_score(xgb_model,
                                           x_test_tf_idf_chars,
                                           test_y,
                                           cv = 10).mean()

print("CHARLEVEL Doğruluk Oranı:", accuracy)

CHARLEVEL Doğruluk Oranı: 0.7783472803347281


In [70]:
loj_model

LogisticRegression()

In [71]:
loj_model.predict("yes ı love it")

ValueError: Expected 2D array, got scalar array instead:
array=yes ı love it.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [73]:
yeni_yorum=pd.Series("This film is very nice and good ı like it")

yeni_yorum=pd.Series("no not good look at that shit very bad")

In [78]:
v=CountVectorizer()
v.fit(train_x)
yeni_yorum=v.transform(yeni_yorum)
loj_model.predict(yeni_yorum)

AttributeError: lower not found