In [None]:
'''
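Sentiment analysis of Turkish TV-series tweets: loading the labelled data, cleaning and
lemmatizing the text, exploring it with plots, and preparing a train/test split.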
'''

In [None]:
%reload_ext autoreload
%autoreload 2

## Importing Dataset and Libraries

In [None]:
'''
Libraries used by this notebook. Imports marked `req.` are essential; some of the others may be unused.
'''

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import matplotlib
import pandas as pd #req.
import numpy as np
import nltk
import sys
import xlsxwriter
import re
import string
from sqlalchemy import create_engine
import time
import logging
import string
import emoji
import seaborn as sns
from collections import Counter, defaultdict
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords, wordnet # req.
from sklearn import model_selection, naive_bayes, svm
from sklearn.decomposition import LatentDirichletAllocation, NMF
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

from zemberek import ( #req.
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)

import zeyrek

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import confusion_matrix, classification_report
from simpletransformers.classification import ClassificationModel
from sklearn.naive_bayes import BernoulliNB


pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
'''# Credentials to database connection
hostname="***"
dbname="***"
uname="***"
pwd="***"

# Create SQLAlchemy engine to connect to MySQL Database
engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}".format(host=hostname, db=dbname, user=uname, pw=pwd))

q="SELECT * FROM alltwitter"

training=pd.read_sql_query(q, engine)

df=df[['Tweet ID','Tweet','Time']]

'''

In [None]:
# Load the full labelled dataset (semicolon-separated, UTF-8) for visualization & exploration.
data=pd.read_csv('tvseries_full.csv', delimiter=";", encoding='utf-8')
# Separate train/test files, currently disabled:
#trainv = pd.read_csv('training_set.csv', delimiter=";", encoding='utf-8')
#testv = pd.read_csv('test_set.csv', delimiter=";", encoding='utf-8')

In [None]:
display(data.sample(20))

In [None]:
data.Lab.value_counts()

In [None]:
# Drop the rows labelled 'Junk'. The explicit .copy() makes `df` an independent frame, so the
# column assignments in the following cells modify `df` itself and not a slice of `data`.
df = data[data.Lab != 'Junk'].copy()

In [None]:
df.Lab.value_counts()

## Data Preprocessing

##### Removing Mentions, Punctuation, HTML, Hyperlinks and Hashtags

In [None]:
# Strip @mentions. Handles may contain underscores, so '_' belongs to the character class;
# without it '@user_name' would leave '_name' behind. One pass is enough: the greedy match
# consumes the whole handle, so a second identical replace can never find anything new.
df['Content'] = df['Content'].replace(r'@[A-Za-z0-9_]+', '', regex=True)
display(df.head(20))

In [None]:
# Remove hyperlinks: first everything that starts with "http", then bare "www" addresses.
without_http = df['Content'].replace(r'http\S+', '', regex=True)
df['Content'] = without_http.replace(r'www\S+', '', regex=True)
display(df.head(20))

In [None]:
# Strip #hashtags made of ASCII/Turkish letters, digits and underscores ('_' is included so
# that '#dizi_adi' does not leave '_adi' behind). Hyperlinks are already removed by the
# previous cell, so that step is not repeated here, and a single greedy pass removes every
# hashtag.
df['Content'] = df['Content'].replace(r'#[A-Za-z0-9_ğüşöçıİĞÜŞÖÇ]+', '', regex=True)

display(df.head(20))

In [None]:
# Drop every character that is not a word character, whitespace or one of # @ / : % . , _ -
# (e.g. emoji and other symbols). The pattern is a raw string, and regex=True is passed
# explicitly because recent pandas versions treat the `str.replace` pattern as a literal
# string by default, which would turn this line into a no-op.
df['Content'] = df['Content'].str.replace(r'[^\w\s#@/:%.,_-]', '', flags=re.UNICODE, regex=True)

In [None]:
df.head(20)

In [None]:
# Collapse runs of the same letter or digit into a single character ("çoook" -> "çok").
# NOTE(review): this also shortens legitimate double letters (e.g. "saat" -> "sat") — confirm
# that this is acceptable for the downstream lemmatizer.
# The vectorized form leaves missing values untouched instead of failing inside re.sub.
df['Content'] = df['Content'].str.replace(
    r'([A-Za-z0-9ğüşöçıİĞÜŞÖÇ])\1+', r'\1', regex=True)

# Remove all remaining punctuation; regex=True is required for the character class to be
# interpreted as a pattern rather than as literal text.
df['Content'] = df['Content'].str.replace(r'[^\w\s]', '', regex=True)

df.head(20)

#### Tokenization

In [None]:
# tokenization

# Module-level logger; a later cell uses it to report how long normalization took.
logger = logging.getLogger(__name__)

# Zemberek components: the default morphology is handed to the sentence normalizer.
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
# NOTE(review): `extractor` is not referenced anywhere in the visible part of the notebook.
extractor = TurkishSentenceExtractor()

In [None]:
# Split every tweet on whitespace and delete ASCII punctuation from each resulting token.
# A token that consisted only of punctuation becomes an empty string.
table = str.maketrans('', '', string.punctuation)
df['tokenized'] = [
    [piece.translate(table) for piece in text.split()]
    for text in df['Content']
]
df.head()

##### Converting to Lowercase

In [None]:
def turkish_lower(text):
    """Lower-case `text` following Turkish casing rules.

    Python's `str.lower()` is locale-independent: it maps 'I' to 'i' and 'İ' to 'i' followed
    by a combining dot, both of which are wrong for Turkish. The two letters are therefore
    mapped explicitly ('İ' -> 'i', 'I' -> 'ı') before the generic lower-casing.
    """
    return text.replace('İ', 'i').replace('I', 'ı').lower()


df['lower'] = df['tokenized'].apply(
    lambda tokens: [turkish_lower(word) for word in tokens])

df.head()

In [None]:
#stopwords

# Turkish stop-word list from NLTK, stored as a set for the membership tests in the next cell.
stops = set(stopwords.words('turkish'))
print(stops)

In [None]:
def _drop_stopwords(tokens):
    """Return the tokens of one tweet that are not in the `stops` set."""
    return [tok for tok in tokens if tok not in stops]


df['stopwords_removed'] = df['lower'].apply(_drop_stopwords)

df.head()

In [None]:
# Applying part of speech tags.
# NOTE(review): nltk.tag.pos_tag is called without a `lang` argument, so NLTK's default
# tagger is applied to Turkish tokens — confirm that the resulting tags are meaningful.

df['pos_tags'] = df['stopwords_removed'].apply(nltk.tag.pos_tag)

df.head()

In [None]:
# Converting part of speeches to wordnet format.

def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN


def _to_wordnet_pairs(tagged_tokens):
    """Pair every word of one tweet with the WordNet POS derived from its tag."""
    return [(word, get_wordnet_pos(tag)) for word, tag in tagged_tokens]


df['wordnet_pos'] = df['pos_tags'].apply(_to_wordnet_pairs)

df.head()

In [None]:
df['wordnet_pos'].values

In [None]:
#  normalization

def normalize_long_text(text):
    """Normalize each item of `text` with the module-level `normalizer`.

    `text` is iterated item by item (the notebook passes a list of tokens); the normalized
    items are joined with single spaces and returned as one string.
    """
    return " ".join(normalizer.normalize(item) for item in text)

In [None]:
# Normalize every tokenized tweet; tokens that were emptied by the punctuation stripping are
# dropped before the tweet is handed to the normalizer.
sentences = df['tokenized'].copy()
start = time.time()

new_sent = [
    normalize_long_text([tok for tok in tokens if tok != ''])
    for tokens in sentences
]

logger.info(f"Sentences normalized in: {time.time() - start} s")

In [None]:
# Split the normalized sentences back into words and strip straight and curly quote marks.
quote_table = str.maketrans('', '', '"’\'”')
splitted_words = [
    [word.translate(quote_table) for word in sent.split()]
    for sent in new_sent
]


# Zeyrek for lemmatization

analyzer = zeyrek.MorphAnalyzer()


def _first_lemma(word):
    """Return the first lemma Zeyrek reports for `word`, lower-cased with Turkish rules.

    Falls back to `word` itself when the analyzer returns no result or no lemma, instead of
    raising IndexError.
    """
    analyses = analyzer.lemmatize(word)
    lemma = analyses[0][1][0] if analyses and analyses[0][1] else word
    # str.lower() turns 'İ' into 'i' plus a combining dot and 'I' into 'i'; map both
    # explicitly so the lemmas follow Turkish casing.
    return lemma.replace('İ', 'i').replace('I', 'ı').lower()


# One lemma list per tweet; tokens emptied by the quote stripping are skipped.
lem_sent = [
    [_first_lemma(word) for word in sent if word != '']
    for sent in splitted_words
]

# Every tweet keeps its (possibly empty) lemma list so that `lem_sent` stays aligned with the
# rows of `df`. The former `filter(('').__ne__, lem_sent)` compared '' with lists, so it never
# removed anything and only worked through the truthiness of NotImplemented.
df['lemmatized'] = lem_sent

In [None]:
# Join the lemmas of each tweet into one space-separated string.
df['lemma_str'] = [
    ' '.join(str(lemma) for lemma in lemmas)
    for lemmas in df['lemmatized']
]

In [None]:
df.head()

In [None]:
df['Lab'].value_counts()

In [None]:
# Displaying target distribution.

# value_counts() orders the classes by frequency, so the pie labels are taken from its index
# instead of a hard-coded list that could attach the wrong name to a slice.
label_counts = df['Lab'].value_counts()

fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(18, 6), dpi=100)
# The column is passed by keyword; a bare positional Series is not portable across seaborn
# versions.
sns.countplot(x=df['Lab'], ax=axes[0])
axes[1].pie(label_counts,
            labels=label_counts.index,
            autopct='%1.2f%%',
            shadow=True,
            # Pull out only the first (largest) slice, whatever the number of classes.
            explode=[0.05 if pos == 0 else 0 for pos in range(len(label_counts))],
            startangle=60)
fig.suptitle('Distribution of the Tweets', fontsize=24)
plt.show()

In [None]:
# Length of each cleaned tweet in characters (values are converted with str() first).
df['Character Count'] = df['Content'].map(lambda text: len(str(text)))


def plot_dist3(df, feature, title):
    """Draw a histogram, an empirical CDF and a box plot of one column of `df`.

    Parameters
    ----------
    df : DataFrame that holds the column.
    feature : name of the column to plot.
    title : title placed above the whole figure.
    """
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept so the
    # figures look unchanged — confirm against the installed seaborn version.
    fig = plt.figure(constrained_layout=True, figsize=(18, 8))
    # 3x3 grid: the histogram and the ECDF use rows 0 and 1 of the two left columns, the box
    # plot spans the full height of the right column.
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)
    values = df.loc[:, feature]

    # Histogram with a KDE overlay.
    ax1 = fig.add_subplot(grid[0, :2])
    ax1.set_title('Histogram')
    sns.distplot(values,
                 hist=True,
                 kde=True,
                 ax=ax1,
                 color='#e74c3c')
    ax1.set(ylabel='Frequency')
    ax1.xaxis.set_major_locator(MaxNLocator(nbins=20))

    # Empirical CDF: the same plot with cumulative histogram and KDE.
    ax2 = fig.add_subplot(grid[1, :2])
    ax2.set_title('Empirical CDF')
    sns.distplot(values,
                 ax=ax2,
                 kde_kws={'cumulative': True},
                 hist_kws={'cumulative': True},
                 color='#e74c3c')
    ax2.xaxis.set_major_locator(MaxNLocator(nbins=20))
    ax2.set(ylabel='Cumulative Probability')

    # Box plot. The column is passed as `y` so that the box really is vertical, in line with
    # orient='v' and with the y-axis locator configured below; passing it as `x` contradicts
    # the requested orientation.
    ax3 = fig.add_subplot(grid[:, 2])
    ax3.set_title('Box Plot')
    sns.boxplot(y=feature, data=df, orient='v', ax=ax3, color='#e74c3c')
    ax3.yaxis.set_major_locator(MaxNLocator(nbins=25))

    plt.suptitle(f'{title}', fontsize=24)

In [None]:
def plot_word_number_histogram(textneg, textpos, textneu,textneune):
    """Compare the words-per-tweet distribution of the four sentiment classes.

    Each argument is a Series of tweet texts for one class (negative, positive, neutral,
    neutral-negative). One panel per class is drawn side by side with a shared y axis; the
    word count of a tweet is the number of whitespace-separated pieces.
    """
    panels = (
        (textneg, 'Negative Tweets'),
        (textpos, 'Positive Tweets'),
        (textneu, 'Neutral Tweets'),
        # This panel previously re-plotted `textneu`, so the neutral-negative class was
        # never shown.
        (textneune, 'Neutral-Negative Tweets'),
    )

    fig, axes = plt.subplots(ncols=len(panels), nrows=1, figsize=(18, 6), sharey=True)
    for ax, (text, panel_title) in zip(axes, panels):
        sns.distplot(text.str.split().map(len), ax=ax, color='#e74c3c')
        ax.set_xlabel('Word Count')
        ax.set_title(panel_title)
    # The y axis is shared, so only the left-most panel carries its label.
    axes[0].set_ylabel('Frequency')

    fig.suptitle('Words Per Tweet', fontsize=24, va='baseline')

    fig.tight_layout()

In [None]:
# Cleaned tweet text per sentiment class, in the order the plotting function expects.
content_by_label = {
    label: df[df['Lab'] == label]['Content']
    for label in ('Negative', 'Positive', 'Neutral', 'Neutral-Negative')
}
plot_word_number_histogram(content_by_label['Negative'],
                           content_by_label['Positive'],
                           content_by_label['Neutral'],
                           content_by_label['Neutral-Negative'])

In [None]:
from sklearn import preprocessing

# Encode the sentiment labels as integers. The encoder is fitted on `df` (the rows left after
# dropping 'Junk') rather than on `data`: fitting on the unfiltered labels reserves an id for
# a class that never occurs in `df`, so the ids used here would not be the contiguous range
# 0..n_classes-1 and `le.classes_` would carry an unused 'Junk' entry.
le = preprocessing.LabelEncoder()
df['categorical_label'] = le.fit_transform(df.Lab)

display(df.sample(20))

In [None]:
# Features: the space-joined lemmas of each tweet.
X=df['lemma_str']

# Target: the integer-encoded sentiment label.
y=df['categorical_label']

In [None]:
# Hold out 20% of the tweets for testing. The split is stratified on the label and uses a
# fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                        test_size = 0.2, random_state=24,stratify=y)

In [None]:
class TextBlobSentiment(Base):
    """Predict fine-grained sentiment classes using TextBlob.

    NOTE(review): `Base` (and the `read_data` method used below) is not defined in the visible
    part of the notebook — confirm that it is defined or imported before this cell runs.
    """

    # Fixed, equal-width bin edges for the polarity score.
    # Assumes TextBlob polarity lies in [-1, 1] — TODO confirm against the TextBlob docs.
    POLARITY_BINS = [-1.0, -0.6, -0.2, 0.2, 0.6, 1.0]

    def __init__(self, model_file: str=None) -> None:
        # `model_file` is accepted but not used by this class.
        super().__init__()

    def score(self, text: str) -> float:
        """Return the TextBlob polarity of `text`."""
        # pip install textblob
        # Imported lazily so the dependency is only needed when this model is used.
        from textblob import TextBlob
        return TextBlob(text).sentiment.polarity

    def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:
        """Score every text in `test_file` and map the scores onto the classes 1..5.

        `train_file` is ignored. The data is loaded with `self.read_data(test_file,
        lower_case)`, which presumably returns a DataFrame with a 'text' column — verify
        against `Base`. The returned frame gains a 'pred' column; the temporary 'score'
        column is dropped again.
        """
        df = self.read_data(test_file, lower_case)
        df['score'] = df['text'].apply(self.score)
        # Convert the float score to a category with fixed bin edges. With `bins=5` pandas
        # derives the edges from the minimum and maximum score of the current data, so the
        # class of a text would depend on which other texts are in the file.
        df['pred'] = pd.cut(df['score'],
                            bins=self.POLARITY_BINS,
                            labels=[1, 2, 3, 4, 5],
                            include_lowest=True)
        df = df.drop('score', axis=1)
        return df
