# python libraries

In [None]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas
import seaborn as sns 
from sklearn import preprocessing
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter 
import re
import string
import matplotlib.cm as cm
from matplotlib import rcParams
from prettytable import PrettyTable
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression

# Arabic_poem_positive dataset 

In [None]:
# Column names applied to the header-less, tab-separated file.
cols = ['class', 'poem_style']
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are reported and
# skipped instead of aborting the load.
# NOTE(review): the cell that loads `negative` reads this same file path --
# confirm the two frames are really meant to come from one file.
positive = pd.read_csv(
    'BookOpinionsDS_clean.csv',
    sep='\t',
    on_bad_lines='warn',
    header=None,
    names=cols,
)

# showing top 5 records

In [None]:
def swap_columns(positive, c1, c2):
    """Swap the contents of columns ``c1`` and ``c2`` of a DataFrame in place.

    Args:
        positive: DataFrame to modify; it is mutated, not copied.
        c1: Label of the first column.
        c2: Label of the second column.

    Returns:
        None. The column order and every other column are left untouched.
    """
    # Hold the first column in a local copy rather than staging it in a
    # scratch 'temp' column: a frame that already has a real 'temp' column
    # would otherwise have it overwritten and then dropped.
    first = positive[c1].copy()
    positive[c1] = positive[c2]
    positive[c2] = first

In [None]:
# Exchange the contents of the 'class' and 'poem_style' columns in place.
# NOTE(review): only `positive` is swapped; `negative` is loaded from the same
# file with the same column names and is never swapped -- confirm which
# layout is the correct one.
swap_columns(positive,'class','poem_style')

In [None]:
# Preview the first rows after the swap.
positive.head()

# Count of positive records

In [None]:
# Number of rows loaded.
print(len(positive))

# Getting poem text

In [None]:
positive['poem_style'].head()

# Getting target class

In [None]:
positive['class'].head()

# Checking null values

In [None]:
# Show the first rows that have a null in at least one column.
positive[positive.isnull().any(axis=1)].head()

# Checking null values count

In [None]:
# Count of rows that have a null in at least one column.
np.sum(positive.isnull().any(axis=1))

In [None]:
# Per column: does it contain any null at all?
positive.isnull().any(axis=0)

In [None]:
# Column dtypes, non-null counts and memory usage.
positive.info()

# cleaning poem text

In [None]:
for letter in '#.][!XR':
    positive['poem_style'] = positive['poem_style'].astype(str).str.replace(letter,'')

In [None]:
positive.head()

# Preprocess data

In [None]:
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Built once at definition time; the table depends only on the constant
# above, so rebuilding it for every text was wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string; characters not in ``punctuations_list`` are unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)

# normalize_arabic

In [None]:
def normalize_arabic(text):
    """Fold Arabic letter variants onto one canonical letter each.

    The substitutions run in order: the alef forms إ أ آ become ا,
    ى becomes ي, ة becomes ه, and گ becomes ك.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, canonical in substitutions:
        text = re.sub(pattern, canonical, text)
    return text

# remove_repeating_char

In [None]:
def remove_repeating_char(text):
    """Collapse each run of one repeated character down to a single copy."""
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

# applying processPost function for preprocessing

In [None]:
# No cell visible in this notebook defines `processPost`, so the call below
# raised NameError. It is defined here as the composition of the three
# cleaning helpers declared above.
# NOTE(review): if the original definition did more (e.g. URL or mention
# stripping), restore those steps here.
def processPost(text):
    """Clean one text: drop punctuation, normalize letters, collapse repeats."""
    text = remove_punctuations(text)
    text = normalize_arabic(text)
    return remove_repeating_char(text)


# Run the cleaning routine over every text in the column.
positive["poem_style"] = positive['poem_style'].apply(processPost)

# Getting Tokenize the poem text

In [None]:
# Tokenizer that keeps each run of word characters (\w+) as one token.
tokenizer = RegexpTokenizer(r'\w+')
# Replace every text with its list of tokens.
positive["poem_style"] = positive["poem_style"].apply(tokenizer.tokenize)

In [None]:
# Preview the token lists.
positive["poem_style"].head()

# Stop words 

In [None]:
# Arabic stop words from the NLTK corpus.
# NOTE(review): no nltk.download('stopwords') call is visible in this
# notebook; presumably the corpus must already be installed -- confirm.
stopwords_list = stopwords.words('arabic')

In [None]:
stopwords_list

In [None]:
# How many stop words there are.
print(len(stopwords_list))

In [None]:
# Container type returned by NLTK.
print(type(stopwords_list))

In [None]:
# All stop words joined into one space-separated string (for display).
listToStr = ' '.join([str(elem) for elem in stopwords_list]) 

In [None]:
listToStr

# Removing stop words

In [None]:
# Test membership against a set: the list version rescanned every stop word
# for every token of every text.
# NOTE(review): the stop words are used as NLTK provides them; if the texts
# were letter-normalized earlier, some stop words may no longer match --
# confirm.
stopword_set = set(stopwords_list)
positive["poem_style"] = positive["poem_style"].apply(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)

# poem text information

In [None]:
# Token count of every text, in row order.
sentence_lengths = list(map(len, positive["poem_style"]))

# Flatten the per-text token lists into one list of all tokens.
all_words = []
for tokens in positive["poem_style"]:
    all_words.extend(tokens)

# Distinct tokens, sorted.
VOCAB = sorted(set(all_words))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

# top 25 words in positive

In [None]:
# Frequency of every token across the positive texts.
counter = Counter(all_words)

In [None]:
# The 25 most frequent tokens with their counts.
counter.most_common(25)

In [None]:
counted_words = Counter(all_words)

# Parallel lists for plotting: the 25 most frequent tokens and their counts.
words = [token for token, _ in counted_words.most_common(25)]
counts = [total for _, total in counted_words.most_common(25)]

In [None]:
# Ten evenly spaced colours sampled from the rainbow colormap.
# NOTE(review): 10 colours for up to 25 bars -- presumably matplotlib repeats
# them across the bars; confirm that is the intended look.
colors = cm.rainbow(np.linspace(0, 1, 10))
# Sets the global figure size (width, height), so later figures inherit it.
rcParams['figure.figsize'] = 20, 10

plt.title('Top words in positive')
plt.xlabel('Count')
plt.ylabel('Words')
# One horizontal bar per word; the bar length is the word's count.
plt.barh(words, counts, color=colors)

# Arabic_poem_negative dataset 

In [None]:
# Column names applied to the header-less, tab-separated file.
cols = ['class', 'poem_style']
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are reported and
# skipped instead of aborting the load.
# NOTE(review): this is the same file path the `positive` frame is loaded
# from -- confirm a separate negative file was not intended.
negative = pd.read_csv(
    'BookOpinionsDS_clean.csv',
    sep='\t',
    on_bad_lines='warn',
    header=None,
    names=cols,
)

# showing top 5 records

In [None]:
# Preview the first rows.
negative.head()

# Count of negative records

In [None]:
# Number of rows loaded.
print(len(negative))

# Getting poem text

In [None]:
negative['poem_style'].head()

# Getting target class

In [None]:
negative['class'].head()

# Checking null values

In [None]:
# Show the first rows that have a null in at least one column.
negative[negative.isnull().any(axis=1)].head()

# Checking null values count

In [None]:
# Count of rows that have a null in at least one column.
np.sum(negative.isnull().any(axis=1))

In [None]:
# Per column: does it contain any null at all?
negative.isnull().any(axis=0)

In [None]:
# Column dtypes, non-null counts and memory usage.
negative.info()

# cleaning poem text

In [None]:
for letter in '#.][!XR':
    negative['poem_style'] = negative['poem_style'].astype(str).str.replace(letter,'')

In [None]:
negative.head()

# applying processPost function for preprocessing

In [None]:
# Run the processPost cleaning routine over every text in the column.
negative["poem_style"] = negative['poem_style'].apply(processPost)

# Getting Tokenize the poem text

In [None]:
# Tokenizer that keeps each run of word characters (\w+) as one token.
tokenizer = RegexpTokenizer(r'\w+')
# Replace every text with its list of tokens.
negative["poem_style"] = negative["poem_style"].apply(tokenizer.tokenize)

In [None]:
# Preview the token lists.
negative["poem_style"].head()

# Removing stop words

In [None]:
# Test membership against a set: the list version rescanned every stop word
# for every token of every text.
# NOTE(review): the stop words are used as NLTK provides them; if the texts
# were letter-normalized earlier, some stop words may no longer match --
# confirm.
stopword_set = set(stopwords_list)
negative["poem_style"] = negative["poem_style"].apply(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)

# poem text information

In [None]:
# Token count of every text, in row order.
sentence_lengths = list(map(len, negative["poem_style"]))

# Flatten the per-text token lists into one list of all tokens.
all_words = []
for tokens in negative["poem_style"]:
    all_words.extend(tokens)

# Distinct tokens, sorted.
VOCAB = sorted(set(all_words))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

# top 25 words in negative

In [None]:
# Frequency of every token across the negative texts.
counter = Counter(all_words)

In [None]:
# The 25 most frequent tokens with their counts.
counter.most_common(25)

In [None]:
counted_words = Counter(all_words)

# Parallel lists for plotting: the 25 most frequent tokens and their counts.
words = [token for token, _ in counted_words.most_common(25)]
counts = [total for _, total in counted_words.most_common(25)]

In [None]:
# Ten evenly spaced colours sampled from the rainbow colormap.
# NOTE(review): 10 colours for up to 25 bars -- presumably matplotlib repeats
# them across the bars; confirm that is the intended look.
colors = cm.rainbow(np.linspace(0, 1, 10))
# Sets the global figure size (width, height), so later figures inherit it.
rcParams['figure.figsize'] = 20, 10

plt.title('Top words in negative')
plt.xlabel('Count')
plt.ylabel('Words')
# One horizontal bar per word; the bar length is the word's count.
plt.barh(words, counts, color=colors)

# Combining the positive and negative classes and tweets text

In [None]:
# Stack the two frames row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each frame's own labels are kept, so the index
# holds duplicates and no longer matches the freshly indexed feature table
# built from this frame further down.
# NOTE(review): both frames are read from the same CSV and only `positive`
# has its two columns swapped -- confirm the stacked columns hold comparable
# data.
final_data = pd.concat([positive, negative], axis=0, ignore_index=True)

In [None]:
# Preview the combined frame.
final_data.head()

# total count of final data

In [None]:
# Number of rows after combining both frames.
print(len(final_data))

# Count of each target class

In [None]:
y=final_data['class']
# Rows per distinct class value.
y.value_counts()

In [None]:
# Bar chart of the number of rows per class value.
sns.countplot(data= final_data, x = "class")
plt.show()

# Features Extraction from poem text with TFIDF unigram

In [None]:
# Unigram TF-IDF over words, capped at 10000 features, with sublinear
# term-frequency scaling and unicode accent stripping.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 1),
    max_features=10000)

# Each row holds a token list; astype('str') passes its printed form to the
# vectorizer, which extracts the words again with its own token pattern.
unigramdataGet = word_vectorizer.fit_transform(final_data['poem_style'].astype('str'))
# Dense array: rows are texts, columns are vocabulary terms.
unigramdataGet = unigramdataGet.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    vocab = list(word_vectorizer.get_feature_names_out())
else:
    vocab = word_vectorizer.get_feature_names()
# Round weights to one decimal, then binarize: any weight that is still
# positive after rounding becomes 1, so very small weights end up as 0.
unigramdata_features = pd.DataFrame(np.round(unigramdataGet, 1), columns=vocab)
unigramdata_features[unigramdata_features > 0] = 1

unigramdata_features.head()

# Encoding class as 1 for the positive class and 0 for the negative class

In [None]:
# Map each distinct class value to an integer code.
# NOTE(review): LabelEncoder numbers the classes in sorted order of their
# values, so which class gets 1 depends on the label text -- confirm it
# matches the heading above.
pro= preprocessing.LabelEncoder()
encpro=pro.fit_transform(final_data['class'])
# Overwrite the class column with the integer codes.
final_data['class'] = encpro

# By getting features and Class

In [None]:
# Target vector: the encoded class column.
y=final_data['class']
# Feature matrix: the binarized unigram table.
X=unigramdata_features

# Splitting dataset into 80% training and 20% testing

In [None]:
# test_size=0.20 holds out 20% of the rows for testing; random_state=2 fixes
# the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

# --------Training and Testing with Machine Learning Algorithms ----------------

# Naive Bayes Algorithm

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split; fit() returns
# the estimator itself, so construction and fitting can be chained.
nb = GaussianNB().fit(X_train, y_train)
nb

#### Accuracy

In [None]:
# Predictions on the held-out split, reused by the metric cells below.
y_pred = nb.predict(X_test)
# Accuracy on the test split, kept in nb_1 and printed to three decimals.
nb_1 = nb.score(X_test, y_test)
print('Accuracy= {:.3f}'.format(nb_1))

#### Precision

In [None]:
# This cell printed f1_score under the "Precision" label; use precision_score.
# Scores are fractions in [0, 1], so scale by 100 to match the '%' suffix.
print('Precision', round(precision_score(y_test, y_pred) * 100, 2), '%')

#### Recall

In [None]:
# recall_score is a fraction in [0, 1]; scale by 100 so the printed number
# matches the '%' suffix.
print('Recall', round(recall_score(y_test, y_pred) * 100, 2), '%')

#### F1

In [None]:
# Compute the score once and reuse it for the stored and printed values.
f1_value = f1_score(y_test, y_pred)
# Stored as a fraction rounded to two decimals.
# NOTE(review): every model's F1 cell overwrites this same `rf_f1` name --
# confirm that is intended.
rf_f1 = round(f1_value, 2)
# The score is a fraction in [0, 1]; scale by 100 to match the '%' suffix.
print('F1', round(f1_value * 100, 2), '%')

# RidgeClassifier Algorithm

In [None]:
# Fit a ridge classifier on the training split; fit() returns the estimator
# itself, so construction and fitting can be chained.
RC = RidgeClassifier().fit(X_train, y_train)
RC

#### Accuracy

In [None]:
# Predictions on the held-out split, reused by the metric cells below.
y_pred = RC.predict(X_test)
# Accuracy on the test split, kept in rc_1 and printed to three decimals.
rc_1 = RC.score(X_test, y_test)
print('Accuracy= {:.3f}'.format(rc_1))

#### Precision

In [None]:
# This cell printed f1_score under the "Precision" label; use precision_score.
# Scores are fractions in [0, 1], so scale by 100 to match the '%' suffix.
print('Precision', round(precision_score(y_test, y_pred) * 100, 2), '%')

#### Recall

In [None]:
# recall_score is a fraction in [0, 1]; scale by 100 so the printed number
# matches the '%' suffix.
print('Recall', round(recall_score(y_test, y_pred) * 100, 2), '%')

#### F1

In [None]:
# Compute the score once and reuse it for the stored and printed values.
f1_value = f1_score(y_test, y_pred)
# Stored as a fraction rounded to two decimals.
# NOTE(review): every model's F1 cell overwrites this same `rf_f1` name --
# confirm that is intended.
rf_f1 = round(f1_value, 2)
# The score is a fraction in [0, 1]; scale by 100 to match the '%' suffix.
print('F1', round(f1_value * 100, 2), '%')

# PassiveAggressiveClassifier Algorithm

In [None]:
# Fit a passive-aggressive classifier on the training split; fit() returns
# the estimator itself, so construction and fitting can be chained.
PC = PassiveAggressiveClassifier().fit(X_train, y_train)
PC

#### Accuracy

In [None]:
# Predictions on the held-out split, reused by the metric cells below.
y_pred = PC.predict(X_test)
# Accuracy on the test split, kept in pc_1 and printed to three decimals.
pc_1 = PC.score(X_test, y_test)
print('Accuracy= {:.3f}'.format(pc_1))

#### Precision

In [None]:
# This cell printed f1_score under the "Precision" label; use precision_score.
# Scores are fractions in [0, 1], so scale by 100 to match the '%' suffix.
print('Precision', round(precision_score(y_test, y_pred) * 100, 2), '%')

#### Recall

In [None]:
# recall_score is a fraction in [0, 1]; scale by 100 so the printed number
# matches the '%' suffix.
print('Recall', round(recall_score(y_test, y_pred) * 100, 2), '%')

#### F1

In [None]:
# Compute the score once and reuse it for the stored and printed values.
f1_value = f1_score(y_test, y_pred)
# Stored as a fraction rounded to two decimals.
# NOTE(review): every model's F1 cell overwrites this same `rf_f1` name --
# confirm that is intended.
rf_f1 = round(f1_value, 2)
# The score is a fraction in [0, 1]; scale by 100 to match the '%' suffix.
print('F1', round(f1_value * 100, 2), '%')