In [1]:
import numpy as np
import pandas as pd
import string, re

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

In [2]:
def clean_input(text_to_clean):
  text_to_clean = text_to_clean.lower()
  text_to_clean = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text_to_clean)
  text_to_clean = re.sub(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text_to_clean)
  text_to_clean = re.sub(r'[0-9]+', 'number', text_to_clean)
  text_to_clean = re.sub(r'[^a-zA-Z\u0621-\u064A\s]', '', text_to_clean)
  text_to_clean = re.sub(r'numbernumber+', 'number', text_to_clean)
  return text_to_clean

In [3]:
train=pd.read_csv('drive/MyDrive/Colab Notebooks/Dataset/train.csv')
test=pd.read_csv('drive/MyDrive/Colab Notebooks/Dataset/test.csv')
y_test=pd.read_csv('drive/MyDrive/Colab Notebooks/Dataset/submit.csv')

In [4]:
print(f"Train Shape : {train.shape}")
print(f"Test Shape : {test.shape}")
print(f"Submit Shape : {y_test.shape}")

Train Shape : (20800, 5)
Test Shape : (5200, 4)
Submit Shape : (5200, 2)


In [5]:
train.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
train.dtypes.value_counts()

object    3
int64     2
dtype: int64

In [7]:
def fill_data(data):
    data["title"] = data["title"].fillna("No Title")
    data["text"] = data["text"].fillna("No text")
    return data

train = fill_data(train)
test = fill_data(test)

In [8]:
train.isnull().sum()

id           0
title        0
author    1957
text         0
label        0
dtype: int64

In [9]:
test.isnull().sum()

id          0
title       0
author    503
text        0
dtype: int64

In [10]:
train['text_merge'] = train['title'].astype(str) + " " + train['text'].astype(str)
x_train = train['text_merge']
y_train = train.drop(['id','title','author','text','text_merge'],axis=1)
y_train = np.array(y_train)

print(x_train.shape)
print(y_train.shape)


(20800,)
(20800, 1)


In [11]:
test['text_merge'] = test['title'].astype(str) + " " + test['text'].astype(str)
x_test = test['text_merge']
y_test = y_test.drop(['id'],axis=1)
y_test = np.array(y_test)

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

punc = list(string.punctuation)
stop_words = stopwords.words("english")
stop_words_2 = stopwords.words("arabic")


def processing_text(data):
    data.lower()

    data = " ".join([word for word in word_tokenize(data)
                        if ((word not in stop_words) and (word not in stop_words_2) and (word not in punc))])

    return data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
for i in range(x_train.shape[0]):
    x_train.values[i] = clean_input(x_train[i])
    x_train.values[i] = processing_text(x_train[i])

In [14]:
x_train.head()

0    house dem aide didnt even see comeys letter ja...
1    flynn hillary clinton big woman campus breitba...
2    truth might get fired truth might get fired oc...
3    number civilians killed single us airstrike id...
4    iranian woman jailed fictional unpublished sto...
Name: text_merge, dtype: object

In [15]:
x_test.head()

0    Specter of Trump Loosens Tongues, if Not Purse...
1    Russian warships ready to strike terrorists ne...
2    #NoDAPL: Native American Leaders Vow to Stay A...
3    Tim Tebow Will Attempt Another Comeback, This ...
4    Keiser Report: Meme Wars (E995) 42 mins ago 1 ...
Name: text_merge, dtype: object

In [16]:
number_words = 1000
maxlen = 4000
truncat = 'post'
padd = 'post'
token = '<OOV>'

tokenizer = Tokenizer(oov_token = token)
tokenizer.fit_on_texts(x_train)

word_index = tokenizer.word_index

sequences_train = tokenizer.texts_to_sequences(x_train)
X_train = pad_sequences(sequences_train, maxlen = maxlen, truncating = truncat, padding = padd)

sequences_test = tokenizer.texts_to_sequences(x_test)
X_test = pad_sequences(sequences_test, maxlen = maxlen, truncating = truncat, padding = padd)

In [17]:
# X_train_float = X_train.astype('float32')
# X_test_float = X_test.astype('float32')

# max_val = np.max([np.max(X_train_float), np.max(X_test_float)])
# min_val = np.min([np.min(X_train_float), np.min(X_test_float)])

# X_train_normalized = (X_train_float - min_val) / (max_val - min_val)
# X_test_normalized = (X_test_float - min_val) / (max_val - min_val)

# X_train_normalized = np.clip(X_train_normalized, 0, 1)
# X_test_normalized = np.clip(X_test_normalized, 0, 1)

# print(X_train_normalized.shape)
# print(X_test_normalized.shape)

In [18]:
print(X_train.shape)
print(X_test.shape)
# print(X_train)
# print(X_test)

(20800, 4000)
(5200, 4000)


In [19]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

In [20]:
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

In [21]:
def generate_wt(rows, columns):
    return np.random.randn(rows, columns)

In [22]:
def binary_crossentropy(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

In [23]:
def generate_wt(rows, columns):
    return np.random.randn(rows, columns)

In [24]:
class RpropNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.weights_input_hidden = np.random.uniform(-1, 1, (self.input_size, self.hidden_size))
        self.weights_hidden_output = np.random.uniform(-1, 1, (self.hidden_size, self.output_size))

        self.bias_hidden = np.random.uniform(-1, 1, (1, self.hidden_size))
        self.bias_output = np.random.uniform(-1, 1, (1, self.output_size))

        self.prev_gradients_wih = np.ones((self.input_size, self.hidden_size))
        self.prev_gradients_who = np.ones((self.hidden_size, self.output_size))
        self.delta_wih = np.full((self.input_size, self.hidden_size), 0.1)
        self.delta_who = np.full((self.hidden_size, self.output_size), 0.1)
        self.eta_minus = 0.5
        self.eta_plus = 1.2
        self.delta_max = 50.0
        self.delta_min = 1e-6

        self.best_weights_input_hidden = None
        self.best_weights_hidden_output = None
        self.best_bias_hidden = None
        self.best_bias_output = None
        self.best_accuracy = 0.0

    def train(self, X, y, epochs):
        for epoch in range(epochs):

            hidden_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
            hidden_output = sigmoid(hidden_input)
            output_input = np.dot(hidden_output, self.weights_hidden_output) + self.bias_output
            output = sigmoid(output_input)

            accuracy = np.mean(np.round(output) == y)
            loss = np.mean(np.square(y - output))

            print(f"Epoch {epoch + 1}/{epochs} - Accuracy: {accuracy:.4f} - Loss: {loss:.4f}")

            output_error = y - output
            output_delta = 2 * output_error * sigmoid_derivative(output)
            hidden_error = output_delta.dot(self.weights_hidden_output.T)
            hidden_delta = hidden_error * sigmoid_derivative(hidden_output)

            if accuracy > self.best_accuracy:
                self.best_weights_input_hidden = self.weights_input_hidden.copy()
                self.best_weights_hidden_output = self.weights_hidden_output.copy()
                self.best_bias_hidden = self.bias_hidden.copy()
                self.best_bias_output = self.bias_output.copy()

                self.best_accuracy = accuracy

            self.update_weights_rprop(X, hidden_output, output_delta, hidden_delta)

        self.weights_input_hidden = self.best_weights_input_hidden
        self.weights_hidden_output = self.best_weights_hidden_output
        self.bias_hidden = self.best_bias_hidden
        self.bias_output = self.best_bias_output


    def update_weights_rprop(self, X, hidden_output, output_delta, hidden_delta):
        gradient_who = hidden_output.T.dot(output_delta)
        sign_who = np.sign(gradient_who * self.prev_gradients_who)
        self.delta_who = np.where(sign_who > 0, np.minimum(self.delta_who * self.eta_plus, self.delta_max),
                                  np.maximum(self.delta_who * self.eta_minus, self.delta_min))
        self.weights_hidden_output += sign_who * self.delta_who
        self.prev_gradients_who = gradient_who

        gradient_wih = X.T.dot(hidden_delta)
        sign_wih = np.sign(gradient_wih * self.prev_gradients_wih)
        self.delta_wih = np.where(sign_wih > 0, np.minimum(self.delta_wih * self.eta_plus, self.delta_max),
                                  np.maximum(self.delta_wih * self.eta_minus, self.delta_min))
        self.weights_input_hidden += sign_wih * self.delta_wih
        self.prev_gradients_wih = gradient_wih

        self.bias_output += np.mean(output_delta, axis=0)
        self.bias_hidden += np.mean(hidden_delta, axis=0)



In [25]:
input_size = X_train.shape[1]
hidden_size = 1000
output_size = 1
epochs = 10

rprop_nn = RpropNN(input_size, hidden_size, output_size)

rprop_nn.train(X_train, y_train, epochs)

  return 1 / (1 + np.exp(-x))


Epoch 1/10 - Accuracy: 0.4790 - Loss: 0.4739
Epoch 2/10 - Accuracy: 0.6014 - Loss: 0.3866
Epoch 3/10 - Accuracy: 0.6229 - Loss: 0.3623
Epoch 4/10 - Accuracy: 0.5006 - Loss: 0.4994
Epoch 5/10 - Accuracy: 0.5006 - Loss: 0.4994
Epoch 6/10 - Accuracy: 0.5006 - Loss: 0.4994
Epoch 7/10 - Accuracy: 0.5006 - Loss: 0.4994
Epoch 8/10 - Accuracy: 0.5006 - Loss: 0.4994
Epoch 9/10 - Accuracy: 0.5006 - Loss: 0.4994
Epoch 10/10 - Accuracy: 0.5006 - Loss: 0.4994


In [26]:
best_weights_input_hidden = rprop_nn.best_weights_input_hidden
best_weights_hidden_output = rprop_nn.best_weights_hidden_output
best_bias_hidden = rprop_nn.best_bias_hidden
best_bias_output = rprop_nn.best_bias_output

In [27]:
hidden_input_test = np.dot(X_test, best_weights_input_hidden) + best_bias_hidden
hidden_output_test = sigmoid(hidden_input_test)

output_input_test = np.dot(hidden_output_test, best_weights_hidden_output) + best_bias_output
output_test = sigmoid(output_input_test)

accuracy_test = np.mean(np.round(output_test) == y_test)
print(f"Test Accuracy: {accuracy_test:.4f}")


Test Accuracy: 0.7808


  return 1 / (1 + np.exp(-x))


In [28]:
# hidden_input1_test = np.dot(X_test, rprop_nn.weights_input_hidden1) + rprop_nn.bias_hidden1
# hidden_output1_test = sigmoid(hidden_input1_test)

# hidden_input2_test = np.dot(hidden_output1_test, rprop_nn.weights_hidden1_hidden2) + rprop_nn.bias_hidden2
# hidden_output2_test = sigmoid(hidden_input2_test)

# output_input_test = np.dot(hidden_output2_test, rprop_nn.weights_hidden2_output) + rprop_nn.bias_output
# output_test = sigmoid(output_input_test)

# accuracy_test = np.mean(np.round(output_test) == y_test)
# print(f"Test Accuracy: {accuracy_test:.4f}")