In [21]:
# Load the dataset
import pandas as pd

df = pd.read_csv('train.csv')

df = df.dropna(subset=['text'])
df['all_text'] = df['title'] + ' ' + df['text']
df = df.drop(columns=['id', 'author', 'title', 'text'])

# Drop empty rows
df = df.dropna(subset=['all_text'])

In [22]:
# Preprocess the data
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove non-letter characters
    text = ''.join(c for c in text if c.isalpha() or c.isspace())
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize, alternative nltk.word_tokenize(text)
    tokens = text.split()
    
    # Remove stopwords and perform stemming
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    processed_tokens = [ps.stem(word) for word in tokens if not word in stop_words]
    processed_tokens = ' '.join(processed_tokens)
    
    return processed_tokens

X = df['all_text']
y = df['label']

In [23]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
print(X_train)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Convert text data to sequences
training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

8139     NY Times: Being a Sanctuary City ’Not Enough’ ...
12358    What’s at Stake in Trump’s Proposed E.P.A. Cut...
9562     Rick Rule: Broadcast Interview – Available Now...
5979     Hillary Clinton Cancels Public Events And Vani...
19059    Trump Declared The Winner By Jon Rappoport Bef...
                               ...                        
11625    Chaiwali, an Indian Restaurant That Feels Like...
12329    Recipe: Mouth-Watering Cauliflower, Coconut oi...
5553     Las cajetillas de tabaco emitirán música de Me...
885      Open Borders Groups Gird for H-1B Fights The o...
16251    Eighty Wealthy New Yorkers Ask State Governmen...
Name: all_text, Length: 16162, dtype: object
214620


In [34]:
# Pad the sequences to ensure uniform length
max_length = 200
train_pad = pad_sequences(training_sequences, maxlen=max_length, padding='post')
test_pad = pad_sequences(testing_sequences, maxlen=max_length, padding='post')

In [35]:
print(train_pad[0])

[ 102 5913 3484 6030   24    1 9663  587    7    5 5553   10    1   90
 3238 6750   36 1976 5631 7842   10    1 6914   36   42 1353   44  892
 1077   26 1633    3 5082  324   15 2064 3995   38 2119 5284  863    2
 1602 1022 6954   59  222  871   83   22 1022 3388    1  116    1  587
    7  632 2063   62   22 4327    9 6750   63 1333 2222   38  105 2315
 2265   11  539    7  228 9092    2   21   25 1147   20  274   75    1
  101 1587 1708  980   49   66 1778    6    1   12    4 1330 4955   55
  924  478 3125 1921  273  435    4 3680  708   28   50  324    2 1267
  980    2  409 2173  111   31 1706    9  928    2 1188    9 9093  890
   11    1  888   10    2  147    7  141 3388  683    3  632  928 2259
   99   33   21 1472 1077  833  237 9275   10    1  850   11  265 1708
   28  184  212  980    2 2802 1921    4 6077 2865  924 4436    1  811
 1218    7  480    2 1022 6954  409  158    4 1871   42   22  388    1
    9 1022 3388   11    1 7417  372    8    5 3890    9  243  635  377
   81 

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping

# Define EarlyStopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)

# Define the model architecture
model = Sequential([
    Embedding(vocab_size, 128, input_length=max_length),
    LSTM(128, dropout=0.2, return_sequences=True),
    LSTM(64, dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_pad, y_train, validation_data=(test_pad, y_test), epochs=10, batch_size=64, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 8: early stopping


<keras.callbacks.History at 0x152abd45760>

In [37]:
# Evaluate the model
loss, accuracy = model.evaluate(test_pad, y_test)
print('Loss: %f' % loss)
print('Accuracy: %f' % accuracy)

Loss: 0.235181
Accuracy: 0.930463


In [40]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Function to create model, required for KerasClassifier
def create_model(dropout_rate=0.0, optimizer='adam'):
    # Define the model architecture
    model = Sequential([
        Embedding(vocab_size, 128, input_length=max_length),
        LSTM(128, dropout=dropout_rate, return_sequences=True),
        LSTM(64, dropout=dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create KerasClassifier
model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=64, verbose=1)

# Define the parameter grid
param_grid = {
    'dropout_rate': [0.2, 0.3],
    'optimizer': ['adam', 'sgd']
}

# Create GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, verbose=1)

# Fit the grid search
grid_result = grid.fit(train_pad, y_train, validation_data=(test_pad, y_test), callbacks=[es])

# Print the best parameters and best accuracy
print("Best parameters: ", grid_result.best_params_)
print("Best accuracy: ", grid_result.best_score_)

  model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=64, verbose=1)


Fitting 2 folds for each of 4 candidates, totalling 8 fits
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 8: early stopping
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: early stopping
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: early stopping
Best parameters:  {'dropout_rate': 0