In [None]:
import pandas as pd

In [None]:
# Load the training and test CSV files (paths relative to the notebook's
# working directory) into pandas DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Preview the first rows of the training frame (rendered as the cell output)
train_df.head()

In [None]:
# Inspect the columns between the first column and the last two (presumably
# keyword and location — confirm against train.csv) to get a feel for the data.
# Note: the count ignores missing values, whereas the printed array of unique
# values includes NaN when present.
inspected_columns = train_df.columns[1:-2]
for column_name in inspected_columns:
    distinct_count = train_df[column_name].nunique()
    distinct_values = train_df[column_name].unique()
    print('{} values of {} column: {} \n '.format(distinct_count, column_name, distinct_values))

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer, SnowballStemmer

'''
STEMMING:
PorterStemmer and SnowballStemmer give similar results but Porter stemmer is an older algorithm.
It’s from the 1980s and its main concern is removing the common endings to words so that they can be resolved 
to a common form. Typically, it’s not really advised to use it for any production/complex application. 
Instead, it has its place in research as a nice, basic stemming algorithm that can guarantee reproducibility. 
It also is a very gentle stemming algorithm when compared to others.
Snowball stemmer is also known as the Porter2 stemming algorithm. 
It is almost universally accepted as better than the Porter stemmer, 
even being acknowledged as such by the individual who created the Porter stemmer. 
That being said, it is also more aggressive than the Porter stemmer.
A lot of the things added to the Snowball stemmer were because of issues noticed with the Porter stemmer. 
There is about a 5% difference in the way that Snowball stems versus Porter.


One more thing before I wrap up here: If you choose to use either lemmatization or stemming in your NLP application, 
always be sure to test performance with that addition. In many applications, 
you may find that either ends up messing with performance in a bad way just as often as it helps boost performance. 
Both of these techniques are really designed with recall in mind, but precision tends to suffer as a result. 
But if recall is what you’re aiming for (like with a search engine) then maybe that’s alright!
https://towardsdatascience.com/stemming-lemmatization-what-ba782b7c0bd8
'''

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Download the NLTK resources this notebook relies on (only once; need not run it again).
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer (raises LookupError when the corpus is missing)
#   punkt     -> word_tokenize
# NOTE(review): some newer NLTK releases look up 'punkt_tab' rather than 'punkt' —
# confirm against the installed version.

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt')

In [None]:
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))
import regex as re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Function to preprocess data with NLTK
def preprocess_nltk(text):
    """Clean, tokenize and normalise a collection of documents with NLTK.

    For every document: replace each non-alphanumeric character with a space,
    lowercase, split into tokens with ``word_tokenize``, drop tokens found in
    the module-level ``stop_words`` set, then lemmatize each remaining token
    as a verb and stem it with the Porter stemmer.

    Stop-word removal, lemmatization and stemming are applied per token.
    Applying them to the whole document string (as a single "word") leaves
    stop words in place and only alters the tail of the document.

    Args:
        text: iterable of document strings.

    Returns:
        list[list[str]]: one list of processed tokens per input document.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    processed = []
    for document in text:
        cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', document)
        # Lowercase before the stop-word lookup: the NLTK list is lowercase
        tokens = word_tokenize(cleaned.lower())
        processed.append([
            stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in tokens
            if token not in stop_words
        ])
    return processed

# Function to preprocess data with Gensim
def preprocess_gensim(text):
    """Clean, tokenize and normalise a collection of documents with Gensim.

    For every document: replace each non-alphanumeric character with a space,
    tokenize with ``gensim.utils.simple_preprocess``, drop Gensim stop words
    and tokens of three characters or fewer, then lemmatize each remaining
    token as a verb and stem it with the Porter stemmer.

    Lemmatization and stemming run on individual tokens; running them on the
    untokenized sentence treats the whole sentence as one word and leaves
    almost every word unstemmed.

    Args:
        text: iterable of document strings.

    Returns:
        list[list[str]]: one list of processed tokens per input document.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS

    result = []
    for document in text:
        # Remove non-alphanumeric characters from the document
        cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', document)
        tokens = gensim.utils.simple_preprocess(cleaned)
        # Filter on the raw token (the stop-word list is unstemmed), then normalise
        result.append([
            stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in tokens
            if token not in gensim_stopwords and len(token) > 3
        ])
    return result

In [None]:
# NLTK stopwords vs. Gensim stopwords (label corrected: the library is NLTK, not "NTLK")
print('NLTK stopwords: {} \n \n \n Gensim stopwords: {}'.format(stop_words, gensim.parsing.preprocessing.STOPWORDS))

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Extra stop words specific to this tweet data, added to the WordCloud defaults
additional_wordcloud_stopwords = ['http', 'https', 'co', 'amp', 'you', 'to', 'us', 'will']

# Create a WordCloud of real disaster tweets (target == 1)
real_data = train_df[train_df['target'] == 1]
combined_real_data = " ".join(
    re.sub(r'[^a-zA-Z0-9]', ' ', tweet) for tweet in real_data['text'].tolist()
)

# set.update() mutates in place and returns None, so passing its result hands
# WordCloud stopwords=None. union() builds the combined set explicitly and
# leaves the library's STOPWORDS untouched.
wc = WordCloud(background_color='white', max_words=50,
               stopwords=STOPWORDS.union(additional_wordcloud_stopwords))

plt.imshow(wc.generate(combined_real_data))
plt.axis('off')
plt.show()

In [None]:
# Create a WordCloud of fake disaster tweets (target == 0)
fake_data = train_df[train_df['target'] == 0]
combined_fake_data = " ".join(
    re.sub(r'[^a-zA-Z0-9]', ' ', tweet) for tweet in fake_data['text'].tolist()
)

# set.update() mutates in place and returns None, so passing its result hands
# WordCloud stopwords=None. union() builds the combined set explicitly and
# leaves the library's STOPWORDS untouched.
wc = WordCloud(background_color='white', max_words=50,
               stopwords=STOPWORDS.union(additional_wordcloud_stopwords))

plt.imshow(wc.generate(combined_fake_data))
plt.axis('off')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for evaluation (seeded for repeatability)
texts = train_df['text'].to_list()
labels = train_df['target'].to_list()
X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=0)

# Turn each raw tweet into a list of cleaned tokens
words_train = preprocess_gensim(X_train)
words_test = preprocess_gensim(X_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Extract Bag-of-Words (BoW).
# The documents are already token lists, so the preprocessor and tokenizer are
# identity functions. token_pattern=None states that the regex is unused and
# avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x,
                             token_pattern=None)

# Learn the vocabulary on the training split only, then reuse it for the test split
features_train = vectorizer.fit_transform(words_train).toarray()

features_test = vectorizer.transform(words_test).toarray()

# Mapping of token -> column index learned from the training split
vocabulary = vectorizer.vocabulary_

In [None]:
import random

# Show the vocabulary size, a random handful of its words, and the first
# BoW row of each split as a sanity check.
vocabulary_words = list(vocabulary.keys())
print("Vocabulary: {} words".format(len(vocabulary)))
print("Sample words: {}".format(random.sample(vocabulary_words, 8)))
print('\n')
for first_row in (features_train[0], features_test[0]):
    print(first_row)

In [None]:
'''
Zipf's law
Zipf's law, named after the famous American linguist George Zipf, 
is an empirical law stating that given a large collection of documents, 
the frequency of any word is inversely proportional to its rank in the frequency table. 
So the most frequent word will occur about twice as often as the second most frequent word, 
three times as often as the third most frequent word, and so on. 
In the figure below we plot number of appearances of each word in our training set against its rank.
'''

In [None]:
import numpy as np

# Find number of occurrences for each word in the training set
word_freq = features_train.sum(axis=0)

# Sort it in descending order
sorted_word_freq = np.sort(word_freq)[::-1]

# Ranks are 1-based. Plotting against the implicit 0-based index would place
# the most frequent word at x=0, which a log-scaled axis cannot display.
ranks = np.arange(1, len(sorted_word_freq) + 1)

# Plot frequency against rank on log-log axes
plt.plot(ranks, sorted_word_freq)
plt.gca().set_xscale('log')
plt.gca().set_yscale('log')
plt.xlabel('Rank')
plt.ylabel('Number of occurrences')
plt.show()

In [None]:
import sklearn.preprocessing as pr

# Normalize BoW features in training and test set.
# axis=1 rescales every document (row) to unit L2 norm independently of the
# other rows, so train and test land on the same scale. axis=0 would divide
# each column by the norm of whichever matrix is passed, giving the two
# splits different scalings.
# NOTE(review): any other feature matrix fed to the fitted models (e.g. one
# built later with vectorizer.transform) needs this same row normalisation.
features_train = pr.normalize(features_train, axis=1)
features_test = pr.normalize(features_test, axis=1)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes classifier on the BoW features
nb = GaussianNB()
nb.fit(features_train, y_train)

# Calculate the mean accuracy score on training and test sets
nb_train_accuracy = nb.score(features_train, y_train)
nb_test_accuracy = nb.score(features_test, y_test)
print("[{}] Accuracy: train = {}, test = {}".format(
    type(nb).__name__, nb_train_accuracy, nb_test_accuracy))

'''
Tree-based algorithms often work quite well on Bag-of-Words as their highly discontinuous and sparse nature 
is nicely matched by the structure of trees. 
As your next task, you will try to improve on the Naive Bayes classifier's performance by using 
scikit-learn's Gradient-Boosted Decision Tree classifier.
'''

In [None]:
# Try Logistic Regression
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with built-in 5-fold cross-validation, selected on accuracy
logreg = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=0,
    max_iter=1000,
)
logreg.fit(features_train, y_train)

# Report the score on the training and test splits
logreg_train_accuracy = logreg.score(features_train, y_train)
logreg_test_accuracy = logreg.score(features_test, y_test)
print("[{}] Accuracy: train = {}, test = {}".format(
    type(logreg).__name__, logreg_train_accuracy, logreg_test_accuracy))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Small gradient-boosted ensemble: five depth-1 trees with a learning rate of 1.0
gbc = GradientBoostingClassifier(
    n_estimators=5,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
gbc.fit(features_train, y_train)

# Mean accuracy on each split, printed to three decimals
gbc_train_accuracy = gbc.score(features_train, y_train)
print('Accuracy of the GBM on training set: {:.3f}'.format(gbc_train_accuracy))
gbc_test_accuracy = gbc.score(features_test, y_test)
print('Accuracy of the GBM on test set: {0:.3f}'.format(gbc_test_accuracy))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Candidate var_smoothing values: ten points log-spaced from 1 down to 1e-9
params_NB = dict(var_smoothing=np.logspace(0, -9, num=10))

# 2-fold stratified CV repeated three times, seeded for repeatability
cv_method = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=0)

# Exhaustive search over the grid for the Naive Bayes estimator, scored on accuracy
gs_NB = GridSearchCV(
    estimator=nb,
    param_grid=params_NB,
    scoring='accuracy',
    cv=cv_method,
    verbose=1,
)

# Trailing semicolon keeps the fitted-estimator repr out of the cell output
gs_NB.fit(features_train, y_train);

In [None]:
# Best var_smoothing found by the grid search and its cross-validated accuracy
print(gs_NB.best_params_)
print(gs_NB.best_score_)

In [None]:
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression


# Base estimator: logistic regression with the saga solver and a loose tolerance
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=1000, random_state=0)

# NOTE(review): `hyperparameters` is defined but not used by the search below —
# looks like a leftover from a randomized search; confirm before removing.
hyperparameters = {'C': uniform(loc=0, scale=4), 'penalty': ['l2', 'l1']}

# Values of C actually searched
param_grid = dict(C=[100, 10, 1.0, 0.1, 0.01])

# 2-fold stratified CV repeated three times, seeded for repeatability
k = RepeatedStratifiedKFold(n_splits=2, n_repeats=3, random_state=0)

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=k, n_jobs=4, verbose=1)
grid.fit(features_train, y_train)

print('Best C:', grid.best_params_)

In [None]:
# Score the grid-searched logistic regression on the training and test splits
print('Accuracy of the GridSearch on training set: {:.3f}'.format(grid.score(features_train, y_train)))
print('Accuracy of the GridSearch on test set: {0:.3f}'.format(grid.score(features_test, y_test)))

In [None]:
from sklearn.model_selection import cross_val_score   #Additional scklearn functions

# 3-fold cross-validated ROC AUC on the training split for, in order:
# gradient boosting, Naive Bayes, LogisticRegressionCV, grid-searched logistic regression
cv_score1, cv_score2, cv_score3, cv_score4 = (
    cross_val_score(estimator, features_train, y_train, cv=3, scoring='roc_auc')
    for estimator in (gbc, nb, logreg, grid)
)

In [None]:
# Per-fold ROC AUC for each model, in the order they were evaluated
for fold_scores in (cv_score1, cv_score2, cv_score3, cv_score4):
    print(fold_scores)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Hard class predictions from each fitted model (used for accuracy and the reports):
#   1 -> Gaussian Naive Bayes, 2 -> LogisticRegressionCV, 3 -> grid-searched LogisticRegression
y_pred1 = nb.predict(features_test)
y_pred2 = logreg.predict(features_test)
y_pred3 = grid.predict(features_test)

# Positive-class probabilities for ROC AUC. The metric ranks samples by score,
# so feeding it 0/1 predictions reduces the curve to a single operating point.
y_score1 = nb.predict_proba(features_test)[:, 1]
y_score2 = logreg.predict_proba(features_test)[:, 1]
y_score3 = grid.predict_proba(features_test)[:, 1]


print("---Test Set Results---")
print("Accuracy with Gaussian: {}".format(accuracy_score(y_test, y_pred1)))
print("Accuracy with logreg: {}".format(accuracy_score(y_test, y_pred2)))
print("Accuracy with grid-searched logreg: {}".format(accuracy_score(y_test, y_pred3)))

# Each line is labelled with the model it actually reports
print("AUC Score with Gaussian: {}".format(roc_auc_score(y_test, y_score1)))
print("AUC Score with logreg: {}".format(roc_auc_score(y_test, y_score2)))
print("AUC Score with grid-searched logreg: {}".format(roc_auc_score(y_test, y_score3)))


print(classification_report(y_test, y_pred1))
print(classification_report(y_test, y_pred2))
print(classification_report(y_test, y_pred3))

In [None]:
# Raw tweet texts from the unlabelled test file, as a plain Python list
test = test_df['text'].to_list()

In [None]:
# Apply the same Gensim-based preprocessing that was used on the labelled split.
# NOTE(review): this rebinds `test`, so re-running the cell feeds token lists
# (not strings) back into preprocess_gensim.
test = preprocess_gensim(test)

In [None]:
# Project the token lists onto the vocabulary learned from the training split.
# NOTE(review): features_train/features_test are passed through pr.normalize
# before the models are fitted, but this matrix stays as raw counts — the
# inputs to grid.predict are likely on a different scale; confirm and align.
test = vectorizer.transform(test).toarray()

In [None]:
# Predict labels for the unlabelled tweets with the grid-searched logistic regression
test_pred = grid.predict(test)

In [None]:
# Display the predicted labels as the cell output
test_pred

In [None]:
# Switch to RNNs

from keras.preprocessing.text import Tokenizer

# Build an integer index over the training tokens; num_words caps which
# indices texts_to_sequences will emit.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(words_train)

# Encode both splits as sequences of word indices
# (rebinds X_train / X_test, which previously held the raw tweet texts)
X_train, X_test = (
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (words_train, words_test)
)

# Size of the full word index, plus one for the reserved 0 index
vocabulary_size = 1 + len(tokenizer.word_index)

print(words_train[2])

In [None]:
# Longest tokenised tweet in the training split, measured in tokens
# (0 when there are no training documents)
max_length = max((len(tokens) for tokens in words_train), default=0)

print(max_length)

In [None]:
from keras.preprocessing.sequence import pad_sequences 

# Bring every sequence in both splits to max_length entries, padding at the end.
# NOTE(review): rebinds X_train / X_test in place of the unpadded sequences.
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=max_length)
    for sequences in (X_train, X_test)
)

In [None]:
from keras.models import Sequential
from keras import layers

# Width of each learned word vector
embedding_dim = 50

# Embedding -> global max pooling over the sequence -> small dense head with
# a sigmoid output for the binary target.
model = Sequential([
    layers.Embedding(input_dim=vocabulary_size,
                     output_dim=embedding_dim,
                     input_length=max_length),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Fit and evaluate the model.
# The labels come out of train_test_split as plain Python lists. Some Keras/TF
# versions reject a list of ints as targets ("Failed to find data adapter"),
# so pass NumPy arrays instead.
y_train_arr = np.asarray(y_train)
y_test_arr = np.asarray(y_test)

model.fit(X_train, y_train_arr, epochs=50, verbose=False,
          validation_data=(X_test, y_test_arr), batch_size=10)

loss, accuracy = model.evaluate(X_train, y_train_arr, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test_arr, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
import os

# Persist the trained model under a local cache directory, creating it if needed
cache_dir = os.path.join("cache", "sentiment_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)

model_file = "rnn_model.h5"  # HDF5 file
model_path = os.path.join(cache_dir, model_file)
model.save(model_path)

# Later you can load it using keras.models.load_model()
#from keras.models import load_model
#model = load_model(os.path.join(cache_dir, model_file))

In [None]:
! pip freeze > requirements.txt