# Loading needed packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import string
import nltk    
import matplotlib.pyplot as plt    

from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn import feature_extraction, linear_model, model_selection, preprocessing
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# Define some useful functions below

Use the function below to process a tweet; it returns a list of cleaned, stemmed words.

In [None]:
# Shared NLTK helpers, built on the first call to process_tweet and reused
# afterwards so the stop-word corpus is not re-read for every tweet.
_TWEET_RESOURCES = {}


def _get_tweet_resources():
    """Return the shared (stemmer, stop-word set, tokenizer) triple."""
    if not _TWEET_RESOURCES:
        # Build everything in locals first so a failure (e.g. the stop-word
        # corpus is not downloaded yet) does not leave a half-filled cache.
        stemmer = PorterStemmer()
        # A frozenset gives O(1) membership tests instead of a list scan.
        stopwords_english = frozenset(stopwords.words('english'))
        tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                                   reduce_len=True)
        _TWEET_RESOURCES.update(stemmer=stemmer,
                                stopwords=stopwords_english,
                                tokenizer=tokenizer)
    return (_TWEET_RESOURCES['stemmer'],
            _TWEET_RESOURCES['stopwords'],
            _TWEET_RESOURCES['tokenizer'])


def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
            (tokenized, stop words and punctuation removed, stemmed)

    """
    stemmer, stopwords_english, tokenizer = _get_tweet_resources()

    # drop '$' followed by any word characters (e.g. '$GE')
    tweet = re.sub(r'\$\w*', '', tweet)
    # drop a leading retweet marker "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # drop hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # drop only the '#' sign, keeping the hashtag word itself
    tweet = re.sub(r'#', '', tweet)

    # Keep tokens that are neither stop words nor punctuation, then stem them.
    # The punctuation check is a substring test against string.punctuation,
    # exactly as before.
    return [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english and word not in string.punctuation
    ]

Use the function below to build the frequency dictionary, which maps each (word, label) pair (label is 1 or 0) to the number of times that word appears in tweets with that label.

In [None]:
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # ravel (rather than squeeze) always produces a 1-D sequence, so a single
    # tweet/label pair still yields a one-element list that zip can iterate.
    labels = np.ravel(ys).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

# Explore datasets

In [None]:
# Read the labelled data once, hold out 2000 random rows as a local test set,
# and train on the remaining rows.
labelled = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = labelled.sample(2000)
# DataFrame.append was removed in pandas 2.0; dropping the sampled index
# labels removes exactly the held-out rows from the training frame.
train = labelled.drop(index=test.index)

# Unlabelled competition data used for the final submission.
validation = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

Download the stop-word list.

In [None]:
# Fetch the NLTK 'stopwords' corpus, which process_tweet reads through
# stopwords.words('english').
nltk.download('stopwords')

# Data preparation

In [None]:
# Tweet texts of the training rows, separated by label (1 first, then 0).
train_df1 = train.loc[train['target'] == 1, 'text'].tolist()
train_df0 = train.loc[train['target'] == 0, 'text'].tolist()
# Report the size of each class, one count per line.
print(len(train_df1), len(train_df0), sep='\n')

In [None]:
# All training tweets: label-1 tweets first, label-0 tweets after them.
tweets = [*train_df1, *train_df0]

# Report the total number of training tweets.
print("Number of tweets: ", len(tweets))

# Label vector in the same order as `tweets`: ones, then zeros.
labels = np.concatenate([np.ones(len(train_df1)), np.zeros(len(train_df0))])


In [None]:
# Create the frequency dictionary: each (word, label) pair maps to the number
# of times the processed word occurs in training tweets with that label.
freqs = build_freqs(tweets, labels)

# Model preparation - sigmoid, gradient descent

In [None]:
# Define sigmoid function
def sigmoid(z):
    """Logistic sigmoid function.
    Input:
        z: a scalar, a list of numbers, or a numpy array
    Output:
        h: 1 / (1 + exp(-z)), computed element-wise
    """
    # np.negative works on lists too; `z * (-1)` on a Python list produces an
    # empty list, which made the result silently empty for list input.
    return 1 / (1 + np.exp(np.negative(z)))

In [None]:
def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.
    Input:
        x: an m x n matrix of features (one row per example)
        y: the m labels (0 or 1); any array-like holding m values
        theta: the initial weight vector (n x 1)
        alpha: the learning rate
        num_iters: the number of gradient steps to run
    Output:
        J: the cost as a Python float, evaluated with the weights used in
            the last iteration before its update (or with the initial weights
            when num_iters is 0)
        theta: the weight vector after num_iters updates
    """
    x = np.asarray(x, dtype=float)
    theta = np.asarray(theta, dtype=float)
    m = len(x)

    h = sigmoid(np.dot(x, theta))
    # Give the labels the same shape as the predictions so that `h - y` is
    # element-wise and can never broadcast into an m x m matrix.
    y = np.asarray(y, dtype=float).reshape(h.shape)

    def cost(h):
        # Cross-entropy cost for predictions h against labels y.
        return ((-1) / m) * (np.dot(y.T, np.log(h))
                             + np.dot((1 - y).T, np.log(1 - h)))

    # Computed up front so J is defined even when num_iters is 0.
    J = cost(h)
    for _ in range(num_iters):
        h = sigmoid(np.dot(x, theta))
        J = cost(h)

        # update the weights theta
        theta = theta - (alpha / m) * np.dot(x.T, h - y)

    # J may be a (1, 1) array; squeeze first because converting an array with
    # ndim > 0 straight to float is deprecated in recent NumPy versions.
    return float(np.squeeze(J)), theta

# Feature Extraction

In [None]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    """Turn one tweet into a (1, 3) feature row.
    Input:
        tweet: a string containing a tweet
        freqs: a dictionary mapping (word, label) pairs to counts
        process_tweet: function that turns the tweet into a list of words
    Output:
        x: a 1 x 3 array [bias, sum of (word, 1) counts, sum of (word, 0) counts]
    """
    positive_total = 0.0
    negative_total = 0.0
    for token in process_tweet(tweet):
        # pairs missing from freqs contribute nothing
        positive_total += freqs.get((token, 1), 0)
        negative_total += freqs.get((token, 0), 0)

    # the leading 1 is the bias term
    features = np.array([[1.0, positive_total, negative_total]])
    assert(features.shape == (1, 3))
    return features

In [None]:
# Training inputs are the raw tweet strings; the labels are wrapped in a
# one-column DataFrame named 'col'.
train_x = tweets
train_y = pd.DataFrame({'col':labels})

# Local test set: text and target of the rows sampled into `test` when the
# data was loaded.
test_x = test['text']
test_y = test['target']

# Define Prediction Function

In [None]:
def predict_tweet(tweet, freqs, theta):
    """Score a single tweet with the logistic-regression model.
    Input:
        tweet: a string containing a tweet
        freqs: a dictionary mapping (word, label) pairs to counts
        theta: the weight vector applied to the extracted features
    Output:
        y_pred: the sigmoid of the features multiplied by theta
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

Performance monitor

In [None]:
def test_logistic_regression(test_x, test_y, freqs, theta, predict_tweet=predict_tweet, metric="recall"):
    """Score the classifier on labelled tweets.
    Input:
        test_x: an iterable of tweets (strings)
        test_y: the matching labels (0 or 1), one per tweet
        freqs: a dictionary mapping (word, label) pairs to counts
        theta: the weight vector passed to predict_tweet
        predict_tweet: function used to score one tweet
        metric: which score to return - "recall" (the default, and the value
            this function has always returned), "precision" or "accuracy"
    Output:
        the requested score as a float; nan when its denominator is zero
    """
    if metric not in ("recall", "precision", "accuracy"):
        raise ValueError(f"unknown metric: {metric!r}")

    # A tweet is predicted positive when its score exceeds 0.5. squeeze lets
    # the comparison work whether the score is a scalar or a (1, 1) array.
    y_hat = np.array([
        1.0 if np.squeeze(predict_tweet(tweet, freqs, theta)) > 0.5 else 0.0
        for tweet in test_x
    ])
    y_true = np.ravel(test_y)
    if y_hat.shape != y_true.shape:
        raise ValueError("test_x and test_y must have the same length")

    if metric == "accuracy":
        hits = np.sum(y_hat == y_true)
        total = y_true.size
    else:
        # true positives: predicted 1 and labelled 1
        hits = np.sum((y_hat == 1) & (y_true == 1))
        if metric == "recall":
            total = np.sum(y_true == 1)
        else:
            total = np.sum(y_hat == 1)

    if total == 0:
        return float("nan")
    return float(hits / total)

In [None]:
# Scratch check: both comparisons give the 0/1 mask [1, 1], and the dot
# product of the two masks counts the positions where both are 1 (here 2).
np.dot(np.where((np.array([0,1])==np.array([0,1])),1,0), np.where(np.array([0,1])==np.array([0,1]),1,0))

In [None]:
# Scratch check: the right-hand array is the single value [0.1], which is
# broadcast against [0, 1]; neither element matches, so the result is [0, 0].
# NOTE(review): confirm whether [0, 1] was intended instead of [0.1].
np.where(np.array([0,1])==np.array([0.1]),1,0)

# Training the model

In [None]:
# Feature matrix with one (bias, positive count, negative count) row per tweet.
X = np.zeros((len(train_x), 3))
for i, tweet in enumerate(train_x):
    X[i] = extract_features(tweet, freqs)[0]

# Labels in the same row order as X.
Y = train_y

#for itr in [6000,8000,10000,12000]:
#    for alpha in [1e-8,1e-9]:
#        J, theta = gradientDescent(X, Y, np.zeros((3, 1)), alpha, itr)
        # print(f"The cost after training is {J:.8f}.")
        # print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
#        tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
#        print(f"With iteration set as {itr}. alpha set as {alpha}. The logistic regression model's accuracy = {tmp_accuracy:.4f}")

In [None]:
# Train from all-zero weights with learning rate 1e-08 for 6000 iterations;
# J is the cost returned by gradientDescent and theta the learned weights.
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-08,6000)

In [None]:
# Quick manual tests of the model; commented out because they are not needed

# for tweet in ['there is a fire', 'there is sandstorm', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
#    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))    

# my_tweet = 'alert, there is earthquake'
# predict_tweet(my_tweet, freqs, theta)

In [None]:
# The code below examines the predictions tweet by tweet; it is commented out because it is not needed

# print('Label Predicted Tweet')
# for x,y in zip(train_x,labels):
#    y_hat = predict_tweet(x, freqs, theta)

#    if np.abs(y - (y_hat > 0.5)) > 0:
#        print('THE TWEET IS:', x)
#        print('THE PROCESSED TWEET IS:', process_tweet(x))
#        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))

In [None]:
# Use below code to test the model

# Run one hand-written tweet through the pipeline and report the verdict.
my_tweet = 'Something is happening. I can see the flame from that building!'
print(process_tweet(my_tweet))

y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)

# A score above 0.5 is treated as a disaster tweet.
print('Potential disaster' if y_hat > 0.5 else 'Maybe not a disaster')

# Making predictions on the test data

In [None]:
# Predict a 0/1 label for every tweet in the competition test file.
y_hat_list = []
for x in validation['text']:
    y_hat = predict_tweet(x, freqs, theta)
    # predict_tweet returns a (1, 1) array here (a 1 x 3 feature row times the
    # 3 x 1 theta); squeeze it to a scalar before int(), because converting an
    # array with ndim > 0 to a Python scalar is deprecated in recent NumPy.
    y_hat_list.append(int(np.squeeze(y_hat) > 0.5))

In [None]:
# Start from the competition's sample submission and fill in our predictions.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
# Assign the list directly: the values are placed by position, and pandas
# raises if the number of predictions does not match the number of rows
# (assigning a DataFrame aligns on the index and fills mismatches with NaN).
sample_submission["target"] = y_hat_list
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission table as the cell output.
sample_submission