In [11]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#Data Preprocessing and Feature Engineering
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
from nltk.tokenize import word_tokenize
from string import punctuation  
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string


#Model Selection and Validation
from nltk import classify
from nltk import NaiveBayesClassifier

from nltk.metrics import precision, recall, f_measure, ConfusionMatrix




In [2]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [175]:
data = pd.read_csv('train.csv')

In [5]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Preprocessing

In [106]:
def preprocessed_tweet(tweet):
    '''
    Removes unsolicited symbols, stop words
    Completes stemming 
    '''
    stopwords = set(nltk.corpus.stopwords.words('english') + ['AT_USER','URL'])
    
    tweet = ' '.join(word.strip(string.punctuation) for word in tweet.split())
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www[^\s]+)|(http[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'', tweet) # remove the # in #hashtag
    tweet = word_tokenize(tweet)
    
    ps = PorterStemmer()
    clean_tweets = []
    for word in tweet:
        if (word not in stopwords and # remove stopwords
            word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = ps.stem(word) # stemming word
            clean_tweets.append(stem_word)
    return clean_tweets

# Naive Byas Classifier without additional features

### Cteate the test and train sets

In [107]:
def bag_of_words(tweet):
    '''
    Create a bag of words
    '''
    words = preprocessed_tweet(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [108]:
def train_test_split(data):
    true_data = data[data.target == 1]
    true_data = true_data.text

    false_data = data[data.target == 0]
    false_data = false_data.text

    true_tweets_set = []
    for tweet in true_data:
        true_tweets_set.append((bag_of_words(tweet), 1))

    false_tweets_set = []
    for tweet in false_data:
        false_tweets_set.append((bag_of_words(tweet), 0))
        
    shuffle(true_tweets_set)
    shuffle(false_tweets_set)

    false_tweets_set = false_tweets_set[:3271]

    size = int(0.2 * len(false_tweets_set))

    test_set = true_tweets_set[:size] + false_tweets_set[:size]
    train_set = true_tweets_set[size:] + false_tweets_set[size:]
    return test_set, train_set


In [109]:
test_set, train_set = train_test_split(data)

### Train the classifier. 

In [110]:
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) 
 
print (classifier.show_most_informative_features(10)) 

0.7691131498470948
Most Informative Features
               hiroshima = True                1 : 0      =     40.3 : 1.0
                  bomber = True                1 : 0      =     31.0 : 1.0
                    atom = True                1 : 0      =     28.3 : 1.0
                 wildfir = True                1 : 0      =     28.2 : 1.0
                   sever = True                1 : 0      =     27.7 : 1.0
                 suspect = True                1 : 0      =     23.7 : 1.0
                 typhoon = True                1 : 0      =     23.7 : 1.0
                outbreak = True                1 : 0      =     21.7 : 1.0
                      70 = True                1 : 0      =     20.3 : 1.0
                 airport = True                1 : 0      =     18.3 : 1.0
None


### Metrics of the classifier. 

In [113]:
def show_metrics(test_set):
    actual_set = defaultdict(set)
    predicted_set = defaultdict(set)
 
    actual_set_cm = []
    predicted_set_cm = []
 
    for index, (feature, actual_label) in enumerate(test_set):
        actual_set[actual_label].add(index)
        actual_set_cm.append(actual_label)
 
        predicted_label = classifier.classify(feature)
 
        predicted_set[predicted_label].add(index)
        predicted_set_cm.append(predicted_label)
    

    print ('Metrics for class True')
    print ('true precision: ', precision(actual_set[1], predicted_set[1]))
    print ('true recall:', recall(actual_set[1], predicted_set[1]))
    print ('true F-measure:', f_measure(actual_set[1], predicted_set[1]))

    print ('\nMetrics for class False ')
    print ('false precision:', precision(actual_set[0], predicted_set[0]))
    print ('false recall:', recall(actual_set[0], predicted_set[0])) 
    print ('false F-measure:', f_measure(actual_set[0], predicted_set[0])) 
    print ('\nConfusion Matrix ')
    cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
    print (cm)

In [114]:
show_metrics(test_set)

Metrics for class True
true precision:  0.7658610271903323
true recall: 0.7752293577981652
true F-measure: 0.770516717325228

Metrics for class False 
false precision: 0.7724458204334366
false recall: 0.7629969418960245
false F-measure: 0.7676923076923077

Confusion Matrix 
  |   0   1 |
--+---------+
0 |<499>155 |
1 | 147<507>|
--+---------+
(row = reference; col = test)



The final F-scores are equal to 0.77 and 0.76 for the classes True and False respectively.

## Evaluation 

In the previous part during the preprocessing symbol of # and links were removed. Now the model will be runed without removing this features. 

In [116]:
def preprocessed_tweet_evoluated(tweet):
    '''
    Removes unsolicited symbols, stop words
    Completes stemming 
    '''
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    tweet = ' '.join(word.strip(string.punctuation) for word in tweet.split())
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www[^\s]+)|(http[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r' hashtag', tweet) # remove the # in #hashtag
    tweet = word_tokenize(tweet)
    
    ps = PorterStemmer()
    clean_tweets = []
    for word in tweet:
        if (word not in stopwords and # remove stopwords
            word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = ps.stem(word) # stemming word
            clean_tweets.append(stem_word)
            
    return clean_tweets

In [117]:
def bag_of_words_evoluated(tweet):
    '''
    Create a bag of words
    '''
    words = preprocessed_tweet_evoluated(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [118]:
def train_test_split_evoluated(data):
    true_data = data[data.target == 1]
    true_data = true_data.text

    false_data = data[data.target == 0]
    false_data = false_data.text

    true_tweets_set = []
    for tweet in true_data:
        true_tweets_set.append((bag_of_words_evoluated(tweet), 1))

    false_tweets_set = []
    for tweet in false_data:
        false_tweets_set.append((bag_of_words_evoluated(tweet), 0))
        
    shuffle(true_tweets_set)
    shuffle(false_tweets_set)

    false_tweets_set = false_tweets_set[:3271]

    size = int(0.2 * len(false_tweets_set))

    test_set = true_tweets_set[:size] + false_tweets_set[:size]
    train_set = true_tweets_set[size:] + false_tweets_set[size:]
    return test_set, train_set


In [128]:
ev_test_set, ev_train_set = train_test_split_evoluated(data)

In [129]:
ev_classifier = NaiveBayesClassifier.train(ev_train_set)
 
accuracy = classify.accuracy(ev_classifier, ev_test_set)
print(accuracy) 
 
print (ev_classifier.show_most_informative_features(10)) 

0.7675840978593272
Most Informative Features
               hiroshima = True                1 : 0      =     47.0 : 1.0
                 wildfir = True                1 : 0      =     43.7 : 1.0
                northern = True                1 : 0      =     38.3 : 1.0
                    atom = True                1 : 0      =     31.0 : 1.0
                   sever = True                1 : 0      =     28.3 : 1.0
                 typhoon = True                1 : 0      =     25.0 : 1.0
                 wreckag = True                1 : 0      =     23.0 : 1.0
                outbreak = True                1 : 0      =     21.0 : 1.0
                 confirm = True                1 : 0      =     19.8 : 1.0
             anniversari = True                1 : 0      =     19.7 : 1.0
None


In [130]:
show_metrics(ev_test_set)

Metrics for class True
true precision:  0.8770614692653673
true recall: 0.8944954128440367
true F-measure: 0.8856926570779713

Metrics for class False 
false precision: 0.8923556942277691
false recall: 0.8746177370030581
false F-measure: 0.8833976833976833

Confusion Matrix 
  |   0   1 |
--+---------+
0 |<572> 82 |
1 |  69<585>|
--+---------+
(row = reference; col = test)



F-scores have significantly improved. Now Let's evaluate it more and add the location.

# More evaluation 

Add location to the text, so it will appear in the bag of words. 

In [152]:
pd.set_option('display.max_colwidth', 100)
text = []
for i in range(0, len(data)):
    tweet = (data.loc[i].text + " " + str(data.loc[i].location) + " " + str(data.loc[i].keyword))
    text.append(tweet)

In [153]:
data1 = data
data1.text = text

In [154]:
data1.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all nan nan,1
1,4,,,Forest fire near La Ronge Sask. Canada nan nan,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California nan nan",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school nan nan,1


In [155]:
loc_ev_test_set, loc_ev_train_set = train_test_split_evoluated(data1)

loc_ev_classifier = NaiveBayesClassifier.train(loc_ev_train_set)
 
accuracy = classify.accuracy(loc_ev_classifier, loc_ev_test_set)
print(accuracy) 
 
print (loc_ev_classifier.show_most_informative_features(10)) 

0.7698776758409785
Most Informative Features
                 wildfir = True                1 : 0      =     43.7 : 1.0
                  bomber = True                1 : 0      =     31.7 : 1.0
                   sever = True                1 : 0      =     30.3 : 1.0
                 suspect = True                1 : 0      =     23.7 : 1.0
                 wreckag = True                1 : 0      =     23.7 : 1.0
                 typhoon = True                1 : 0      =     23.7 : 1.0
                    raze = True                1 : 0      =     20.3 : 1.0
                 rescuer = True                1 : 0      =     19.7 : 1.0
                    atom = True                1 : 0      =     17.8 : 1.0
                20bomber = True                1 : 0      =     17.7 : 1.0
None


In [156]:
show_metrics(loc_ev_test_set)

Metrics for class True
true precision:  0.8674698795180723
true recall: 0.8807339449541285
true F-measure: 0.8740515933232171

Metrics for class False 
false precision: 0.8788819875776398
false recall: 0.8654434250764526
false F-measure: 0.8721109399075502

Confusion Matrix 
  |   0   1 |
--+---------+
0 |<566> 88 |
1 |  78<576>|
--+---------+
(row = reference; col = test)



# Results

This practical work aimed to train and test Naive Bayesian Classifier. There were two phases of evaluation of the model.
Text in the first model did not contain any symbols, including hashtags and links. Accuracy was 0.769. F-score for class True was 0.77, for class False 0.76. 

For the second model hashtags and links were replaced with words hashtag and URL respectively. New accuracy draw up 0.767. F-score improved significantly. F-score for class True was 0.88, for class False 0.88. 

For the last evaluation, the location and keywords were added. After that F-scores decrease. Keywords and location made the model more complex, however, they did not contain any useful information for the target. 