[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/S2DSLondon/Aug20_FSA/blob/Anna-wip/NLP%20practice-Anna/NLTK_Classifier.ipynb)

In [1]:
import pandas as pd
import numpy as np
import random
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.tag import pos_tag
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re, string
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk import classify
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
#nltk.download('wordnet')

# The script is seperated into two parts

### 1) Train the Model

Setting:

    Divide train-test set
    Whether to save the classifier or not
    
Preprocessing

    Read training data
    Checking how many NaN
    Combine all words (location, keyword, text) into one column
    Seperate disaster and not disaster dataset
    Make positive/negative token lists using tweet tokenizer
    Lemmetize sentences
    Extracting the most frequent words for sentiment analyses
    Train Model
    Confusion Matrix
    Optional: Save the classifier for future use
    
### 2) Predict unlabeled data on the trained classifier above (or load existing classifier)

preprocess data
    
    classify
    save result

# 1) Train The Model

### Whether to save the classifier or not

In [2]:
save_classifier = True
filename = 'Naive_Bayes_NLTK_v1'

### Read training data
Create a seperate data

In [3]:
# Change this to the train file you want to read
id_name = 'train.csv'
raw_data = pd.read_csv(id_name)
data = raw_data
data.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


### Checking how many NaN

In [4]:
print(sum(pd.isnull(data.keyword)), sum(pd.isnull(data.location)))

61 2533


### Combine all words (location, keyword, text) into one column

In [5]:
keyword_filled = data.keyword.fillna('')
location_filled = data.location.fillna('')
data['all_words'] = data.text + ' ' + keyword_filled + ' ' + location_filled

### Seperate disaster and not disaster dataset

In [6]:
rating = pd.to_numeric(data['target'])
positive = data['all_words'][rating ==1] # Disaster = 1
negative = data['all_words'][rating ==0] # 
print('Disaster: ', len(positive), 'No Disaster: ', len(negative))

Disaster:  3271 No Disaster:  4342


### Make positive/negative token lists using tweet tokenizer

In [7]:
positive_tokens = list()
negative_tokens = list()

tweet_tokenizer = TweetTokenizer()

#tokenizer = RegexpTokenizer(r'\w+') # regular expression to take out the symbols

for pos_sentence  in positive:
    add_pos_sentence = tweet_tokenizer.tokenize(pos_sentence)
    positive_tokens.append(add_pos_sentence)
    
for neg_sentence in negative:
    add_neg_sentence = tweet_tokenizer.tokenize(neg_sentence)
    negative_tokens.append(add_neg_sentence)

### Lemmetize sentences

In [8]:
# add additional words to stop words
stop_words = stopwords.words('english')
add_words = ["...", "'"]
stop_words = stop_words + add_words

In [9]:
# Function to lemmitize and clean words
def cleaned_words (tokens, stop_words): # lemmatize sentence, omit punctuation and stop words
    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else: # all the rest tagged with a
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        word = lemmatizer.lemmatize(token, pos)
        
        if len(word) > 0 and word not in string.punctuation and word.lower() not in stop_words:
            cleaned_tokens.append(word.lower())
    return cleaned_tokens

In [10]:
positive_cleaned_tokens_list=list()
negative_cleaned_tokens_list=list()
for tokens in positive_tokens:
    positive_cleaned_tokens_list.append(cleaned_words(tokens, stop_words))

for tokens in negative_tokens:
    negative_cleaned_tokens_list.append(cleaned_words(tokens, stop_words))

In [11]:
print(negative_cleaned_tokens_list[:10])

[["what's", 'man'], ['love', 'fruit'], ['summer', 'lovely'], ['car', 'fast'], ['goooooooaaaaaal'], ['ridiculous'], ['london', 'cool', ';)'], ['love', 'skiing'], ['wonderful', 'day'], ['looooool']]


### Extracting the most frequent words for own understanding

In [12]:
# Get word frequecy

def get_all_words(cleaned_tokens_list): 
    all_words = []
    for tokens in cleaned_tokens_list:
        for token in tokens:
            all_words.append(token)
    return all_words

all_neg_words = get_all_words(negative_cleaned_tokens_list)
all_pos_words = get_all_words(positive_cleaned_tokens_list)

from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

print(freq_dist_pos.most_common(10))
print(freq_dist_neg.most_common(10))

[('\x89', 383), ('fire', 291), ('suicide', 204), ('û_', 171), ('bomb', 164), ('california', 147), ('crash', 143), ('flood', 142), ('kill', 141), ('building', 137)]
[('\x89', 442), ('new', 301), ('get', 298), ('like', 295), ('body', 216), ("i'm", 207), ('go', 190), ('û_', 171), ('scream', 152), ('wreck', 141)]


In [13]:
# Produce dictionary (in generator) for NLTK Naive Bayes Classifier, as it only takes dictionary

def get_dict_for_model(cleaned_tokens_list):
    for list_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in list_tokens)

positive_tokens_for_model = get_dict_for_model(positive_cleaned_tokens_list) 
negative_tokens_for_model = get_dict_for_model(negative_cleaned_tokens_list) 
# Convert generator into list
pos_dataset = [(dict_word, 1) 
               for dict_word in positive_tokens_for_model]
neg_dataset = [(dict_word, 0) 
               for dict_word in negative_tokens_for_model]

In [14]:
# Shuffling the dataset, and seperate into train and test(verification) set
all_set = pos_dataset + neg_dataset
random.shuffle(all_set)
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(all_set, test_size=0.33, random_state=42)

### Train Model

In [15]:
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.labels())
print("Accuracy is:", classify.accuracy(classifier, test_set))
print(classifier.show_most_informative_features(10))

[0, 1]
Accuracy is: 0.7711898129725427
Most Informative Features
               hiroshima = True                1 : 0      =     38.9 : 1.0
                 typhoon = True                1 : 0      =     29.1 : 1.0
                wildfire = True                1 : 0      =     28.7 : 1.0
                      70 = True                1 : 0      =     27.3 : 1.0
                outbreak = True                1 : 0      =     23.7 : 1.0
                   spill = True                1 : 0      =     23.7 : 1.0
                      40 = True                1 : 0      =     21.9 : 1.0
                 20spill = True                1 : 0      =     21.9 : 1.0
                 suicide = True                1 : 0      =     21.0 : 1.0
None


### Generate Confusion Matrix

In [16]:
from nltk.metrics import ConfusionMatrix
test_tag = [tag[0] for tag in test_set]
test_label = [sent[1] for sent in test_set]
model_label = classifier.classify_many(test_tag)
cm = ConfusionMatrix(test_label, model_label)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

  |      0      1 |
--+---------------+
0 | <43.7%> 12.7% |
1 |  10.1% <33.4%>|
--+---------------+
(row = reference; col = test)



### Optional: Save the classifier for future use

In [17]:
if save_classifier == True:
    import pickle
    seperator = ''
    filename = seperator.join([filename, '.pickle'])
    f = open(filename, 'wb')
    pickle.dump(classifier, f)
    f.close()

# 2) Predict unlabeled data on the trained classifier above (or load existing classifier

In [18]:
def import_classifier(classifier):
    import pickle
    f = open(classifier, 'rb')
    classifier = pickle.load(f)
    return classifier
    #f.close()

In [19]:
# Change this to the review file you want to read
#classifier_id = 'Naive_Bayes_Classifier'
id_name = 'test.csv'
test_data = pd.read_csv(id_name)
#import_classifier(classifier_id)

### Preprocess unlabeled test data the same way as we did for the training data
1) combine all words

2) get tokenized words

3) make dictionary (without label this time)

In [20]:
keyword_filled = test_data.keyword.fillna('')
location_filled = test_data.location.fillna('')
test_data['all_words'] = test_data.text + ' ' + keyword_filled + ' ' + location_filled
test_token=list()
cleaned_tokens_list = list()

for sent in test_data['all_words']:
    add_sent = tweet_tokenizer.tokenize(sent)
    test_token.append(add_sent)
for tokens in test_token:
    cleaned_tokens_list.append(cleaned_words(tokens, stop_words))

tokens_for_model = get_dict_for_model(cleaned_tokens_list) 
test_set = [dict_word for dict_word in tokens_for_model]


### Apply classifier to the unlabeled data

In [21]:
test_tag = [tag for tag in test_set]
result = pd.DataFrame({'id': test_data.id})
result['target'] = classifier.classify_many(test_set)

### Save result

In [22]:
result.to_csv('result_NLTK.csv', index = False)