# Import Python Libraries

In [31]:
import pandas as pd

# NLTK is our Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report



# Import our metrics to evaluate our model
from sklearn import metrics


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK resources needed for cleaning text
for resource in ('averaged_perceptron_tagger', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# From here on `stopwords` is the list of English stop words; it replaces
# the corpus reader imported at the top of this cell.
stopwords = stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/edgar/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /Users/edgar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/edgar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/edgar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Read the data

In [14]:
# Load the training data from the CSV file into a DataFrame.
df = pd.read_csv('data/training.csv')
# Print (rows, columns), then show the first few rows as the cell output.
print(df.shape)
df.head()

(16000, 2)


Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


# Check For Nulls And Dupes

In [9]:
# Count the missing values in each column.
print(df.isnull().sum())

text     0
label    0
dtype: int64


In [10]:
# Count the rows that repeat an earlier row exactly.
print(df.duplicated().sum())

1


In [15]:
# Drop duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [16]:
# Confirm that no duplicate rows remain (expected count: 0).
print(df.duplicated().sum())

0


# Value Counts for Labels
0 - Sadness, 
1 - Joy, 
2 - Love, 
3 - Anger, 
4 - Fear, 
5 - Surprise

In [18]:
# Check the class balance: number of rows per label, without sorting by frequency.
df.label.value_counts(sort = False)

0    4666
1    5361
2    1304
3    2159
4    1937
5     572
Name: label, dtype: int64

# Feature Engineering
1. Lowercase all words
2. Remove punctuation
3. Remove stopwords
4. Stem words

In [28]:
# Step 1: normalise case so that "CAPS" and "caps" become the same token.
test_string = 'This is A SENTENCE with LOTS OF CAPS.'


def make_lower(a_string):
    """Return a copy of ``a_string`` with every character lowercased."""
    lowered = a_string.lower()
    return lowered


make_lower(test_string)

'this is a sentence with lots of caps.'

In [29]:
# Remove all punctuation
test_string = 'This is a sentence! 50 With lots of punctuation??? & other #things.'


def remove_punc(a_string):
    """Delete punctuation and symbols from ``a_string``.

    Every character that is not a letter, digit or whitespace is removed.
    Whitespace itself is left untouched, so a symbol that stood between two
    spaces leaves a double space behind.

    Parameters
    ----------
    a_string : str
        Text to clean.

    Returns
    -------
    str
        The text with punctuation characters removed.
    """
    # The regex word class also matches the underscore, so a plain
    # "not word, not whitespace" class would let "_" through. It is listed
    # explicitly so that it is removed like every other punctuation mark.
    return re.sub(r'[^\w\s]|_', '', a_string)


remove_punc(test_string)

'This is a sentence 50 With lots of punctuation  other things'

In [22]:
# Stem every word of a sentence with the Porter stemmer.
porter = PorterStemmer()
test_string = 'I played and started playing with players and we all love to play with plays'


def stem_words(a_string):
    """Tokenize ``a_string``, stem each token, and re-join with single spaces."""
    stems = [porter.stem(token) for token in word_tokenize(a_string)]
    return ' '.join(stems)


stem_words(test_string)

'I play and start play with player and we all love to play with play'

In [32]:
# Lemmatize words with pos-tags

def convert_pos(pos):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with 'V', 'J' or 'R' map to verb, adjective and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_table = (
        ('V', wordnet.VERB),
        ('J', wordnet.ADJ),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN


def lem_with_pos_tag(a_string):
    """Lemmatize every token of ``a_string`` using its part-of-speech tag.

    The text is tokenized and POS-tagged, each tag is converted with
    ``convert_pos``, and the resulting lemmas are joined with single spaces.
    """
    lemmer = WordNetLemmatizer()

    # pos_tag yields (token, tag) pairs for the tokenized sentence.
    tagged_tokens = nltk.pos_tag(word_tokenize(a_string))

    lemmas = [
        lemmer.lemmatize(token, convert_pos(tag))
        for token, tag in tagged_tokens
    ]
    return ' '.join(lemmas)

# Example sentences for trying out the lemmatizer; only the second one is run here.
a_sentence = 'I played and started playing with players and we all love to play with plays'
another_sentence = 'This is because she wanted to go outside with her friends and play basketball.'
lem_with_pos_tag(another_sentence)

'This be because she want to go outside with her friend and play basketball .'

# Create a pipeline to put text through

In [42]:
# Pipelines that chain the cleaning functions together
test_string = 'I felt so happy when we won the championship!'


def apply_no_nlp(a_string):
    """Lowercase ``a_string`` and strip punctuation; no stemming or lemmatization."""
    return remove_punc(make_lower(a_string))


apply_no_nlp(test_string)


'i felt so happy when we won the championship'

In [43]:
def apply_stem(a_string):
    """Lowercase ``a_string``, strip punctuation, then stem every word."""
    cleaned = remove_punc(make_lower(a_string))
    return stem_words(cleaned)


apply_stem(test_string)


'i felt so happi when we won the championship'

In [44]:
def apply_lem(a_string):
    """Lowercase ``a_string``, strip punctuation, then lemmatize with POS tags."""
    cleaned = remove_punc(make_lower(a_string))
    return lem_with_pos_tag(cleaned)


apply_lem(test_string)

'i felt so happy when we win the championship'