## Sentiment Analysis
We'll use a set of labelled tweets to train a decision tree model that classifies each tweet as positive or negative.

In [1]:
import pandas as pd 


# Load the labelled tweets; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which usually means the
# CSV was written with its index included -- consider index_col=0 if that column is
# not needed; confirm against the file before changing.
df = pd.read_csv('./data/tweets.csv')

# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,Tweet,Sentiment
0,Space missions help us learn more about the un...,positive
1,They advance scientific knowledge and improve ...,positive
2,Space missions help to inspire the next genera...,positive
3,They provide opportunities for international c...,positive
4,Space missions help us to develop new technolo...,positive


In [3]:
# Class balance check: number of tweets per sentiment label.
df['Sentiment'].value_counts()

Sentiment
positive    31
negative    18
Name: count, dtype: int64

In [4]:
# Preprocessing: lower-case each tweet and strip ASCII punctuation.
import string

# Translation table that maps every character in string.punctuation to None,
# so str.translate deletes them in a single pass over the text.
punctuation_table = str.maketrans('', '', string.punctuation)

# One cleaned string per row of df['Tweet'], in the same order.
# A comprehension keeps loop variables out of the notebook's global namespace.
cleaned_tweet = [
    tweet.lower().translate(punctuation_table)
    for tweet in df['Tweet']
]

# Store the cleaned text next to the original so both remain available.
df['tweets'] = cleaned_tweet




In [5]:
# Check that the cleaned 'tweets' column now sits alongside the original 'Tweet' text.
df.head()

Unnamed: 0,Tweet,Sentiment,tweets
0,Space missions help us learn more about the un...,positive,space missions help us learn more about the un...
1,They advance scientific knowledge and improve ...,positive,they advance scientific knowledge and improve ...
2,Space missions help to inspire the next genera...,positive,space missions help to inspire the next genera...
3,They provide opportunities for international c...,positive,they provide opportunities for international c...
4,Space missions help us to develop new technolo...,positive,space missions help us to develop new technolo...


In [10]:
# Tokenization and POS tagging (lemmatization is handled in a later cell).
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# NOTE(review): word_tokenize / pos_tag rely on NLTK data packages being
# installed; no nltk.download(...) call is visible in this notebook -- confirm
# the environment provides them.

# For every cleaned tweet: split it into word tokens, then tag each token.
# Each entry is a list of (token, tag) pairs, in the same order as df's rows.
pos_tagged_tweets = [
    pos_tag(word_tokenize(cleaned_text))
    for cleaned_text in df['tweets']
]

df['pos_tags'] = pos_tagged_tweets

In [15]:
##lemmatization with pos_tags
def wordnet_pos(tag):
    """Map a POS tag string to the one-letter POS code given to the lemmatizer.

    The decision is made on the tag's leading character:
    'J' -> 'a' (adjective), 'V' -> 'v' (verb), 'R' -> 'r' (adverb),
    'N' -> 'n' (noun).

    Args:
        tag: POS tag string, e.g. the second element of a (word, tag) pair.

    Returns:
        One of 'a', 'v', 'r', 'n'. Any tag that matches none of the prefixes
        above (including an empty string) yields 'r'.
    """
    prefix_to_code = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('R', 'r'),  # adverb
        ('N', 'n'),  # noun
    )
    for prefix, code in prefix_to_code:
        if tag.startswith(prefix):
            return code

    # No recognised prefix: fall back to 'r', the same code as the adverb branch.
    # NOTE(review): presumably chosen so words with other tags pass through the
    # lemmatizer unchanged -- confirm this is intentional.
    return 'r'

# Build one lemmatizer and reuse it for every token.
lemmatizer = WordNetLemmatizer()

# For each tweet, lemmatize every (word, tag) pair using the POS code that
# wordnet_pos derives from the tag; the result is one list of lemmas per tweet.
lem_tweets = [
    [lemmatizer.lemmatize(token, wordnet_pos(pos)) for token, pos in tagged_tweet]
    for tagged_tweet in df['pos_tags']
]

df['lemmatized_pos'] = lem_tweets


    

In [14]:
# Preview the lemma lists produced for the first few tweets.
df['lemmatized_pos'].head()

0    [space, mission, help, us, learn, more, about,...
1    [they, advance, scientific, knowledge, and, im...
2    [space, mission, help, to, inspire, the, next,...
3    [they, provide, opportunity, for, internationa...
4    [space, mission, help, us, to, develop, new, t...
Name: lemmatized_pos, dtype: object