# Imports

In [1]:
import nltk
from   nltk.corpus import stopwords
from   nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import os

import pandas as pd
import regex  as re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the tweet dataset into a DataFrame; the path is relative to the
# notebook's working directory.
tt = pd.read_csv('./archive/tweets_data.csv')

In [3]:
# Display the first rows to inspect the columns of the freshly loaded frame.
tt.head()

Unnamed: 0.1,Unnamed: 0,id,label,tweet
0,0,1,0,@user when a father is dysfunctional and is s...
1,1,2,0,@user @user thanks for #lyft credit i can't us...
2,2,3,0,bihday your majesty
3,3,4,0,#model i love u take with u all the time in ...
4,4,5,0,factsguide: society now #motivation


Labels: <br>
    0 = Positive sentiment <br>
    1 = Negative sentiment

# Remoção de colunas não utilizadas

In [4]:
# Drop the columns that are not used in the rest of the notebook.
# The result is assigned (instead of mutating with inplace=True) and
# errors='ignore' is passed so that this cell is idempotent: running it a
# second time, when the columns are already gone, is a no-op rather than a
# KeyError.
tt = tt.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

In [5]:
# Display the first rows again to confirm the column drop above.
tt.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


### Lower Casing

In [6]:
# Lower-case every tweet so that later matching does not depend on case.
tt['tweet'] = tt['tweet'].str.lower()

### Remoção de mentions e hashtags

In [7]:
def remove_mentions(sentence):
    """Return ``sentence`` with every ``@``-prefixed token removed.

    A match starts at an ``@`` and extends over all following non-whitespace
    characters, so the whitespace around a removed mention is left in place.

    Args:
        sentence: The text to clean (a string).

    Returns:
        The text without the matched ``@...`` runs.
    """
    return re.sub(r"@\S+", "", sentence)

def remove_hashtags(sentence):
    """Return ``sentence`` with every ``#`` character deleted.

    Only the hash sign itself is removed; the word that followed it stays in
    the text.

    Args:
        sentence: The text to clean (a string).

    Returns:
        The text without any ``#`` characters.
    """
    # A plain substring replacement is equivalent to substituting the literal
    # one-character pattern "#".
    return sentence.replace("#", "")

In [8]:
# Strip @-mentions first, then the '#' signs, in a single chained expression.
tt['tweet'] = (
    tt['tweet']
    .apply(remove_mentions)
    .apply(remove_hashtags)
)

In [9]:
# Display the first rows to check the result of the mention / '#' removal.
tt.head()

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for lyft credit i can't use cause the...
2,0,bihday your majesty
3,0,model i love u take with u all the time in u...
4,0,factsguide: society now motivation


### Remoção de pontuação 

In [10]:
def remove_punctuation(sentence):
    """Return ``sentence`` without the characters ``. ? , ! : ;``.

    Any other character (apostrophes, hyphens, brackets, ...) is kept as is.

    Args:
        sentence: The text to clean (a string).

    Returns:
        The text with the six punctuation characters deleted.
    """
    # The third argument of str.maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', '.?,!:;')
    return sentence.translate(deletion_table)

In [11]:
# Run the punctuation filter over every tweet.
tt['tweet'] = tt['tweet'].apply(remove_punctuation)

### Remoção de Stopwords

In [12]:
# English stop-word list from NLTK's corpus reader, loaded once and looked up
# by remove_stopwords.
# NOTE(review): presumably requires the 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm on a fresh environment.
stopwordz = stopwords.words('english')

In [13]:
def remove_stopwords(sentence):
    """Return ``sentence`` without the words contained in ``stopwordz``.

    The text is split on whitespace, every token found in the module-level
    ``stopwordz`` collection is discarded, and the remaining tokens are joined
    back with single spaces.

    Args:
        sentence: The text to filter (a string).

    Returns:
        The remaining words separated by single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopwordz, sentence.split())
    return ' '.join(kept_tokens)

In [14]:
# Filter the stop words out of every tweet.
tt['tweet'] = tt['tweet'].apply(remove_stopwords)

In [15]:
# Display the first rows to check the text after punctuation and stop-word removal.
tt.head()

Unnamed: 0,label,tweet
0,0,father dysfunctional selfish drags kids dysfun...
1,0,thanks lyft credit can't use cause offer wheel...
2,0,bihday majesty
3,0,model love u take u time urð± ðððð...
4,0,factsguide society motivation


### Remove Hyperlinks

In [16]:
def remove_hyperlink(sentence):
    """Return ``sentence`` with every run that starts with ``http`` removed.

    A match is the literal ``http`` followed by at least one non-whitespace
    character, up to the next whitespace. Ordinary words that merely start
    with ``http`` are therefore removed as well.

    Args:
        sentence: The text to clean (a string).

    Returns:
        The text without the matched runs; surrounding whitespace is kept.
    """
    without_links = re.sub(r"http\S+", "", sentence)
    return without_links

In [17]:
# Remove the http... runs from every tweet.
tt['tweet'] = tt['tweet'].apply(remove_hyperlink)

### Tokenizing 

In [18]:
# Replace each tweet string with the output of NLTK's word_tokenize.
# NOTE(review): this overwrites the column with the tokenizer output, so
# re-running this cell on its own would feed that output back into
# word_tokenize — presumably it only works once per load; confirm.
tt['tweet'] = tt.tweet.apply(word_tokenize)

### Stemming 

In [19]:
def stem_words(sentence):
    """Stem every token of ``sentence`` and join the stems with spaces.

    Args:
        sentence: An iterable of word tokens (it is iterated item by item, so
            passing a plain string would stem it character by character).

    Returns:
        A single string containing the Porter stem of each token, separated
        by single spaces.
    """
    porter = PorterStemmer()
    stems = map(porter.stem, sentence)
    return ' '.join(stems)

In [20]:
# Stem the tokens of every tweet; stem_words joins them back into one string.
tt['tweet'] = tt['tweet'].apply(stem_words)

In [21]:
# Display the first rows to check the stemmed text.
tt.head()

Unnamed: 0,label,tweet
0,0,father dysfunct selfish drag kid dysfunct run
1,0,thank lyft credit ca n't use caus offer wheelc...
2,0,bihday majesti
3,0,model love u take u time urð± ðððð...
4,0,factsguid societi motiv


## Embedding

In [40]:
# Bag-of-words vectorizer: vocabulary capped at 100000 terms, and terms must
# occur in at least 2 documents to be kept.
bow = CountVectorizer(
    max_features=100000,
    min_df=2,
)

In [41]:
# Learn the vocabulary from all tweets, build the document-term count matrix
# and convert it to a dense array with .toarray().
# NOTE(review): the reported shape is (31962, 12723), i.e. roughly 4e8 cells;
# as a dense array that is several GB if the counts are stored as 64-bit
# integers — confirm it fits in memory, or keep the sparse matrix if the
# downstream model accepts it.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test tweets contribute to the vocabulary — consider
# fitting on the training part only.
embedded_input = bow.fit_transform(tt.tweet).toarray()

In [42]:
# (rows, columns) of the dense matrix produced by the vectorizer.
embedded_input.shape

(31962, 12723)

## Feature Engineering

In [43]:
# Feature matrix (bag-of-words counts) and target vector (the label column).
X, y = embedded_input, tt['label']

In [None]:
# Hold out 20% of the rows for testing.
# train_test_split returns the splits grouped per input array, i.e.
# (X_train, X_test, y_train, y_test) — the names must be unpacked in exactly
# this order, otherwise features and labels end up bound to the wrong names.
# random_state pins the shuffle so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)