In [2]:
import numpy as np
import pandas as pd



In [3]:
# Load the labelled tweets from disk and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../dataset/sentiment_analysis.csv')
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

id       0
label    0
tweet    0
dtype: int64

## Text Preprocessing

In [5]:
import re
import string

Convert uppercase to lowercase

In [6]:
# Lowercase every whitespace-separated word; re-joining with single spaces
# also collapses any runs of whitespace in the tweet.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(word.lower() for word in tweet.split())
)
data["tweet"].head(n=5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove Links


In [7]:
# Strip URLs from each whitespace-separated token.
# The pattern is deliberately NOT anchored with ^: a URL glued to leading
# punctuation (e.g. "(https://..." or "via:http://...") must be removed too,
# otherwise the punctuation step later breaks it into junk words such as
# "https", "goo", "gl". Tokens contain no whitespace, so \S* consumes the
# rest of the token and no MULTILINE / newline handling is needed.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(re.sub(r'https?://\S*', '', token) for token in tweet.split())
)
data["tweet"].head(5)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove Punctuation


In [8]:
def remove_punctuations(text):
    """Return `text` with every ASCII punctuation character replaced by a space.

    Each character in `string.punctuation` is mapped to a single space, so the
    length of the string is unchanged and adjacent punctuation yields runs of
    spaces.
    """
    # One translation table maps each punctuation character to ' '.
    blanks = ' ' * len(string.punctuation)
    table = str.maketrans(string.punctuation, blanks)
    return text.translate(table)

# Replace punctuation in every tweet with spaces, then preview the result.
data["tweet"] = data["tweet"].map(remove_punctuations)
data["tweet"].head(n=5)

0     fingerprint  pregnancy test   android  apps  ...
1    finally a transparant silicon case    thanks t...
2    we love this  would you go   talk  makememorie...
3    i m wired i know i m george i was made that wa...
4    what amazing service  apple won t even talk to...
Name: tweet, dtype: object

Remove Numbers


In [9]:
# Delete every run of digits from the tweets, then preview the frame.
data["tweet"] = data["tweet"].str.replace(pat=r'\d+', repl='', regex=True)
data.head(n=5)


Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps ...
1,2,0,finally a transparant silicon case thanks t...
2,3,0,we love this would you go talk makememorie...
3,4,0,i m wired i know i m george i was made that wa...
4,5,1,what amazing service apple won t even talk to...


Remove Stopwords

In [10]:
import nltk

In [11]:
# Download the NLTK stopword corpus into the project's model directory.
# The directory name is lowercase to match the paths used when the stopword
# file is read and the vocabulary is saved; a differently-cased name points
# to a different directory on case-sensitive filesystems.
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../Static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [12]:
# Read the English stopword list, one word per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('../static/model/corpora/stopwords/english', 'r', encoding='utf-8') as file:
    sw = file.read().splitlines()

In [13]:
# Drop stopwords from every tweet.
# Membership is tested against a set built once, so each lookup is O(1)
# instead of a linear scan of the `sw` list for every word of every tweet.
sw_set = set(sw)
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in sw_set)
)

In [14]:
from nltk.stem import PorterStemmer
# Replace each word of every tweet with its Porter stem.
ps = PorterStemmer()
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(map(ps.stem, tweet.split()))
)

### Building Vocabulary

In [15]:
from collections import Counter
# Empty token-frequency counter; it is filled from the tweets in the next cell.
vocab = Counter()

In [16]:
# Count how often each token occurs across all tweets.
# The counter is rebuilt from scratch rather than updated in place, so
# re-running this cell cannot double-count tokens and silently change which
# words pass the frequency threshold below.
vocab = Counter(word for tweet in data["tweet"] for word in tweet.split())

In [17]:
# Keep only the tokens seen more than 10 times; show how many remain.
tokens = [word for word, count in vocab.items() if count > 10]
len(tokens)

1169

In [18]:

def save_vocabulary(lines, filename):
    """Write `lines` to `filename` as UTF-8 text, one entry per line.

    Entries are joined with a newline; no trailing newline is written.
    An existing file at `filename` is overwritten.

    Parameters
    ----------
    lines : iterable of str
        Vocabulary entries to store.
    filename : str or path-like
        Destination file.
    """
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w', encoding="utf-8") as file:
        file.write('\n'.join(lines))

# Persist the filtered vocabulary next to the other model artefacts.
save_vocabulary(lines=tokens, filename='../static/model/vocabulary.txt')

### Divide dataset into train and test sets

In [19]:
# Features are the preprocessed tweet text; targets are the labels.
X, y = data["tweet"], data["label"]

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split identical on every run, so results
# survive "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Vectorization


In [21]:
def vectorizer(ds, vocabulary):
    """Encode each sentence as a binary bag-of-words vector.

    Parameters
    ----------
    ds : iterable of str
        Sentences to encode; each is tokenised with ``str.split()``.
    vocabulary : sequence of str
        Tokens defining the columns of the output, in order.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(number of sentences, len(vocabulary))``.
        Entry ``[r, c]`` is 1 when ``vocabulary[c]`` is one of the
        whitespace-separated words of sentence ``r``, else 0.
    """
    rows = []
    for sentence in ds:
        # Split once per sentence and use a set so each vocabulary lookup is
        # O(1) instead of re-splitting and scanning the word list every time.
        words = set(sentence.split())
        rows.append([1.0 if token in words else 0.0 for token in vocabulary])

    if not rows:
        # Keep the result 2-D even when there are no sentences.
        return np.zeros((0, len(vocabulary)), dtype=np.float32)

    return np.asarray(rows, dtype=np.float32)

In [23]:
# Encode the training tweets against the saved vocabulary and show the matrix.
vectorized_x_train = vectorizer(ds=X_train, vocabulary=tokens)
vectorized_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [24]:
# Encode the held-out tweets with the same vocabulary as the training set.
vectorized_x_test = vectorizer(ds=X_test, vocabulary=tokens)

### Handle imbalanced dataset

In [29]:
from imblearn.over_sampling import SMOTE
# Oversample the training set with SMOTE; only the training data is resampled.
# A fixed random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state=42)
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9408, 1169) (9408,)
