In [37]:
import numpy as np
import pandas as pd

In [38]:
# header=None: the first row of each CSV is treated as data, so pandas labels
# the columns with integers (0..3 in the preview below).
train_data = pd.read_csv('datasets/twitter_training.csv', header=None)
validation_data = pd.read_csv('datasets/twitter_validation.csv', header=None)
train_data

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


# Data cleaning

In [39]:
# Columns 0 and 1 (a numeric id and a topic name in the preview output) are
# discarded; only the label column (2) and the text column (3) are kept.
train_data = train_data.drop(columns=[0, 1])
train_data.describe()

Unnamed: 0,2,3
count,74682,73996
unique,4,69491
top,Negative,"At the same time, despite the fact that there ..."
freq,22542,172


In [40]:
# Replace the integer column labels with descriptive names, then count the
# missing values per column.
train_data = train_data.rename(
    columns={
        2: 'Sentiment',
        3: 'Tweet',
    }
)
train_data.isna().sum()

Sentiment      0
Tweet        686
dtype: int64

In [41]:
# One non-mutating pipeline instead of an in-place dropna followed by two
# rebinds: every step returns a new frame, so no DataFrame object is modified
# behind the reader's back.
train_data = (
    train_data
    .dropna()                                         # rows with a missing tweet
    .loc[lambda df: df['Sentiment'] != 'Irrelevant']  # keep the three sentiment classes
    .drop_duplicates(keep='first')                    # identical (Sentiment, Tweet) pairs
)
# The original row labels are kept (the index is not reset), so gaps in the
# index are expected after filtering.
train_data

Unnamed: 0,Sentiment,Tweet
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [42]:
# NOTE(review): X_train is taken from the 'Tweet' column before the later
# cells reassign train_data['Tweet'] (cleaning, tokenization, lemmatization).
# Whether this Series reflects those reassignments depends on the pandas
# version -- confirm X_train is rebuilt after preprocessing before it is used.
X_train = train_data['Tweet']

# One indicator column per sentiment label.
Y_train = pd.get_dummies(train_data['Sentiment'])

In [43]:
# Column-wise sum of the indicator columns = number of tweets per sentiment
# class; used here to eyeball class balance.
Y_train.sum()

Negative    21237
Neutral     17110
Positive    19138
dtype: int64

# Data Preprocessing
- lower case
- removing punctuation
- removing all single-letter words

In [44]:
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Built once instead of once per tweet: the translation table deletes every
# character in string.punctuation, and the pattern matches a lone ASCII letter
# standing between two word boundaries.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
SINGLE_LETTER_RE = re.compile(r"\b[a-zA-Z]\b")


def clean_tweet(text):
    """Lower-case ``text``, strip punctuation, and delete single-letter words.

    Parameters
    ----------
    text : str
        One raw tweet.

    Returns
    -------
    str
        The normalised tweet. Only the letter itself is deleted, so the
        whitespace around a removed letter stays (e.g. "and i will" becomes
        "and  will"). Digits are not matched by the pattern and are kept.
    """
    without_punctuation = text.lower().translate(PUNCTUATION_TABLE)
    return SINGLE_LETTER_RE.sub("", without_punctuation)


# Single pass over the column; the result is the same as applying the three
# steps one after another.
train_data['Tweet'] = [clean_tweet(tweet) for tweet in train_data['Tweet']]
train_data['Tweet']

0        im getting on borderlands and  will murder you...
1          am coming to the borders and  will kill you all
2         im getting on borderlands and  will kill you all
3        im coming on borderlands and  will murder you all
4        im getting on borderlands 2 and  will murder y...
                               ...                        
74677    just realized that the windows partition of my...
74678    just realized that my mac window partition is ...
74679    just realized the windows partition of my mac ...
74680    just realized between the windows partition of...
74681    just like the windows partition of my mac is l...
Name: Tweet, Length: 57485, dtype: object

## Tokenization and removing stopwords

In [45]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')
# Snapshot of the list as a frozenset so every membership test below is a hash
# lookup rather than a scan of the whole list. `stop_words` itself stays a
# list; rebuild this set if that list is edited afterwards.
stop_word_set = frozenset(stop_words)


def remove_stopwords(r):
    """Tokenize ``r`` with NLTK's ``word_tokenize`` and drop English stopwords.

    Parameters
    ----------
    r : str
        One tweet.

    Returns
    -------
    list of str
        The tokens of ``r``, in order, that are not in the stopword set.

    NOTE(review): tweets reach this step with punctuation already stripped by
    the preprocessing cell, so apostrophe-bearing stopwords such as "don't"
    can no longer match (the text contains "dont" instead). Consider removing
    stopwords before stripping punctuation.
    """
    return [token for token in word_tokenize(r) if token not in stop_word_set]

# Each tweet string becomes a list of non-stopword tokens. This cell expects
# the column to still hold strings, so it is not meant to be run twice in a row.
train_data['Tweet'] = list(map(remove_stopwords, train_data['Tweet']))
train_data

Unnamed: 0,Sentiment,Tweet
0,Positive,"[im, getting, borderlands, murder]"
1,Positive,"[coming, borders, kill]"
2,Positive,"[im, getting, borderlands, kill]"
3,Positive,"[im, coming, borderlands, murder]"
4,Positive,"[im, getting, borderlands, 2, murder]"
...,...,...
74677,Positive,"[realized, windows, partition, mac, like, 6, y..."
74678,Positive,"[realized, mac, window, partition, 6, years, b..."
74679,Positive,"[realized, windows, partition, mac, 6, years, ..."
74680,Positive,"[realized, windows, partition, mac, like, 6, y..."


## Lemmatization

In [46]:
import nltk
# WordNetLemmatizer reads the 'wordnet' corpus; downloading 'omw-1.4' alone is
# not enough on a machine that has never fetched NLTK data. nltk.download only
# reports "already up-to-date" when a package is present, so this is safe to re-run.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

def lemmatization(line):
    """Lemmatize every token of ``line`` and return the results as a new list.

    ``line`` is an iterable of token strings (here: the output of the stopword
    step). ``lemmatize`` is called with its default arguments, one token at a
    time, and the order of tokens is preserved.
    """
    return [lemmatizer.lemmatize(token) for token in line]

# Replace each token list with its lemmatized counterpart, then show the column.
train_data['Tweet'] = list(map(lemmatization, train_data['Tweet']))
train_data['Tweet']

[nltk_data] Downloading package omw-1.4 to /home/mariia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0                        [im, getting, borderland, murder]
1                                   [coming, border, kill]
2                          [im, getting, borderland, kill]
3                         [im, coming, borderland, murder]
4                     [im, getting, borderland, 2, murder]
                               ...                        
74677    [realized, window, partition, mac, like, 6, ye...
74678    [realized, mac, window, partition, 6, year, be...
74679    [realized, window, partition, mac, 6, year, be...
74680    [realized, window, partition, mac, like, 6, ye...
74681    [like, window, partition, mac, like, 6, year, ...
Name: Tweet, Length: 57485, dtype: object