In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from torchtext.legacy import data
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy.datasets import IMDB
from torchtext.legacy.data import Field, LabelField, BucketIterator

In [4]:
import googletrans
from googletrans import Translator

In [26]:
import os
# Print the path of every file found anywhere under the SST directory tree.
for root, _subdirs, names in os.walk('stanfordSentimentTreebank'):
    for name in names:
        print(os.path.join(root, name))

stanfordSentimentTreebank/SOStr.txt
stanfordSentimentTreebank/sentiment_labels.txt
stanfordSentimentTreebank/README.txt
stanfordSentimentTreebank/original_rt_snippets.txt
stanfordSentimentTreebank/datasetSplit.txt
stanfordSentimentTreebank/dictionary.txt
stanfordSentimentTreebank/STree.txt
stanfordSentimentTreebank/datasetSentences.txt
stanfordSentimentTreebank/.ipynb_checkpoints/README-checkpoint.txt
stanfordSentimentTreebank/.ipynb_checkpoints/datasetSplit-checkpoint.txt
stanfordSentimentTreebank/.ipynb_checkpoints/datasetSentences-checkpoint.txt
stanfordSentimentTreebank/.ipynb_checkpoints/sentiment_labels-checkpoint.txt


In [41]:
class StanfordDatasetReader():
    """Index-able reader over the Stanford Sentiment Treebank files.

    Joins the sentence, phrase-dictionary, split and sentiment tables found in
    ``sst_dir`` into one shuffled DataFrame and serves ``(sentence, label)``
    tuples, where ``label`` is the sentiment score bucketed into 0..4.

    Args:
        sst_dir: Directory containing ``sentiment_labels.txt``,
            ``datasetSentences.txt``, ``dictionary.txt`` and
            ``datasetSplit.txt``.
        split_idx: Optional value of the ``splitset_label`` column to keep.
            ``None`` (the default) keeps every row, which is what the
            one-argument constructor always did.
        seed: Optional ``random_state`` for the shuffle, so that a
            Restart-and-Run-All yields the same row order. ``None`` keeps the
            original non-deterministic shuffle.
    """

    def __init__(self, sst_dir, split_idx=None, seed=None):
        merged_dataset = self.get_merged_dataset(sst_dir, seed=seed)
        if split_idx is not None:
            # Restrict to a single split; callers in this notebook pass 1, 2, 3.
            merged_dataset = merged_dataset[merged_dataset["splitset_label"] == split_idx]
        self.dataset = merged_dataset

    def get_merged_dataset(self, sst_dir, seed=None):
        """Load the four SST tables, join them, and return the rows shuffled.

        Args:
            sst_dir: Directory holding the SST text files.
            seed: Passed to ``DataFrame.sample`` as ``random_state``.

        Returns:
            A DataFrame with one row per sentence that has an exact match in
            the phrase dictionary, carrying its split label and sentiment
            score. Rows are in shuffled order.
        """
        sentiment_labels = pd.read_csv(os.path.join(sst_dir, "sentiment_labels.txt"), sep="|")
        sentence_ids = pd.read_csv(os.path.join(sst_dir, "datasetSentences.txt"), sep="\t")
        # dictionary.txt is read without a header row, so column names are supplied here.
        dictionary = pd.read_csv(os.path.join(sst_dir, "dictionary.txt"), sep="|", names=['phrase', 'phrase ids'])
        train_test_split = pd.read_csv(os.path.join(sst_dir, "datasetSplit.txt"))

        # Inner joins: sentence text -> phrase id -> split label -> sentiment score.
        # Sentences whose text has no exact match among the phrases are dropped.
        sentence_phrase_merge = pd.merge(sentence_ids, dictionary, left_on='sentence', right_on='phrase')
        sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on='sentence_index')
        merged = pd.merge(sentence_phrase_split, sentiment_labels, on='phrase ids')
        return merged.sample(frac=1, random_state=seed)

    def discretize_label(self, label):
        """Map a sentiment score to one of five classes.

        Buckets are (-inf, 0.2] -> 0, (0.2, 0.4] -> 1, (0.4, 0.6] -> 2,
        (0.6, 0.8] -> 3 and everything above 0.8 -> 4.
        """
        if label <= 0.2: return 0
        if label <= 0.4: return 1
        if label <= 0.6: return 2
        if label <= 0.8: return 3
        return 4

    def __len__(self):
        """Return the number of rows currently held by the reader."""
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        """Return ``(sentence, label)`` for the row at position ``idx``.

        Columns are addressed by position: column 1 is taken as the sentence
        text and column 5 as the raw sentiment score.
        NOTE(review): these positions follow from the merge order above;
        confirm against the actual file headers if the joins ever change.
        """
        return (self.dataset.iloc[idx, 1], self.discretize_label(self.dataset.iloc[idx, 5]))
        

In [42]:
class _UnkFallbackVocab(dict):
    """Word-to-index dict whose unknown keys resolve to the ``<UNK>`` index.

    Missing keys are NOT inserted, so looking up unseen words never changes
    the vocabulary size.
    """

    def __missing__(self, key):
        return dict.__getitem__(self, "<UNK>")


def build_vocab(sentences):
    """Build a word-to-index mapping from whitespace-tokenised sentences.

    The special tokens ``<PAD>``, ``<OOV>`` and ``<UNK>`` receive indices
    0, 1 and 2; every distinct word then gets the next free index in order of
    first appearance. Indices are contiguous in ``range(len(vocab))``, so the
    mapping can size an embedding table directly.

    The previous implementation incremented each freshly assigned index, which
    shifted every id up by one (leaving 0 unused and the largest id equal to
    ``len(vocab)``), relied on a ``defaultdict`` name that was never imported,
    and returned a mapping that invented a new id for every unseen word.

    Args:
        sentences: Iterable of strings; each is split on whitespace.

    Returns:
        A ``dict`` subclass mapping token -> int. Looking up a token that is
        not in the vocabulary returns the ``<UNK>`` index.
    """
    w2i = _UnkFallbackVocab()
    for special in ("<PAD>", "<OOV>", "<UNK>"):
        w2i[special] = len(w2i)
    for sentence in sentences:
        for word in sentence.split():
            if word not in w2i:
                w2i[word] = len(w2i)
    return w2i

In [57]:
# Build one reader per split id (1, 2, 3) over the same SST directory.
train_data, test_data, valid_data = (
    StanfordDatasetReader('stanfordSentimentTreebank/', split_id)
    for split_id in (1, 2, 3)
)

In [56]:
# Preview up to five (sentence, label) pairs from the training reader.
# This cell used to iterate over `data`, but the import cell at the top of the
# notebook binds that name to the `torchtext.legacy.data` module, which is not
# iterable on a fresh kernel. Index the reader defined in this notebook instead.
for idx in range(min(5, len(train_data))):
    print(train_data[idx])

("The audacity to view one of Shakespeare 's better known tragedies as a dark comedy is , by itself , deserving of discussion .", 1)
("It 's so devoid of joy and energy it makes even Jason X ... look positively Shakesperean by comparison .", 0)
('Maintains your interest until the end and even leaves you with a few lingering animated thoughts .', 3)
('A comic gem with some serious sparkles .', 4)
("You want to call Domino 's .", 2)


In [45]:
import torch
from torchtext.datasets import AG_NEWS
# Request the 'train' split of the AG_NEWS dataset from torchtext.
train_iter = AG_NEWS(split='train')

In [47]:
# Show only the first example produced by the AG_NEWS training data, then stop.
for first_example in train_iter:
    print(first_example)
    break
    

(3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')


In [48]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

# Tally token frequencies over every training text; the label is not used.
counter = Counter(
    token
    for _label, text in train_iter
    for token in tokenizer(text)
)

vocab = Vocab(counter, min_freq=1)

In [51]:
# Display the vocabulary's token -> integer index mapping.
# NOTE(review): this renders the entire mapping; consider showing only a slice.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fb4309644f0>>,
            {'<unk>': 0,
             '<pad>': 1,
             '.': 2,
             'the': 3,
             ',': 4,
             'to': 5,
             'a': 6,
             'of': 7,
             'in': 8,
             'and': 9,
             's': 10,
             'on': 11,
             'for': 12,
             '#39': 13,
             '(': 14,
             ')': 15,
             '-': 16,
             "'": 17,
             'that': 18,
             'with': 19,
             'as': 20,
             'at': 21,
             'is': 22,
             'its': 23,
             'new': 24,
             'by': 25,
             'it': 26,
             'said': 27,
             'reuters': 28,
             'has': 29,
             'from': 30,
             'an': 31,
             'ap': 32,
             'his': 33,
             'will': 34,
             'after': 35,
             'was': 36,
             'us': 37,
      

In [58]:
import os

In [59]:
# List the entries of the current working directory.
os.listdir()

['stanfordSentimentTreebank.zip',
 '.data',
 'LSTM_sentiment_analysis_colab.ipynb',
 'df_all.csv',
 'From_Sentiment_Analysis_using_LSTM_RNN.ipynb',
 'Untitled.ipynb',
 'stanfordSentimentTreebank',
 'Untitled1.ipynb',
 '.ipynb_checkpoints']

In [61]:
# Load df_all.csv from the working directory into a DataFrame.
df = pd.read_csv('df_all.csv')

In [62]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,sentence,target
0,0,"If this disposable tissue has one wild card , ...",2
1,1,Catch it ... if you can !,3
2,2,... the implication is Kissinger may have deci...,2
3,3,"Chaotic , self-indulgent and remarkably ugly t...",1
4,4,The film 's real appeal wo n't be to Clooney f...,3


In [63]:
# Keep only the text and target columns; any other columns are dropped.
df = df.loc[:, ['sentence', 'target']]

In [64]:
# Rename the columns positionally: first -> 'tweets', second -> 'labels'.
# NOTE(review): this mutates df in place, so re-running an earlier cell that
# selects 'sentence'/'target' from df will fail once this has executed.
df.columns = ['tweets','labels']

In [66]:
# Write the frame without its index. With the default index=True, every
# save/load round-trip adds another 'Unnamed: 0' column, which is how the
# 'Unnamed: 0.1' and 'Unnamed: 0' columns ended up in df_all.csv.
df.to_csv('df_all1.csv', index=False)