In [373]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

In [374]:
import nltk;
nltk.download("popular");

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [375]:
class ProjectCorpus:

  def __init__(self, corpus):
    '''
    Takes a corpus, tokenizes it, and sets the internal variables needed
    to return the index of any given word (and what word has a given index)
    Args:
      corpus (csv): 
      The corpus is a csv,
      with the sentenses and classes (delimiter : tab)
    '''
    self.data = pd.read_csv(corpus, delimiter='\t', header=None)
    self.data.columns = ['Sentence', 'Class']
    self.data['index'] = data.index
    self.columns = ['index', 'Class', 'Sentence']

  def preprocess_pandas(self):
    df_ = pd.DataFrame(columns=self.columns)
    self.preprocessed_data = self.data.copy()
    self.preprocessed_data ['Sentence'] = self.preprocessed_data ['Sentence'].str.lower()
    self.preprocessed_data ['Sentence'] = self.preprocessed_data ['Sentence'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)                      # remove emails
    self.preprocessed_data ['Sentence'] = self.preprocessed_data ['Sentence'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)    # remove IP address
    self.preprocessed_data ['Sentence'] = self.preprocessed_data ['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters
    self.preprocessed_data ['Sentence'] = self.preprocessed_data ['Sentence'].replace('\d', '', regex=True)                                                                                         # remove numbers
    for index, row in self.preprocessed_data .iterrows():
        word_tokens = word_tokenize(row['Sentence'])
        filtered_sent = [w for w in word_tokens if not w in stopwords.words('english')]
        df_ = df_.append({
            "index": row['index'],
            "Class": row['Class'],
            "Sentence": " ".join(filtered_sent[0:])
        }, ignore_index=True)
    return self.preprocessed_data 

  def data_split(self, tsize, randomsize, shuffleTF):
    if hasattr(self, 'preprocessed_data'):
      self.training_data, self.validation_data, self.training_labels, self.validation_labels = train_test_split( # split the data into training, validation, and test splits
      self.preprocessed_data['Sentence'].values.astype('U'),
      self.preprocessed_data['Class'].values.astype('int32'),
      test_size=tsize,
      random_state=randomsize,
      shuffle=shuffleTF)
    else:
      self.training_data, self.validation_data, self.training_labels, self.validation_labels = train_test_split( # split the data into training, validation, and test splits
      self.data['Sentence'].values.astype('U'),
      self.data['Class'].values.astype('int32'),
      test_size=tsize,
      random_state=randomsize,
      shuffle=shuffleTF)



In [377]:
class CorpusTraining:

  def data_split(self, df, tsize, randomsize, shuffleTF):
    if hasattr(df, 'preprocessed_data'):
      self.training_data, self.validation_data, self.training_labels, self.validation_labels = train_test_split( # split the data into training, validation, and test splits
      df.preprocessed_data['Sentence'].values.astype('U'),
      df.preprocessed_data['Class'].values.astype('int32'),
      test_size=tsize,
      random_state=randomsize,
      shuffle=shuffleTF)
    else:
      self.training_data, self.validation_data, self.training_labels, self.validation_labels = train_test_split( # split the data into training, validation, and test splits
      df.data['Sentence'].values.astype('U'),
      df.data['Class'].values.astype('int32'),
      test_size=tsize,
      random_state=randomsize,
      shuffle=shuffleTF)
    
    self.classes = len(df.data['Class'].unique())

    return None

  def vectorize1(self):
    self.word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')

    self.training_vectors = self.word_vectorizer.fit_transform(self.training_data)
    self.training_vectors = self.training_vectors.todense()
    self.vocab_size = len(self.word_vectorizer.vocabulary_)
    self.validation_vectors = self.word_vectorizer.transform(self.validation_data)
    self.validation_vectors = self.validation_vectors.todense()
    self.train_x_tensor = torch.from_numpy(np.array(self.training_vectors)).type(torch.FloatTensor)
    self.train_y_tensor = torch.from_numpy(np.array(self.training_labels)).long()
    self.validation_x_tensor = torch.from_numpy(np.array(self.validation_vectors)).type(torch.FloatTensor)
    self.validation_y_tensor = torch.from_numpy(np.array(self.validation_labels)).long()


  def vectorize2(self):
    self.word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')

    self.all_vectors = self.word_vectorizer.fit_transform(np.concatenate((self.training_data, self.validation_data)))
    self.all_vectors = self.all_vectors.todense()
    self.vocab_size = len(self.word_vectorizer.vocabulary_)

    self.training_vectors = self.word_vectorizer.transform(self.training_data)
    self.training_vectors = self.training_vectors.todense()
    
    self.validation_vectors = self.word_vectorizer.transform(self.validation_data)
    self.validation_vectors = self.validation_vectors.todense()

    # self.train_x_tensor = torch.from_numpy(np.array(self.training_vectors)).type(torch.FloatTensor)
    # self.train_y_tensor = torch.from_numpy(np.array(self.training_labels)).long()

    # self.train_dataset = [torch.from_numpy(np.array(self.training_vectors)).type(torch.FloatTensor),
    #                       torch.from_numpy(np.array(self.training_labels)).long()
    #                       ]

    self.train_dataset = [[x, y] for x,y in zip(torch.from_numpy(np.array(self.training_vectors)).type(torch.FloatTensor),torch.from_numpy(np.array(self.training_labels)).long())]


    # self.validation_x_tensor = torch.from_numpy(np.array(self.validation_vectors)).type(torch.FloatTensor)
    # self.validation_y_tensor = torch.from_numpy(np.array(self.validation_labels)).long()

    # self.validation_dataset = [torch.from_numpy(np.array(self.validation_vectors)).type(torch.FloatTensor),
    #                            torch.from_numpy(np.array(self.validation_labels)).long()
    #                            ]

    self.validation_dataset = [[x, y] for x,y in zip(torch.from_numpy(np.array(self.validation_vectors)).type(torch.FloatTensor),torch.from_numpy(np.array(self.validation_labels)).long())]

  def token2index(self, token):
    return self.word_vectorizer.vocabulary_[token]


  def index2token(self, idx):
    return list(self.word_vectorizer.vocabulary_.keys())[list(self.word_vectorizer.vocabulary_.values()).index(idx)]

  def nn_sequential(self,hidden, lrate):
    self.network = nn.Sequential( 
    nn.Linear(self.vocab_size, hidden),
    nn.ReLU(),
    nn.Linear(hidden, self.classes)
    )

    self.optimizer = torch.optim.Adam(self.network.parameters(), lr = lrate)
    self.loss_function = nn.CrossEntropyLoss() 

  def train(self, epochs, batch_size):
    for epoch in range(epochs):
      for batch_nr, (data, labels) in enumerate(DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)):
        print(batch_nr)


In [384]:
df = ProjectCorpus("amazon_cells_labelled.txt")

In [385]:
df.preprocess_pandas();

  self.preprocessed_data ['Sentence'] = self.preprocessed_data ['Sentence'].str.replace('[^\w\s]','')                                                       # remove special characters


In [386]:
mytrainer = CorpusTraining()
mytrainer.data_split(df, 0.10, 0, True)

In [387]:
mytrainer.vectorize2()

In [388]:
mytrainer.nn_sequential(100, 0.02)

In [389]:
mytrainer.train(10,100) # (epochs, batch size)

0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8
0
1
2
3
4
5
6
7
8


# **TEST AREA**

In [391]:
for batch_nr, (data, labels) in enumerate(DataLoader(mytrainer.train_dataset, batch_size=10, shuffle=True)):
  print(batch_nr)
  print(data)
  print(labels)
  print("------")
  if batch_nr>2:
    break


0
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([1, 0, 1, 1, 1, 1, 0, 1, 0, 0])
------
1
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0, 0, 1, 1, 0, 1, 0, 0, 0, 0])
------
2
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0, 1, 1, 0, 0, 1, 0, 1, 0, 0])
------
3
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0.,

In [227]:
mytrainer.token2index('sister')

5816

In [228]:
mytrainer.index2token(5816)

'sister'