In [2]:
import nltk
nltk.data.path.append("/Users/danielcanhedo/nltk_data")
from nltk.corpus import stopwords
print(stopwords.words("english"))  # finally works
from nltk.corpus import stopwords # stop works are commonly used words that do not provide value to training model
from nltk.stem import PorterStemmer # porterstemmer converts similar words to one type of word. for example, running, runner becomes run
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
import joblib


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
dataset = pd.read_csv("Datasets/test_data.csv") # imports our training data set


In [6]:
type(dataset.text)

pandas.core.series.Series

In [4]:
# data cleaning portion ----------------------------------------------------------------

ps = PorterStemmer()
corpus = []

for i in range(len(dataset)):

    comments = re.sub("[^a-zA-Z]", " ", str(dataset["text"][i])) # we are only going to keep a-z A-Z letters
    comments = comments.lower() # converts to lowercase for consistency
    comments = comments.split() # splits every word by space. we now have individual words

    clean_comments = [] # initializes a list to store cleaned comments
    for word in comments:

        if word not in set(stopwords.words('english')): # this for loop will get rid of stop words and apply stemming (explained above)
            stemmed_word = ps.stem(word)
            clean_comments.append(stemmed_word)

    clean_comments = " ".join(clean_comments)
    corpus.append(clean_comments) # we now have a corpus of clean sentences

In [5]:

# we will now convert the sentences to vectors to send through our model

# TFIDF stands for Term-Frequency-Inverse Document Frequency. It measures how important a word is to a document relative to a corpus
# (collection) of documents.
# Term Frequency (TF) - measures how often a word appears in a document
# Inverse Document Frequency (IDF) - measures how unique or rare a word is across all documents

# TF = # Times word occurs in a document / # total number of words in document
# IDF = log((Total # of documents / Number of documents containing the term) + 1)

# Rare words across documents get a high IDF
# Common words appear in many documents so their IDF is lower
# Think of documents as a sentence

# see the chart (where the number is higher, the more important that word is in that document)
#       bird    cat    dog
# D1    0.00    0.52   0.00
# D2    0.00    0.00   0.52
# D3    0.52    0.00   0.00

# same goes for our matrix if we do X = vectorizer.fit_transform(corupus).toarray()

vectorizer = TfidfVectorizer(max_features=20000, min_df=0.01, max_df=0.9) # max number of words, the amount of times for word to occur to be considered, and then max_df gets rif of words that occur mroe than a certain type
X = vectorizer.fit_transform(corpus).toarray()
print(X[0])

y = dataset['boolean'].values  # gets our boolean row and converts to numpy array
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # splits our dataset into training set and test set

X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()

Y_train = torch.from_numpy(y_train).long()
y_test = torch.from_numpy(y_test).long()


print(X_train.shape)
print(Y_train.shape)
print()
print(X_test.shape)
print(y_test.shape)

# all of this prints:
# torch.Size([1640, 186])
# torch.Size([1640])
#
# torch.Size([410, 186])
# torch.Size([410])
# all of this means we have 1640+410 = 2050 sentences in the corpus, 186 vectorized features (need to understand this more)

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.254556   0.
 0.         0.         0.         0.         0.47158443 

In [6]:
input_size = X_train.shape[1] # the number of nodes we have
output_size = 2 # we are predicting whether true or false so it is 2
hidden_size = 500 # this can change but lets try with 500

# let's now build the neural network ourselves like we learned in datacamp
model = nn.Sequential(
    nn.Linear(input_size, hidden_size), # builds our first layer. remember they must link with the same hidden_size
    nn.ReLU(), # introduces non-linearity so it can learn more complex relationships (look more into this)
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_size),
    nn.LogSoftmax(dim=1)) # we use the softmax activation function which is for classification

In [7]:
# the optimizer is what changes the weights and biases during training so it can learn better, this minimizes the loss function
# it improves the accuracy of the neural network by using gradient descent
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# the loss function is what holds the value for how bad the model performaed. it calculates the difference between the model's
# prediction and the actual label
loss_fn = nn.NLLLoss()

In [8]:
# now we will actually train the neural network
epochs = 100

for epoch in range(epochs):
    optimizer.zero_grad()
    Y_pred = model(X_train)
    loss = loss_fn(Y_pred, Y_train) # loss function which computes how different predictions are from actual label
    loss.backward() # backward pass to optimize
    optimizer.step() # optimizes the weights to minimize loss
    print("Epoch", epoch, "Loss:", loss.item())

# so perhaps we can find a way to combine our text classifier with a DecisionTreeRegressor which would take
# into account like count, reply count, etc.

Epoch 0 Loss: 0.6954460144042969
Epoch 1 Loss: 0.9930170774459839
Epoch 2 Loss: 0.7746158838272095
Epoch 3 Loss: 0.7214235663414001
Epoch 4 Loss: 0.6184757947921753
Epoch 5 Loss: 0.5288136005401611
Epoch 6 Loss: 0.39465853571891785
Epoch 7 Loss: 0.2735481858253479
Epoch 8 Loss: 0.1239589974284172
Epoch 9 Loss: 0.09215084463357925
Epoch 10 Loss: 0.07829464226961136
Epoch 11 Loss: 0.06542792171239853
Epoch 12 Loss: 0.029722051694989204
Epoch 13 Loss: 0.026297835633158684
Epoch 14 Loss: 0.013192501850426197
Epoch 15 Loss: 0.0071979425847530365
Epoch 16 Loss: 0.007180241402238607
Epoch 17 Loss: 0.004920115694403648
Epoch 18 Loss: 0.002641266444697976
Epoch 19 Loss: 0.0015569068491458893
Epoch 20 Loss: 0.000871548370923847
Epoch 21 Loss: 0.0006010316428728402
Epoch 22 Loss: 0.0004266426258254796
Epoch 23 Loss: 0.00022284603619482368
Epoch 24 Loss: 9.060841694008559e-05
Epoch 25 Loss: 4.6178101911209524e-05
Epoch 26 Loss: 2.7194882932235487e-05
Epoch 27 Loss: 1.6859536117408425e-05
Epoch 28 

In [9]:
# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')
torch.save(model.state_dict(), 'model_weights.pth')
# Save entire model
torch.save(model, 'full_model.pt')