#Setting up

##Connecting to google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive

/content/drive


In [None]:
%cd MyDrive/StressPredictionProject/

/content/drive/MyDrive/StressPredictionProject


In [None]:
%pip install --user -U nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##Importing libraries

In [None]:
import numpy as np
import sklearn 
import pandas as pd
import matplotlib as mp
import keras 
import csv 
import nltk
import pickle 

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk import WordNetLemmatizer

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [None]:
from keras.layers import SimpleRNN, Dense, Flatten, Conv1D, Dropout
from keras import regularizers

In [None]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#Extracting Data

##pickle

In [None]:
def save_data(data, file_name):
  with open(file_name, 'wb') as handle:
    pickle.dump(data, handle)

In [None]:
def get_data(file_name):
  with open(file_name, 'rb') as handle:
    data = pickle.load(handle)
  return data

##extracting raw data

In [None]:
data = pd.read_csv("Stress.csv")
        

In [None]:
data.head()

Unnamed: 0,subreddit,post_id,sentence_range,text,label,confidence,social_timestamp
0,ptsd,8601tu,"(15, 20)","He said he had not felt that way before, sugge...",1,0.8,1521614353
1,assistance,8lbrx9,"(0, 5)","Hey there r/assistance, Not sure if this is th...",0,1.0,1527009817
2,ptsd,9ch1zh,"(15, 20)",My mom then hit me with the newspaper and it s...,1,0.8,1535935605
3,relationships,7rorpp,"[5, 10]","until i met my new boyfriend, he is amazing, h...",1,0.6,1516429555
4,survivorsofabuse,9p2gbc,"[0, 5]",October is Domestic Violence Awareness Month a...,1,0.8,1539809005


Dropping unnecessary columns

In [None]:
data = data.drop(["post_id", "social_timestamp", "sentence_range"], axis = 1)

##extracting labels

In [None]:
labels = get_data("labels.pkl")

##tokenization

In [None]:
def tokenize(data):
    new_data = []
    for element in data:
      new_data.append(word_tokenize(element))
    return new_data

In [None]:
text = data["text"]

In [None]:
new_text = tokenize(text)

In [None]:
data["tokenized_text"] = new_text

In [None]:
print(new_text)



##stopword removal

In [None]:
#stopwords = set(stopwords.words('english'))
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_stopwords(data):
  new_data = []
  for words in data:
    filtered_sentence = [w for w in words if not w.lower() in stopwords]
    new_data.append(filtered_sentence)
  return new_data

In [None]:
new_data = remove_stopwords(new_text)

##Stemming

In [None]:
def stem(data):
  new_data = []
  stemmer = PorterStemmer()
  for words in data:
    new_words = [stemmer.stem(word) for word in words]
    new_data.append(new_words)
  return new_data


In [None]:
new_data = stem(new_data)

In [None]:
print(new_data)

[['said', 'felt', 'way', ',', 'sugget', 'go', 'rest', '..', 'trigger', 'ahead', 'youi', "'re", 'hypocondriac', 'like', ':', 'decid', 'look', '``', 'feel', 'doom', "''", 'hope', 'mayb', 'get', 'suck', 'rabbit', 'hole', 'ludicr', 'conspiraci', ',', 'stupid', '``', 'psychic', "''", 'test', 'new', 'age', 'b.s.', ',', 'someth', 'could', 'even', 'laugh', 'road', '.', ',', 'end', 'read', 'sens', 'doom', 'indic', 'variou', 'health', 'ailment', ';', 'one', 'prone', '..', 'top', '``', 'doom', "''", 'gloom', '..', "f'n", 'worri', 'heart', '.', 'happen', 'physic', '48', 'hour', '.'], ['hey', 'r/assist', ',', 'sure', 'right', 'place', 'post', '..', 'goe', '=', ')', "'m", 'current', 'student', 'intern', 'sandia', 'nation', 'lab', 'work', 'survey', 'help', 'improv', 'market', 'outreach', 'effort', 'mani', 'school', 'recruit', 'around', 'countri', '.', "'re", 'look', 'current', 'undergrad/grad', 'stem', 'student', "'re", 'stem', 'student', 'know', 'stem', 'student', ',', 'would', 'greatli', 'appreci',

##normalizing

###removing punctuation

In [None]:
def remove_punctuation(data):
  new_data = []
  for words in data:
    new_words = [word for word in words if word.isalpha()]
    new_data.append(new_words)
  return new_data


In [None]:
new_data = remove_punctuation(new_data)

In [None]:
print(new_data)

[['said', 'felt', 'way', 'sugget', 'go', 'rest', 'trigger', 'ahead', 'youi', 'hypocondriac', 'like', 'decid', 'look', 'feel', 'doom', 'hope', 'mayb', 'get', 'suck', 'rabbit', 'hole', 'ludicr', 'conspiraci', 'stupid', 'psychic', 'test', 'new', 'age', 'someth', 'could', 'even', 'laugh', 'road', 'end', 'read', 'sens', 'doom', 'indic', 'variou', 'health', 'ailment', 'one', 'prone', 'top', 'doom', 'gloom', 'worri', 'heart', 'happen', 'physic', 'hour'], ['hey', 'sure', 'right', 'place', 'post', 'goe', 'current', 'student', 'intern', 'sandia', 'nation', 'lab', 'work', 'survey', 'help', 'improv', 'market', 'outreach', 'effort', 'mani', 'school', 'recruit', 'around', 'countri', 'look', 'current', 'stem', 'student', 'stem', 'student', 'know', 'stem', 'student', 'would', 'greatli', 'appreci', 'help', 'take', 'pass', 'along', 'short', 'survey', 'thank', 'everyon', 'help', 'take', 'survey', 'enter', 'draw', 'chanc', 'win', 'one', 'three', 'amazon', 'gc'], ['mom', 'hit', 'newspap', 'shock', 'would',

###all lowercase

In [None]:
def lowercase(data):
  new_data = []
  for words in data:
    new_words = [w.lower() for w in words]
    new_data.append(new_words)
  return new_data

In [None]:
new_data = lowercase(new_data)

In [None]:
new_data2 = np.asarray(new_data)

  new_data2 = np.asarray(new_data)


##label encoding

In [None]:
def label_encode(data):
  new_data = []
  model = LabelEncoder()
  for words in data:
    new_words = model.fit_transform(words)
    new_data.append(new_words)
  return new_data

In [None]:
new_data2 = label_encode(new_data)

In [None]:
new_data = np.asarray(new_data2)

  new_data = np.asarray(new_data2)


##one hot encoding

In [None]:
def one_hot_encode(data):
  new_data = []
  for words in data:
    model = OneHotEncoder(sparse = False)
    words = words.reshape(len(words), 1)
    new_words = model.fit_transform(words)
    new_data.append(new_words)
  
  return new_data

In [None]:
new_data3 = one_hot_encode(new_data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


#Preprocessing 

##Padding

In [None]:
data = get_data("data.pkl")

finding max length to know how much padding

In [None]:
def get_max(array):
  max = array[0].size
  for a in array:
    if a.size > max:
      max = a.size
  return max

In [None]:
max = get_max(data)

In [None]:
print(max)

16215


In [None]:
def pad_array(array, max):
  new_array = []
  for a in array:
    print(a.size)
    for i in range(a.size, max):
      a = np.append(a, 0)
    print(a.size)
    print(max) 
    new_array.append(a)
  return new_array

In [None]:
pad = pad_array(data, max)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215
16215

In [None]:
def print_length(data, max):
  for a in data: 
    if a.size != max:
      print(a.size)

In [None]:
print(data[438].size)

16215


In [None]:
print(data.shape)

(2838, 16215)


In [None]:
print_length(data, max)

In [None]:
for i1 in range(data.size - 1):
  for i2 in range(data[i1].size - 1):
    if np.isnan(data[i1][i2]) == True:
      print("i1: " + str(i1))
      print("i2: " + str(i2))

In [None]:
print(data.shape)


(2838,)


In [None]:
print(type(data))

<class 'numpy.ndarray'>


In [None]:
def reconstruct(array): 
  new_array = []
  for a in array:
    new_array.append(a)
  new_array = np.asarray(new_array)
  return new_array

In [None]:
new_data = reconstruct(data)

In [None]:
print(new_data.shape)

(2838, 16215)


In [None]:
save_data(new_data, "data.pkl")

##checking for strings

In [None]:
dim1, dim2 = new_data.shape

In [None]:
for i_array in range(dim1): 
  for i_val in range(dim2):
    val = new_data[i_array][i_val]
    if type(val) == type(""):
      print(val)
      print(i_array)
      print(i_val)

#Classification models

##Logistic Regression

In [None]:
def logistic_regression(x_train, y_train, x_test, y_test):
  model = LogisticRegression()
  model.fit(x_train, y_train)
  y_preds = model.predict(x_test)
  return model.score(x_test, y_test)

In [None]:
split = 2000
end = 2837

In [None]:
score = logistic_regression(data[:split], labels[:split], data[split:end], labels[split:end])

In [None]:
print(score)

0.4838709677419355


##LSTMs

##RNN

In [None]:
def rnn(x_train, y_train, x_test, y_test, num_epochs):
  dim1, dim2 = x_train.shape
  model = keras.Sequential()
  
  model.add(SimpleRNN(128, return_sequences = True, activation = "tanh", input_shape = (dim2, 1)))
  #latest addition: adding dropout layers
  #model.add(Dropout(0.2))
  model.add(SimpleRNN(16, activation = 'tanh', input_shape = (128,)))
  #model.add(Dropout(0.2))
  model.add(Dense(22, kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4), bias_regularizer=regularizers.L2(1e-4), activity_regularizer=regularizers.L2(1e-5), activation = 'relu'))
  
  model.add(Dense(1, activation = 'sigmoid')) # change activation to sigmoid to keep values close to either 0 or 1 for binary classification
  model.add(Flatten())
  model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = [keras.metrics.BinaryAccuracy(), keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.FalsePositives(), keras.metrics.FalseNegatives(), keras.metrics.TruePositives(), keras.metrics.TrueNegatives()]) # change loss function to BCE loss (binary crossentropy loss) to match with sigmoid
  #fitting model to training data and validating with test data
  model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = num_epochs)

In [None]:
rnn(data[:split], labels[:split], data[split:end], labels[split:end], 30)

Epoch 1/30
 6/63 [=>............................] - ETA: 1:15:04 - loss: 0.7154 - binary_accuracy: 0.4792 - precision_6: 0.5040 - recall_6: 0.6238 - false_positives_6: 62.0000 - false_negatives_6: 38.0000 - true_positives_6: 63.0000 - true_negatives_6: 29.0000

##CNN


In [None]:
def cnn(x_train, y_train, x_test, y_test, num_epochs, user_x):
  dim1, dim2 = x_train.shape
  model = keras.Sequential()
  model.add(Dense(10, activation = 'relu', input_shape = (dim2, )))
  model.add(Dropout(0.3))
  model.add(Dense(20, kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4), bias_regularizer=regularizers.L2(1e-4), activity_regularizer=regularizers.L2(1e-5), activation = 'relu'))
  model.add(Dropout(0.1))
  model.add(Dense(1, activation = 'sigmoid')) # change activation to sigmoid to keep values close to either 0 or 1 for binary classification
  model.add(Flatten())
  model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = [keras.metrics.BinaryAccuracy(), keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.FalsePositives(), keras.metrics.FalseNegatives(), keras.metrics.TruePositives(), keras.metrics.TrueNegatives()]) # change loss function to BCE loss (binary crossentropy loss) to match with sigmoid
  #fitting model to training data and validating with test data
  model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = num_epochs)
  user_x = np.asarray(user_x).astype('float32')
  user_x = user_x.reshape(len(user_x), 1)
  pred = model.predict(user_x)
  return pred
  
  

In [None]:
(cnn(data[:split], labels[:split], data[split:end], labels[split:end], 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


#UI

In [None]:
user_in = input("Enter text: ")

Enter text: my name is bliss!


tokenizing

In [None]:
user_in = word_tokenize(user_in)

In [None]:
print(user_in)

['my', 'name', 'is', 'bliss', '!']


removing stopwords

In [None]:
filtered = []
for word in user_in: 
  if not word in stopwords:
    filtered.append(word)

In [None]:
print(filtered)

['name', 'bliss', '!']


stemming

In [None]:
stemmer = PorterStemmer()
user_in = []
for word in filtered:
  user_in.append(stemmer.stem(word))

In [None]:
print(user_in)

['name', 'bliss', '!']


removing punctuation and making it lowercase

In [None]:
new_user = []
for word in user_in:
  if word.isalpha():
    new_user.append(word.lower())

In [None]:
print(new_user)

['name', 'bliss']


label encoding

In [None]:
model = LabelEncoder()
user_in = model.fit_transform(user_in)

In [None]:
print(user_in)

[2 1 0]


one hot encoding

In [None]:
model = OneHotEncoder(sparse = False)
user_in = np.asarray(user_in)
user_in = user_in.reshape(len(user_in), 1)
user_in = model.fit_transform(user_in)



In [None]:
print(user_in.size)

9


In [None]:
print(len(user_in))

3


In [None]:
while len(user_in) < 16215:
  user_in.append(0)

Running the model and getting the prediction

In [None]:
pred = cnn(data[:split], labels[:split], data[split:end], labels[split:end], 30, user_in)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  user_x = np.asarray(user_x).astype('float32')


ValueError: ignored