In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
# Load the DataFrame pickled at datasets/clean_essays.pkl (path is relative to
# the notebook's working directory).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle('datasets/clean_essays.pkl')

In [16]:
# Drop only the rows in which every column is missing.
df = df.dropna(how = 'all')
# Peek at one sample: take column 1 (by position), the entry at index 0 of that
# Series, then its first element. In the saved output this is a list of tokens.
# NOTE(review): the middle [0] is a lookup on the Series index, presumably a
# default integer index -- confirm row 0 survives the dropna above.
df.iloc[:,1][0][0]

['Well',
 ',',
 'right',
 'now',
 'I',
 'just',
 'woke',
 'up',
 'from',
 'a',
 'mid-day',
 'nap',
 '.']

In [22]:
def convert_to_forms(df, n_train=1984, seed=None):
    '''
    Description:
    Shuffles the rows of the dataframe and splits them into training and
    testing sets.

    Columns are read by position:
    column 1 -- input features (X)
    column 4 -- ground truth labels (Y), cast to int32
    column 7 -- lengths, cast to int32

    Arguments:
    df -- DataFrame with at least 8 columns and no missing values in
          columns 4 and 7 (they are cast to int32)
    n_train -- Number of shuffled rows placed in the training set; every
               remaining row goes to the testing set (default 1984)
    seed -- Optional seed for a reproducible shuffle. When None, the global
            NumPy random state is used.

    Returns (in this order):
    X_train -- Training set of input features, shape (n_train, 1)
    Y_train -- Ground truth of training set, shape (n_train, 1)
    len_train -- Lengths for the training set, shape (n_train,)

    X_test -- Testing set of input features, shape (n_test, 1)
    Y_test -- Ground truth of testing set, shape (n_test, 1)
    len_test -- Lengths for the testing set, shape (n_test,)

    Raises:
    ValueError -- if n_train is negative or larger than the number of rows

    '''

    n_rows = len(df)
    if not 0 <= n_train <= n_rows:
        raise ValueError(
            "n_train must be between 0 and the number of rows (%d), got %r"
            % (n_rows, n_train)
        )

    # One shared permutation keeps X, Y and lengths aligned row-for-row.
    order = np.arange(n_rows)
    if seed is None:
        np.random.shuffle(order)
    else:
        np.random.RandomState(seed).shuffle(order)

    X = np.asarray(df.iloc[:, 1].values)[order].reshape((n_rows, 1))
    Y = np.asarray(df.iloc[:, 4], dtype=np.int32)[order].reshape((n_rows, 1))
    lengths = np.asarray(df.iloc[:, 7], dtype=np.int32)[order]

    # The test size is whatever is left over, so frames of any length work.
    X_train, X_test = X[:n_train], X[n_train:]
    Y_train, Y_test = Y[:n_train], Y[n_train:]
    len_train, len_test = lengths[:n_train], lengths[n_train:]

    return X_train,Y_train,len_train,X_test,Y_test,len_test

In [29]:
# convert_to_forms returns the train triple first, then the test triple:
# (X_train, Y_train, len_train, X_test, Y_test, len_test).
X_train,Y_train,len_train,X_test,Y_test,len_test = convert_to_forms(df)
# Display the lengths of the training samples as the cell output.
# NOTE(review): the saved output for this cell may be stale relative to the
# current function definition -- re-run the cell to refresh it.
len_train

(2467, 1) X in total
[ 35  88  64 ...  18  59 107] Lengths in sentence


array([46, 52, 57, ..., 33, 42, 54], dtype=int32)

In [37]:
def train_test_data(df, emotion=2, train_size=0.8, random_state=None):
    '''
        Dividing the dataset into train and test
        input:
        df -- Pickled dataset. Columns are read by position: column 1 holds
              the input features and column 7 holds the lengths.
        emotion -- Positional index of the label column to be trained on
        train_size -- Share of rows placed in the training set; forwarded to
                      sklearn's train_test_split (default 0.8)
        random_state -- Optional seed forwarded to train_test_split so the
                        split can be reproduced (default None)

        Returns:
        Xtrain: Training set of i/p features, shape (n_train, 1)
        Xtest: Testing set of i/p features, shape (n_test, 1)
        Ytrain: Ground truth for training data, int32, shape (n_train, 1)
        Ytest: Ground truth for testing data, int32, shape (n_test, 1)
        len_train: Length of sentences in training data, int32
        len_test: Length of sentences in testing data, int32
    '''

    # dropna returns a new frame; the result has to be kept for it to matter.
    df = df.dropna(how='all')
    train, test = train_test_split(
        df, train_size=train_size, random_state=random_state
    )

    # Getting the list of words for Word Vec
    Xtrain = train.iloc[:, 1].values
    Xtrain = Xtrain.reshape((Xtrain.shape[0], 1))
    Xtest = test.iloc[:, 1].values
    Xtest = Xtest.reshape((Xtest.shape[0], 1))

    # Labels come from the selected emotion column for BOTH splits.
    Ytrain = train.iloc[:, emotion].values.astype('int32')
    Ytrain = Ytrain.reshape((Ytrain.shape[0], 1))
    Ytest = test.iloc[:, emotion].values.astype('int32')
    Ytest = Ytest.reshape((Ytest.shape[0], 1))

    len_train = train.iloc[:, 7].values.astype('int32')
    len_test = test.iloc[:, 7].values.astype('int32')
    return Xtrain,Xtest,Ytrain,Ytest,len_train,len_test

array([ 70, 100,  23, ...,  15,  87,  47], dtype=int32)