In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('./Data/train.csv')

In [3]:
f'Is there any unique values in the column. If true then there are no Unique elements: {data.discourse_id.nunique() == data.shape[0]}'

'Is there any unique values in the column. If true then there are no Unique elements: True'

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36765 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36765 non-null  object
 1   essay_id                 36765 non-null  object
 2   discourse_text           36765 non-null  object
 3   discourse_type           36765 non-null  object
 4   discourse_effectiveness  36765 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


In [5]:
data.discourse_id = pd.Categorical(data.discourse_id)

### Preprocessing the target

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data.discourse_effectiveness)

In [7]:
data.discourse_effectiveness = le.transform(data.discourse_effectiveness)

### Subsetting the dataset

In [8]:
df = data[['discourse_id','discourse_text','discourse_effectiveness']]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36765 entries, 0 to 36764
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   discourse_id             36765 non-null  category
 1   discourse_text           36765 non-null  object  
 2   discourse_effectiveness  36765 non-null  int64   
dtypes: category(1), int64(1), object(1)
memory usage: 2.0+ MB


In [10]:
mapping = dict(zip(range(len(le.classes_)), le.classes_)) # Mapping the classes to the numbers

### Text preprocessing the text feature

In [11]:
from nltk.corpus import stopwords
import re,string
from nltk.tokenize import word_tokenize

In [12]:
stopWrds = stopwords.words('english')
punctiation = set(string.punctuation)

In [13]:
stopWordsRemove = data['discourse_text'].tolist()

In [14]:
stopWordsRemove = [' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", c) for c in i.lower().replace("(","").replace(")","").split(' ') if not c.isnumeric()) for i in stopWordsRemove]

In [15]:
stopWordsRemove = [' '.join(["".join(j) for j in word_tokenize(i.lower()) if j not in stopWrds]) for i in stopWordsRemove]

In [16]:
stopWordsRemove = [i for i in stopWordsRemove if i not in punctiation]

In [17]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [18]:
stopWordsRemove = [stemmer.stem(i) for i in stopWordsRemove]

In [19]:
df['discourse_text'] = stopWordsRemove

In [20]:
df

Unnamed: 0,discourse_id,discourse_text,discourse_effectiveness
0,0013cc385424,hi im isaac im going writing face mars natural...,0
1,9704a709b505,perspective think face natural landform dont t...,0
2,c22adee811b6,think face natural landform life mars descover...,0
3,a10d361e54e4,life mars would know reason think natural land...,0
4,db3e453ec4e2,people thought face formed alieans thought lif...,0
...,...,...,...
36760,9f63b687e76a,many people dont like asking one person advice...,0
36761,9d5bd7d86212,also people different views opinions th,0
36762,f1b78becd573,advice something impact persons view many ways...,0
36763,cc184624ca8e,someone use everything many people said use he...,2


### Preprocessing the text column

In [28]:
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7, stop_words='english')

In [34]:
feature = vectorizer.fit_transform(df['discourse_text'])

### LSTM Model creation

In [41]:
from keras.layers import LSTM, Embedding, Dense, Flatten
from keras.models import Sequential
import tensorflow as tf

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train,X_test,y_train,y_test = train_test_split(feature,df['discourse_effectiveness'],test_size=0.3,shuffle=True, random_state=34)

In [45]:
X_tr = tf.keras.preprocessing.sequence.pad_sequences(X_train.todense(),maxlen=120)
X_tst = tf.keras.preprocessing.sequence.pad_sequences(X_test.todense(),maxlen=120)

In [55]:
model = Sequential()
model.add(Embedding(input_dim=6000, output_dim=32, input_length=1000))
model.add(LSTM(units=20,return_sequences=True))
model.add(LSTM(units=16,return_sequences=False))
model.add(Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1000, 32)          192000    
                                                                 
 lstm_6 (LSTM)               (None, 1000, 20)          4240      
                                                                 
 lstm_7 (LSTM)               (None, 16)                2368      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 198,625
Trainable params: 198,625
Non-trainable params: 0
_________________________________________________________________


In [58]:
hist = model.fit(X_train.todense(), y_train, epochs=10, batch_size=64,validation_data=(X_test.todense(),y_test))

Epoch 1/10
Epoch 2/10
 10/403 [..............................] - ETA: 3:16 - loss: 0.0000e+00 - acc: 0.2359

KeyboardInterrupt: 

In [59]:
mapping

{0: 'Adequate', 1: 'Effective', 2: 'Ineffective'}