In [11]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
sao = pd.read_csv("sao_rubrics.csv")
ca = pd.read_csv("ca_rubrics.csv")
gas = pd.read_csv("gas_rubrics.csv")
cor = pd.read_csv("cor_rubrics.csv")

In [13]:
y_sao = sao['word_count_score']
X_sao = sao[['esai', 'word_count']]

y_ca = ca['noun_count_score']
X_ca = ca[['esai', 'noun_count']]

y_gas = gas['spell_error_count_final_score']
X_gas = gas[['esai', 'spell_error_count']]

y_cor = cor['final_score'].iloc[:12934]
X_cor = cor[['essay']].iloc[:12934]

In [14]:
print(y_sao.shape)
print(y_ca.shape)
print(y_gas.shape)
print(y_cor.shape)

(12934,)
(12934,)
(12934,)
(12934,)


In [15]:
X_sao_train, X_sao_test, X_ca_train, X_ca_test, X_gas_train, X_gas_test, X_cor_train, X_cor_test, y_sao_train, y_sao_test, y_ca_train, y_ca_test, y_gas_train, y_gas_test, y_cor_train, y_cor_test = train_test_split(X_sao, X_ca, X_gas, X_cor, y_sao, y_ca, y_gas, y_cor, test_size=0.3, random_state=42)

In [16]:
train_e_sao = X_sao_train['esai'].tolist()
test_e_sao = X_sao_test['esai'].tolist()

train_e_ca = X_ca_train['esai'].tolist()
test_e_ca = X_ca_test['esai'].tolist()

train_e_gas = X_gas_train['esai'].tolist()
test_e_gas = X_gas_test['esai'].tolist()

train_e_cor = X_cor_train['essay'].tolist()
test_e_cor = X_cor_test['essay'].tolist()

In [17]:
stop_words = set(stopwords.words('english'))
def sent2word(x):
    x=re.sub("[^A-Za-z]"," ",x)
    x.lower()
    filtered_sentence = []
    words=x.split()
    for w in words:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

def essay2word(essay):
    essay = essay.strip()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw = tokenizer.tokenize(essay)
    final_words=[]
    for i in raw:
        if(len(i)>0):
            final_words.append(sent2word(i))
    return final_words

In [18]:
# Processing 'esai' for sao task
train_sents_sao = []
test_sents_sao = []

for i in train_e_sao:
    train_sents_sao += essay2word(i)

for i in test_e_sao:
    test_sents_sao += essay2word(i)

# Processing 'esai' for ca task
train_sents_ca = []
test_sents_ca = []

for i in train_e_ca:
    train_sents_ca += essay2word(i)

for i in test_e_ca:
    test_sents_ca += essay2word(i)

# Processing 'esai' for gas task
train_sents_gas = []
test_sents_gas = []

for i in train_e_gas:
    train_sents_gas += essay2word(i)

for i in test_e_gas:
    test_sents_gas += essay2word(i)

# Processing 'essay' for cor task
train_sents_cor = []
test_sents_cor = []

for i in train_e_cor:
    train_sents_cor += essay2word(i)

for i in test_e_cor:
    test_sents_cor += essay2word(i)

In [19]:
def get_model():
    model = Sequential()
    model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=(1, 300), return_sequences=True))
    model.add(LSTM(64, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation='relu'))  # Change this to match the number of your outputs (4 in this case)
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model

In [22]:
# Training Word2Vec model
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

model = Word2Vec(
    train_sents_sao + train_sents_ca + train_sents_gas + train_sents_cor,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

model.init_sims(replace=True)
model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)

  model.init_sims(replace=True)


In [23]:
def makeVec(words, model, num_features):
    vec = np.zeros((num_features,), dtype="float32")
    noOfWords = 0.
    index_to_key_set = set(model.wv.index_to_key)
    for i in words:
        if i in index_to_key_set:
            noOfWords += 1
            vec = np.add(vec, model.wv[i])  # Use model.wv[i] instead of model[i]
    vec = np.divide(vec, noOfWords)
    return vec

def getVecs(essays, model, num_features):
    c = 0
    essay_vecs = np.zeros((len(essays), num_features), dtype="float32")
    for i in essays:
        essay_vecs[c] = makeVec(i, model, num_features)
        c += 1
    return essay_vecs

In [24]:
# For sao task
clean_train_sao = []
for i in train_e_sao:
    clean_train_sao.append(sent2word(i))
training_vectors_sao = getVecs(clean_train_sao, model, num_features)

clean_test_sao = []
for i in test_e_sao:
    clean_test_sao.append(sent2word(i))
testing_vectors_sao = getVecs(clean_test_sao, model, num_features)

# For ca task
clean_train_ca = []
for i in train_e_ca:
    clean_train_ca.append(sent2word(i))
training_vectors_ca = getVecs(clean_train_ca, model, num_features)

clean_test_ca = []
for i in test_e_ca:
    clean_test_ca.append(sent2word(i))
testing_vectors_ca = getVecs(clean_test_ca, model, num_features)

In [25]:
# For gas task
clean_train_gas = []
for i in train_e_gas:
    clean_train_gas.append(sent2word(i))
training_vectors_gas = getVecs(clean_train_gas, model, num_features)

clean_test_gas = []
for i in test_e_gas:
    clean_test_gas.append(sent2word(i))
testing_vectors_gas = getVecs(clean_test_gas, model, num_features)

# For cor task
clean_train_cor = []
for i in train_e_cor:
    clean_train_cor.append(sent2word(i))
training_vectors_cor = getVecs(clean_train_cor, model, num_features)

clean_test_cor = []
for i in test_e_cor:
    clean_test_cor.append(sent2word(i))
testing_vectors_cor = getVecs(clean_test_cor, model, num_features)

  vec = np.divide(vec, noOfWords)


In [27]:
# Reshaping vectors for sao task
training_vectors_sao = np.array(training_vectors_sao)
testing_vectors_sao = np.array(testing_vectors_sao)

training_vectors_sao = np.reshape(training_vectors_sao, (training_vectors_sao.shape[0], 1, training_vectors_sao.shape[1]))
testing_vectors_sao = np.reshape(testing_vectors_sao, (testing_vectors_sao.shape[0], 1, testing_vectors_sao.shape[1]))

In [28]:
# Reshaping vectors for ca task
training_vectors_ca = np.array(training_vectors_ca)
testing_vectors_ca = np.array(testing_vectors_ca)

training_vectors_ca = np.reshape(training_vectors_ca, (training_vectors_ca.shape[0], 1, training_vectors_ca.shape[1]))
testing_vectors_ca = np.reshape(testing_vectors_ca, (testing_vectors_ca.shape[0], 1, testing_vectors_ca.shape[1]))

In [29]:
training_vectors_gas = np.array(training_vectors_gas)
testing_vectors_gas = np.array(testing_vectors_gas)

training_vectors_gas = np.reshape(training_vectors_gas, (training_vectors_gas.shape[0], 1, training_vectors_gas.shape[1]))
testing_vectors_gas = np.reshape(testing_vectors_gas, (testing_vectors_gas.shape[0], 1, testing_vectors_gas.shape[1]))

In [30]:
# Reshaping vectors for cor task
training_vectors_cor = np.array(training_vectors_cor)
testing_vectors_cor = np.array(testing_vectors_cor)

training_vectors_cor = np.reshape(training_vectors_cor, (training_vectors_cor.shape[0], 1, training_vectors_cor.shape[1]))
testing_vectors_cor = np.reshape(testing_vectors_cor, (testing_vectors_cor.shape[0], 1, testing_vectors_cor.shape[1]))

In [31]:
from keras.layers import Input, Concatenate
from keras.models import Model

def get_multi_model():
    input_sao = Input(shape=(1, 300))
    input_ca = Input(shape=(1, 300))
    input_gas = Input(shape=(1, 300))
    input_cor = Input(shape=(1, 300))

    lstm_shared = LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)

    lstm_sao = lstm_shared(input_sao)
    lstm_ca = lstm_shared(input_ca)
    lstm_gas = lstm_shared(input_gas)
    lstm_cor = lstm_shared(input_cor)

    merged = Concatenate(axis=-1)([lstm_sao, lstm_ca, lstm_gas, lstm_cor])

    lstm_final = LSTM(64, recurrent_dropout=0.4)(merged)
    dropout = Dropout(0.5)(lstm_final)

    # Adjust the number of units in the Dense layers based on your specific requirements
    output_sao = Dense(1, activation='relu', name='output_sao')(dropout)
    output_ca = Dense(1, activation='relu', name='output_ca')(dropout)
    output_gas = Dense(1, activation='relu', name='output_gas')(dropout)
    output_cor = Dense(1, activation='relu', name='output_cor')(dropout)

    model = Model(inputs=[input_sao, input_ca, input_gas, input_cor], outputs=[output_sao, output_ca, output_gas, output_cor])
    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()
    return model


In [32]:
lstm_model = get_multi_model()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1, 300)]             0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1, 300)]             0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 1, 300)]             0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 1, 300)]             0         []                            
                                                                                              

In [None]:
lstm_model.fit([training_vectors_sao, training_vectors_ca, training_vectors_gas, training_vectors_cor],
                            [y_sao_train, y_ca_train, y_gas_train, y_cor_train],
                            batch_size=64, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [None]:
lstm_model.save('aes_model.h5')

In [35]:
# Assuming you have already loaded the trained model (multi_input_lstm_model)
# and the necessary preprocessing functions (sent2word, makeVec, etc.)

# Input for testing
user_input = input("Enter an essay to test the model: ")

# Preprocess the user input
user_input_words = sent2word(user_input)
user_input_vec = makeVec(user_input_words, model, num_features)

# Reshape the input vector to match the model input shape
user_input_vec = np.reshape(user_input_vec, (1, 1, num_features))

# Make predictions using the multi_input_lstm_model
predicted_scores = lstm_model.predict([user_input_vec, user_input_vec, user_input_vec, user_input_vec])

# Display the predicted scores
print("Predicted scores:")
print(f"Task 1 (y_sao): {predicted_scores[0][0]}")
print(f"Task 2 (y_ca): {predicted_scores[1][0]}")
print(f"Task 3 (y_gas): {predicted_scores[2][0]}")
print(f"Task 4 (y_cor): {predicted_scores[3][0]}")

Enter an essay to test the model: Surat kabar lokal yang terhormat, menurut saya pengaruh komputer terhadap manusia adalah keterampilan/pengaruh belajar yang hebat karena komputer memberi kita waktu untuk ngobrol dengan teman/orang baru, membantu kita belajar tentang dunia (astronomi) dan menjauhkan kita dari masalah! Pikirkan tentang! Bukankah begitu? Bagaimana perasaan Anda jika anak remaja Anda selalu menelepon teman-temannya! Pernahkah Anda ngobrol dengan teman atau mitra bisnis Anda tentang berbagai hal. Nah sekarang - ada cara baru untuk ngobrol di komputer, ada banyak situs di internet untuk melakukannya: @ ORGANIZATION1, @ ORGANIZATION2, @ CAPS1, facebook, myspace dll. Bayangkan saja saat Anda mengatur pertemuan dengan atasan Anda di komputer, anak remaja Anda sedang bersenang-senang di telepon dan tidak terburu-buru menutup telepon karena Anda ingin menggunakannya. Bagaimana Anda mengetahui tentang negara/negara bagian lain di luar negara Anda? Ya, saya menggunakan komputer/in