In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
url = "https://raw.githubusercontent.com/dbbrandt/short_answer_granding_capstone_project/master/data/sag/answers.csv"
data = pd.read_csv(url)

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize and Stem
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    stemmed = [stemmer.stem(word) for word in lemmatized]
    
    return ' '.join(stemmed)

data['processed_text'] = data['answer'].apply(preprocess)
data.head()


[nltk_data] Downloading package punkt to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,answer,score,correct,processed_text
0,1.1,High risk problems are address in the prototyp...,3.5,0.0,high risk problem address prototyp program mak...
1,1.1,To simulate portions of the desired final prod...,5.0,1.0,simul portion desir final product quick easi p...
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0,prototyp program simul behavior portion desir ...
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0,defin specif phase prototyp stimul behavior po...
4,1.1,It is used to let the users have a first idea ...,3.0,0.0,use let user first idea complet program allow ...


In [2]:
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenize the processed text
tokenized_text = data['processed_text'].apply(word_tokenize)

# CBOW Model
cbow_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4, sg=0)

# Skip-Gram Model
skipgram_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Get average Word2Vec vectors
def get_word2vec_vector(text, model, vector_size):
    words = text.split()
    vector = np.mean([model.wv[word] for word in words if word in model.wv], axis=0)
    if isinstance(vector, np.ndarray):
        return vector
    else:
        return np.zeros(vector_size)

vector_size = 100
data['cbow_vector'] = data['processed_text'].apply(lambda x: get_word2vec_vector(x, cbow_model, vector_size))
data['skipgram_vector'] = data['processed_text'].apply(lambda x: get_word2vec_vector(x, skipgram_model, vector_size))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words
count_vectorizer = CountVectorizer()
bow_vectors = count_vectorizer.fit_transform(data['processed_text']).toarray()

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(data['processed_text']).toarray()


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data['cbow_vector'].tolist(), data['score'], test_size=0.2, random_state=42)

# Convert lists to arrays
X_train = np.array(X_train)
X_test = np.array(X_test)

# Initialize models
svr = SVR()
lr = LinearRegression()
dt = DecisionTreeRegressor()

# Train models
svr.fit(X_train, y_train)
lr.fit(X_train, y_train)
dt.fit(X_train, y_train)

# Predictions
y_pred_svr = svr.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_dt = dt.predict(X_test)

# Calculate metrics
def evaluate(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    return mse, rmse, mae

mse_svr, rmse_svr, mae_svr = evaluate(y_test, y_pred_svr)
mse_lr, rmse_lr, mae_lr = evaluate(y_test, y_pred_lr)
mse_dt, rmse_dt, mae_dt = evaluate(y_test, y_pred_dt)

print(f'SVR - MSE: {mse_svr}, RMSE: {rmse_svr}, MAE: {mae_svr}')
print(f'Linear Regression - MSE: {mse_lr}, RMSE: {rmse_lr}, MAE: {mae_lr}')
print(f'Decision Tree - MSE: {mse_dt}, RMSE: {rmse_dt}, MAE: {mae_dt}')


SVR - MSE: 1.7030858972420004, RMSE: 1.3050233320680518, MAE: 0.8798653207308738
Linear Regression - MSE: 1.1781586155580486, RMSE: 1.085430152316605, MAE: 0.8412702334847901
Decision Tree - MSE: 1.9973632002306012, RMSE: 1.4132810054021816, MAE: 1.0026801759930595


In [6]:
best_model = min(
    [('SVR', rmse_svr), ('Linear Regression', rmse_lr), ('Decision Tree', rmse_dt)],
    key=lambda x: x[1]
)
print(f'Best model is {best_model[0]} with RMSE: {best_model[1]}')


Best model is Linear Regression with RMSE: 1.085430152316605
