# Part1: Language Modeling / Regression

In [1]:
import pandas as pd
import numpy as np 
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/ayoubbakkali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ayoubbakkali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ayoubbakkali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Load Dataset

In [2]:
dataset = pd.read_csv("https://raw.githubusercontent.com/dbbrandt/short_answer_granding_capstone_project/master/data/sag/answers.csv")
dataset = dataset.drop(["correct", "id"], axis=1)
dataset.head()

Unnamed: 0,answer,score
0,High risk problems are address in the prototyp...,3.5
1,To simulate portions of the desired final prod...,5.0
2,A prototype program simulates the behaviors of...,4.0
3,Defined in the Specification phase a prototype...,5.0
4,It is used to let the users have a first idea ...,3.0


## 2. NLP pipeline

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [4]:
dataset["tokens"] = dataset["answer"].apply(word_tokenize)
dataset["tokens"].head()

0    [High, risk, problems, are, address, in, the, ...
1    [To, simulate, portions, of, the, desired, fin...
2    [A, prototype, program, simulates, the, behavi...
3    [Defined, in, the, Specification, phase, a, pr...
4    [It, is, used, to, let, the, users, have, a, f...
Name: tokens, dtype: object

In [5]:
dataset["tokens"] = dataset["tokens"].apply(lambda x: [word for word in x if word.lower() not in stop_words])
dataset["tokens"]

0       [High, risk, problems, address, prototype, pro...
1       [simulate, portions, desired, final, product, ...
2       [prototype, program, simulates, behaviors, por...
3       [Defined, Specification, phase, prototype, sti...
4       [used, let, users, first, idea, completed, pro...
                              ...                        
2437                                             [log, n]
2438                               [minus, 1, divided, 2]
2439                                               [2n-1]
2440             [takes, h, steps, ,, h, height, tree, .]
2441    [depends, install, search, tree, whatever, cas...
Name: tokens, Length: 2442, dtype: object

In [6]:
lemmatizer = WordNetLemmatizer()

dataset["lemmatization"] = dataset["tokens"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x] )
#dataset["lemmatization"] = dataset["tokens"].apply(lambda x: [word for word in x if word is not "" or word is not " "])
dataset["lemmatization"]

0       [High, risk, problem, address, prototype, prog...
1       [simulate, portion, desired, final, product, q...
2       [prototype, program, simulates, behavior, port...
3       [Defined, Specification, phase, prototype, sti...
4       [used, let, user, first, idea, completed, prog...
                              ...                        
2437                                             [log, n]
2438                               [minus, 1, divided, 2]
2439                                               [2n-1]
2440               [take, h, step, ,, h, height, tree, .]
2441    [depends, install, search, tree, whatever, cas...
Name: lemmatization, Length: 2442, dtype: object

### 3. Encoding data

#### 3.1 Word2Vec (CBOW and Skip-gram)

In [7]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Train Word2Vec models (CBOW and Skip-gram)
cbow_model = Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, sg=0)
skipgram_model = Word2Vec(sentences=dataset['tokens'], vector_size=100, window=5, min_count=1, sg=1)

In [8]:
def get_sentence_embedding(sentence, model):
    # Get vectors for words in the sentence, ignore words not in the model's vocabulary
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not word_vectors:  # If no words in the sentence are in the vocabulary, return a zero vector
        return np.zeros(model.vector_size)
    # Compute the mean of the word vectors
    return np.mean(word_vectors, axis=0)


cbow_vectors = np.array([get_sentence_embedding(sentence, cbow_model) for sentence in dataset['tokens']])
skipgram_vectors = np.array([get_sentence_embedding(sentence, skipgram_model) for sentence in dataset['tokens']])


print(len(cbow_vectors))
print(len(skipgram_vectors))

2442
2442


#### 3.2 BagOfWords

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data
bow_matrix = vectorizer.fit_transform(dataset['answer'])
bow_matrix.shape

(2442, 2620)

#### 3.3 TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['answer'])

tfidf_matrix

<2442x2620 sparse matrix of type '<class 'numpy.float64'>'
	with 34175 stored elements in Compressed Sparse Row format>

### 4. Model Training (Word2vect embedding)

In [16]:
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [17]:
X, y = np.array(cbow_vectors),np.array(dataset["score"])
print(X)
print(y)
print(y.shape)
print(X.shape)

[[-0.07440589  0.08031797 -0.03426092 ... -0.38296026 -0.00309836
   0.07928286]
 [-0.06286704  0.06985551 -0.02598914 ... -0.33443388 -0.00218877
   0.06549644]
 [-0.06556115  0.07073511 -0.02641777 ... -0.33607349 -0.00262122
   0.06618875]
 ...
 [-0.00713767 -0.00135911  0.00883542 ...  0.00480326 -0.00664322
  -0.00177323]
 [-0.09801418  0.11211058 -0.04483525 ... -0.49658155 -0.00109324
   0.10569453]
 [-0.07336561  0.08024236 -0.03486623 ... -0.3745698   0.00202734
   0.07118678]]
[3.5 5.  4.  ... 2.5 5.  1.5]
(2442,)
(2442, 100)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y , test_size=0.2 , random_state=42)

In [19]:
from sklearn.metrics import accuracy_score

# SVR Model
svr_model = SVR()
svr_model.fit(X_train, y_train)
svr_predictions = svr_model.predict(X_test)
svr_mse = mean_squared_error(y_test, svr_predictions)
svr_rmse = np.sqrt(svr_mse)
print("SVR MSE:", svr_mse)
print("SVR RMSE:", svr_rmse)

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_rmse = np.sqrt(lr_mse)
print("Linear Regression MSE:", lr_mse)
print("Linear Regression RMSE:", lr_rmse)

# Decision Tree Regressor model
dt_regressor_model = DecisionTreeRegressor()
dt_regressor_model.fit(X_train, y_train)
dt_regressor_predictions = dt_regressor_model.predict(X_test)
dt_regressor_mse = mean_squared_error(y_test, dt_regressor_predictions)
dt_regressor_rmse = np.sqrt(dt_regressor_mse)
print("Decision Tree Regressor MSE:", dt_regressor_mse)
print("Decision Tree Regressor RMSE:", dt_regressor_rmse)

# Calculate accuracy for each model
# Note: Accuracy is not a typical metric for regression tasks
svr_accuracy = svr_model.score(X_test, y_test)
lr_accuracy = lr_model.score(X_test, y_test)
dt_regressor_accuracy = dt_regressor_model.score(X_test, y_test)

SVR MSE: 1.0104627475833199
SVR RMSE: 1.005217761275297
Linear Regression MSE: 105.78760965898223
Linear Regression RMSE: 10.285310382238459
Decision Tree Regressor MSE: 1.8832834867075663
Decision Tree Regressor RMSE: 1.3723277621281171


### 5. Conclusion

After evaluating the performances of Support Vector Regressor (SVR), Linear Regression, and Decision Tree models, several important conclusions can be drawn. Firstly, in terms of average prediction accuracy, SVR exhibits the lowest Mean Squared Error (MSE) among the three models, closely followed by the Decision Tree. This suggests that both SVR and Decision Tree models better fit the data compared to Linear Regression, which has a significantly higher MSE. When examining the dispersion of prediction errors, measured by Root Mean Squared Error (RMSE), SVR and Decision Tree show relatively similar values, indicating similar dispersion of errors around the mean value. In contrast, Linear Regression demonstrates a notably higher RMSE, implying greater variability in prediction errors. Consequently, the choice of model depends on the specific project objectives, performance requirements, and data characteristics. In this case, although SVR and Decision Tree models exhibit relatively similar performances, the final model selection should be based on comprehensive evaluation considering various aspects including accuracy, complexity, and interpretability.