<h3>Cleaning and Preprocessing</h3>

In [1]:
#import
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import regex as re

# Fetch the NLTK resources this notebook relies on: the tokenizer models,
# the stop-word lists, and the WordNet data used by the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
# Kept as the last statement: its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the raw answers (columns: id, answer, score, correct) and preview
# the first five rows as plain text.
answers_df = pd.read_csv(filepath_or_buffer='answers.csv')
print(answers_df.head(n=5))

    id                                             answer  score  correct
0  1.1  High risk problems are address in the prototyp...    3.5      0.0
1  1.1  To simulate portions of the desired final prod...    5.0      1.0
2  1.1  A prototype program simulates the behaviors of...    4.0      1.0
3  1.1  Defined in the Specification phase a prototype...    5.0      1.0
4  1.1  It is used to let the users have a first idea ...    3.0      0.0


In [3]:
#Cleaning the text
def basic_cleaning(text):
    """Normalise one raw answer for downstream tokenisation.

    Steps: lowercase, drop every character that is not a word character,
    whitespace, or one of ``- + * / ! ( )``, then collapse whitespace runs
    into single spaces and trim both ends.

    Args:
        text: The raw answer. ``None`` and float NaN are treated as an empty
            answer; any other non-string value is converted with ``str()``
            before cleaning.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # Missing values have no .lower(); NaN is the only float that is not
    # equal to itself, so this check needs no extra dependency.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Lowercasing
    text = re.sub(r'[^\w\s\-\+\*/!()]', '', text)  # Keep word chars, whitespace and - + * / ! ( )
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs, trim the ends
    return text

# Fill missing answers *before* cleaning: basic_cleaning is applied to every
# value, so filling afterwards would be too late to help. The result is
# assigned back to the frame rather than filled in place on a selected column,
# so the update cannot be lost on an intermediate object.
answers_df['cleaned_text'] = answers_df['answer'].fillna('').apply(basic_cleaning)
# errors='ignore' keeps this cell re-runnable once 'id' has already been removed.
answers_df = answers_df.drop(columns='id', errors='ignore')
answers_df.head()

Unnamed: 0,answer,score,correct,cleaned_text
0,High risk problems are address in the prototyp...,3.5,0.0,high risk problems are address in the prototyp...
1,To simulate portions of the desired final prod...,5.0,1.0,to simulate portions of the desired final prod...
2,A prototype program simulates the behaviors of...,4.0,1.0,a prototype program simulates the behaviors of...
3,Defined in the Specification phase a prototype...,5.0,1.0,defined in the specification phase a prototype...
4,It is used to let the users have a first idea ...,3.0,0.0,it is used to let the users have a first idea ...


In [4]:
def tokenize(text):
    """Split ``text`` into a list of tokens by delegating to ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Lazily-built cache of the English stop-word set, so the corpus is read once
# rather than once per call (this function is applied to every row).
_ENGLISH_STOP_WORDS = None


def remove_stopwords(tokens):
    """Return ``tokens`` without English stop words, preserving order.

    Matching is exact (case-sensitive) against the NLTK English stop-word
    list, so tokens are expected to be lowercased already.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in _ENGLISH_STOP_WORDS]

def stem_tokens(tokens):
    """Return the Porter stem of every token, in the original order."""
    stem = PorterStemmer().stem
    stems = []
    for token in tokens:
        stems.append(stem(token))
    return stems

def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, in the original order.

    Each token is lemmatized with the lemmatizer's default settings.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

def preprocess_text(df, text_column):
    """Run the full cleaning pipeline on one text column of ``df``.

    Adds (or overwrites) four columns:

    * ``cleaned_text``      - output of ``basic_cleaning``
    * ``tokens``            - list of tokens with English stop words removed
    * ``stemmed_tokens``    - space-joined Porter stems of ``tokens``
    * ``lemmatized_tokens`` - space-joined WordNet lemmas of ``tokens``

    Args:
        df: DataFrame holding the raw text. It is modified in place.
        text_column: Name of the column containing the raw text.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    # Missing answers are treated as empty text so that cleaning never
    # receives a NaN float instead of a string.
    df['cleaned_text'] = df[text_column].fillna('').apply(basic_cleaning)
    df['tokens'] = df['cleaned_text'].apply(tokenize).apply(remove_stopwords)
    df['stemmed_tokens'] = df['tokens'].apply(lambda tokens: ' '.join(stem_tokens(tokens)))
    df['lemmatized_tokens'] = df['tokens'].apply(lambda tokens: ' '.join(lemmatize_tokens(tokens)))
    return df

# preprocess_text adds its columns to answers_df and returns that same frame.
preprocessed_df = preprocess_text(answers_df, 'answer')
# Assign the filled column back instead of calling fillna(inplace=True) on a
# selected column: the in-place form is chained assignment and is not
# guaranteed to write through to the frame.
preprocessed_df['lemmatized_tokens'] = preprocessed_df['lemmatized_tokens'].fillna('')
preprocessed_df.head()

Unnamed: 0,answer,score,correct,cleaned_text,tokens,stemmed_tokens,lemmatized_tokens
0,High risk problems are address in the prototyp...,3.5,0.0,high risk problems are address in the prototyp...,"[high, risk, problems, address, prototype, pro...",high risk problem address prototyp program mak...,high risk problem address prototype program ma...
1,To simulate portions of the desired final prod...,5.0,1.0,to simulate portions of the desired final prod...,"[simulate, portions, desired, final, product, ...",simul portion desir final product quick easi p...,simulate portion desired final product quick e...
2,A prototype program simulates the behaviors of...,4.0,1.0,a prototype program simulates the behaviors of...,"[prototype, program, simulates, behaviors, por...",prototyp program simul behavior portion desir ...,prototype program simulates behavior portion d...
3,Defined in the Specification phase a prototype...,5.0,1.0,defined in the specification phase a prototype...,"[defined, specification, phase, prototype, sti...",defin specif phase prototyp stimul behavior po...,defined specification phase prototype stimulat...
4,It is used to let the users have a first idea ...,3.0,0.0,it is used to let the users have a first idea ...,"[used, let, users, first, idea, completed, pro...",use let user first idea complet program allow ...,used let user first idea completed program all...


In [5]:
# Persist the preprocessed frame for the encoding stage; the row index is not
# written to the file.
preprocessed_df.to_csv(path_or_buf='preprocessed_answers.csv', index=False)

<h3>Encoding</h3>

In [6]:
#importing the libraries for encoding
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np


In [7]:
# Reload the preprocessed answers written by the cleaning stage.
# Note: after the CSV round trip the `tokens` column holds strings, not lists.
preprocessed_df = pd.read_csv(filepath_or_buffer='preprocessed_answers.csv')

preprocessed_df.head(n=5)

Unnamed: 0,answer,score,correct,cleaned_text,tokens,stemmed_tokens,lemmatized_tokens
0,High risk problems are address in the prototyp...,3.5,0.0,high risk problems are address in the prototyp...,"['high', 'risk', 'problems', 'address', 'proto...",high risk problem address prototyp program mak...,high risk problem address prototype program ma...
1,To simulate portions of the desired final prod...,5.0,1.0,to simulate portions of the desired final prod...,"['simulate', 'portions', 'desired', 'final', '...",simul portion desir final product quick easi p...,simulate portion desired final product quick e...
2,A prototype program simulates the behaviors of...,4.0,1.0,a prototype program simulates the behaviors of...,"['prototype', 'program', 'simulates', 'behavio...",prototyp program simul behavior portion desir ...,prototype program simulates behavior portion d...
3,Defined in the Specification phase a prototype...,5.0,1.0,defined in the specification phase a prototype...,"['defined', 'specification', 'phase', 'prototy...",defin specif phase prototyp stimul behavior po...,defined specification phase prototype stimulat...
4,It is used to let the users have a first idea ...,3.0,0.0,it is used to let the users have a first idea ...,"['used', 'let', 'users', 'first', 'idea', 'com...",use let user first idea complet program allow ...,used let user first idea completed program all...


In [8]:
#Bag of words
def encode_bow(corpus):
    """Encode ``corpus`` as bag-of-words count vectors.

    Args:
        corpus: Iterable of document strings.

    Returns:
        ``(vectors, vectorizer)``: the document-term matrix produced by
        ``fit_transform`` and the fitted ``CountVectorizer``.
    """
    count_vectorizer = CountVectorizer()
    document_term_matrix = count_vectorizer.fit_transform(corpus)
    return document_term_matrix, count_vectorizer
#TF-IDF
def encode_tfidf(corpus):
    """Encode ``corpus`` as TF-IDF weighted vectors.

    Args:
        corpus: Iterable of document strings.

    Returns:
        ``(vectors, vectorizer)``: the matrix produced by ``fit_transform``
        and the fitted ``TfidfVectorizer``.
    """
    tfidf_vectorizer = TfidfVectorizer()
    weighted_matrix = tfidf_vectorizer.fit_transform(corpus)
    return weighted_matrix, tfidf_vectorizer
# Word2Vec (CBOW and Skip-Gram)
def train_word2vec(corpus, vector_size=100, window=5, min_count=1, sg=0):
    """Train a Word2Vec model on whitespace-tokenised documents.

    Args:
        corpus: Iterable of document strings; each is split on whitespace.
        vector_size: Passed through to ``Word2Vec`` as ``vector_size``.
        window: Passed through to ``Word2Vec`` as ``window``.
        min_count: Passed through to ``Word2Vec`` as ``min_count``.
        sg: Passed through to ``Word2Vec`` as ``sg`` (the callers in this
            notebook use 0 for CBOW and 1 for Skip-Gram).

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentences = []
    for document in corpus:
        sentences.append(document.split())
    return Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
    )

def encode_word2vec(model, tokenized_corpus):
    """Encode each document as the sum of its tokens' word vectors.

    Tokens missing from ``model.wv`` are skipped. A document with no known
    tokens (including an empty document) is encoded as the zero vector. The
    vectors are summed, not averaged, so longer documents produce vectors of
    larger magnitude.

    Args:
        model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``token in wv`` and ``wv[token]``.
        tokenized_corpus: Iterable of token lists, one per document.

    Returns:
        A float64 array of shape ``(n_documents, model.vector_size)``. The
        result is 2-D even for an empty corpus, so it can always be stacked
        column-wise with other feature matrices.
    """
    dim = model.vector_size
    word_vectors = model.wv
    document_vectors = []
    for tokens in tokenized_corpus:
        # Start from float64 zeros so the accumulated dtype does not depend
        # on the dtype of the stored word vectors.
        total = np.zeros(dim)
        for token in tokens:
            if token in word_vectors:
                total = total + word_vectors[token]
        document_vectors.append(total)
    if not document_vectors:
        # np.array([]) would be 1-D with shape (0,); keep the column count.
        return np.zeros((0, dim))
    return np.array(document_vectors)

# Make sure every document is a string before vectorising. The filled column
# is assigned back instead of calling fillna(inplace=True) on a selected
# column: the in-place form is chained assignment and is not guaranteed to
# write through to the frame.
preprocessed_df['lemmatized_tokens'] = preprocessed_df['lemmatized_tokens'].fillna('')

# Encode using Bag of Words
bow_vectors, bow_vectorizer = encode_bow(preprocessed_df['lemmatized_tokens'])

# Encode using TF-IDF
tfidf_vectors, tfidf_vectorizer = encode_tfidf(preprocessed_df['lemmatized_tokens'])

# Train Word2Vec models (CBOW and Skip-Gram)
cbow_model = train_word2vec(preprocessed_df['lemmatized_tokens'], sg=0)
skipgram_model = train_word2vec(preprocessed_df['lemmatized_tokens'], sg=1)

# Tokenized corpus for Word2Vec encoding (same whitespace split used for training)
tokenized_corpus = [doc.split() for doc in preprocessed_df['lemmatized_tokens']]

# Encode using Word2Vec (CBOW)
cbow_vectors = encode_word2vec(cbow_model, tokenized_corpus)

# Encode using Word2Vec (Skip-Gram)
skipgram_vectors = encode_word2vec(skipgram_model, tokenized_corpus)

# Display the shapes of the encoded vectors to confirm
print("BoW Vectors Shape:", bow_vectors.shape)
print("TF-IDF Vectors Shape:", tfidf_vectors.shape)
print("Word2Vec CBOW Vectors Shape:", cbow_vectors.shape)
print("Word2Vec Skip-Gram Vectors Shape:", skipgram_vectors.shape)

BoW Vectors Shape: (2442, 2310)
TF-IDF Vectors Shape: (2442, 2310)
Word2Vec CBOW Vectors Shape: (2442, 100)
Word2Vec Skip-Gram Vectors Shape: (2442, 100)


<h3>Model Training</h3>

In [19]:
#importing libraries
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [17]:
def train_svr(X_train, y_train, kernel='rbf', C=1.0, epsilon=0.1):
    """Fit a support vector regressor and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        kernel, C, epsilon: Forwarded unchanged to ``SVR``.

    Returns:
        The fitted ``SVR`` instance.
    """
    hyperparameters = {'kernel': kernel, 'C': C, 'epsilon': epsilon}
    regressor = SVR(**hyperparameters)
    regressor.fit(X_train, y_train)
    return regressor
def train_naive_bayes(X_train, y_train):
    """Fit a Gaussian naive Bayes model and return it.

    NOTE(review): the other trainers in this cell are regressors, while
    GaussianNB is a classifier; this helper is not called in the visible
    cells. Presumably it is meant for a discrete label such as `correct`
    rather than the continuous `score` - confirm before using it.
    """
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return classifier
def train_linear_regression(X_train, y_train):
    """Fit a ``LinearRegression`` model with default settings and return it."""
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return regressor
def train_decision_tree(X_train, y_train, criterion='squared_error', max_depth=None, min_samples_split=2):
    """Fit a decision tree regressor and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        criterion, max_depth, min_samples_split: Forwarded unchanged to
            ``DecisionTreeRegressor``.

    Returns:
        The fitted ``DecisionTreeRegressor`` instance.
    """
    tree = DecisionTreeRegressor(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    tree.fit(X_train, y_train)
    return tree


In [11]:
# Regression target: the numeric score given to each answer.
target_variable = preprocessed_df['score']

# Feature matrix: the CBOW document vectors plus the `correct` flag as one
# extra trailing column.
# NOTE(review): in the previewed rows every score >= 4 has correct == 1 and
# every lower score has correct == 0. If `correct` is derived from `score`,
# using it as a feature leaks the target - confirm before trusting the metrics.
correct_column = preprocessed_df[['correct']].to_numpy()
features = np.concatenate([cbow_vectors, correct_column], axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_variable,
    test_size=0.2,
    random_state=42,
)
X_train

array([[-1.83650665,  5.9195097 ,  2.60704492, ...,  0.59542129,
        -1.31302195,  1.        ],
       [-0.74107024,  2.55380285,  1.22403071, ...,  0.26573166,
        -0.56076994,  1.        ],
       [-0.57380119,  1.8953589 ,  0.87129129, ...,  0.19488338,
        -0.41083858,  0.        ],
       ...,
       [-0.59285159,  1.86891423,  0.90108926, ...,  0.25584336,
        -0.39459679,  1.        ],
       [-0.47205477,  1.52280687,  0.72274994, ...,  0.21840364,
        -0.34827985,  1.        ],
       [-1.42292534,  4.62208139,  2.11440265, ...,  0.51868474,
        -1.04890432,  1.        ]])

In [12]:
# Fit the three regressors on the same training split, each with its
# helper's default hyperparameters.
svr_model = train_svr(X_train=X_train, y_train=y_train)            # support vector regression
lr_model = train_linear_regression(X_train=X_train, y_train=y_train)  # linear regression
dt_model = train_decision_tree(X_train=X_train, y_train=y_train)      # decision tree

In [20]:
def evaluate_regression_model(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        ``(mse, rmse)``: the mean squared error of the predictions and its
        square root.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return squared_error, np.sqrt(squared_error)

In [24]:
# Evaluate every fitted model on the held-out split. The per-model variables
# are kept so later cells can still refer to them.
svr_mse, svr_rmse = evaluate_regression_model(svr_model, X_test, y_test)
lr_mse, lr_rmse = evaluate_regression_model(lr_model, X_test, y_test)
dt_mse, dt_rmse = evaluate_regression_model(dt_model, X_test, y_test)

# One reporting loop instead of three copy-pasted print blocks; the printed
# text is unchanged.
for label, mse, rmse in (
    ("SVR", svr_mse, svr_rmse),
    ("LR", lr_mse, lr_rmse),
    ("DT", dt_mse, dt_rmse),
):
    print(f"Mean Squared Error (MSE) for {label} model:", mse)
    print(f"Root Mean Squared Error (RMSE) for {label} model:", rmse)
    print('*' * 20)


Mean Squared Error (MSE) for SVR model: 0.7490483015396516
Root Mean Squared Error (RMSE) for SVR model: 0.8654757660036771
********************
Mean Squared Error (MSE) for LR model: 0.2963398185047501
Root Mean Squared Error (RMSE) for LR model: 0.544371030185066
********************
Mean Squared Error (MSE) for DT model: 0.46979907453779063
Root Mean Squared Error (RMSE) for DT model: 0.6854189044210778
********************
