## NLP Lab 3

### The main purpose of this lab is to get familiar with NLP language models using the scikit-learn library.


Importing all the necessary libraries for part 1

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
import nltk

Downloading the required NLTK packages ("punkt", "stopwords", "wordnet") using `nltk.download`

In [2]:
# Fetch the NLTK resources used by the preprocessing step:
#   punkt / punkt_tab -> tokenizer data for word_tokenize
#   stopwords         -> English stop-word list
#   wordnet           -> dictionary behind WordNetLemmatizer
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are requested to keep the notebook runnable across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Loading our dataset from the given URL

In [3]:
# Read the answers CSV directly from the GitHub repository (needs network access).
# NOTE(review): the URL follows the `master` branch, so the data can change
# underneath this notebook; consider pinning a specific commit.
url = 'https://github.com/dbbrandt/short_answer_granding_capstone_project/raw/master/data/sag/answers.csv'
df = pd.read_csv(url)

In [4]:
# Display the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0
...,...,...,...,...
2437,12.1,log n,5.0,1.0
2438,12.1,minus 1 divided by 2,1.5,0.0
2439,12.1,2n-1,2.5,0.0
2440,12.1,"it takes at most h steps, where h is the heigh...",5.0,1.0


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2442, 4)

In [6]:
# Column labels of the loaded frame.
df.columns

Index(['id', 'answer', 'score', 'correct'], dtype='object')

In [7]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0


Moving on to text preprocessing

In [8]:
def preprocess_text(text):
    """Normalise one raw answer into a space-joined string of processed tokens.

    Pipeline: lowercase -> replace non-word characters and digits with spaces
    -> NLTK word tokenization -> drop English stopwords -> Porter-stem each
    token and pass the stem through the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw answer text. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty answer.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing cells arrive as None / NaN, which have no .lower(); NaN is the
    # only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase, then blank out special characters and digits in one pass
    text = re.sub(r'[\W\d]', ' ', str(text).lower())

    # Tokenization
    tokens = word_tokenize(text)

    # Build the stopword set once per call: evaluating
    # stopwords.words('english') inside the filter would repeat the lookup
    # for every token and test membership against a list.
    stop_words = set(stopwords.words('english'))

    # Stemming and Lemmatization
    # NOTE(review): lemmatizing an already-stemmed token is kept as in the
    # original pipeline; confirm that applying both is intended.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    processed = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in tokens
        if word not in stop_words
    ]

    return ' '.join(processed)

# Run every raw answer through preprocess_text and keep the result alongside
# the original text in a new column.
df['processed_text'] = df['answer'].map(preprocess_text)

In [9]:
# Preview the first five rows, now including the processed_text column.
df.head(5)

Unnamed: 0,id,answer,score,correct,processed_text
0,1.1,High risk problems are address in the prototyp...,3.5,0.0,high risk problem address prototyp program mak...
1,1.1,To simulate portions of the desired final prod...,5.0,1.0,simul portion desir final product quick easi p...
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0,prototyp program simul behavior portion desir ...
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0,defin specif phase prototyp stimul behavior po...
4,1.1,It is used to let the users have a first idea ...,3.0,0.0,use let user first idea complet program allow ...


Encoding Data Vectors

In [10]:
# NOTE(review): every encoder below is fitted on all rows, including the ones
# that later end up in the test split, so the reported test metrics may be
# optimistic. Fitting on the training rows only would remove that leakage.

# Bag of Words: raw term counts per answer
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['processed_text']).toarray()

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text']).toarray()

# Word2Vec
W2V_SEED = 42
tokenized_text = df['processed_text'].apply(word_tokenize)
# A fixed seed plus a single worker thread makes training repeatable between
# runs; with the default multi-threaded training the embeddings (and every
# metric derived from them) change on each execution.
word2vec_model = Word2Vec(
    tokenized_text, vector_size=100, window=5, min_count=1, sg=0,  # CBOW
    seed=W2V_SEED, workers=1,
)
word_vectors = word2vec_model.wv


def _mean_word_vector(words):
    """Average the embeddings of `words`; all-zero vector if none are in the vocabulary."""
    known = [word_vectors[word] for word in words if word in word_vectors]
    if not known:
        # Size the fallback from the model instead of repeating the literal 100.
        return np.zeros(word_vectors.vector_size)
    return np.mean(known, axis=0)


# One fixed-length vector per answer: the mean of its word embeddings
X_word2vec = np.array([_mean_word_vector(words) for words in tokenized_text])

Splitting the data 

In [11]:
# Split all three feature matrices and the target in ONE call so that every
# representation is guaranteed to share the same train/test rows, rather than
# relying on three separate calls happening to use the same random_state.
# This also avoids binding throwaway results to `_`, which IPython uses for
# the most recent cell output.
(X_train_bow, X_test_bow,
 X_train_tfidf, X_test_tfidf,
 X_train_w2v, X_test_w2v,
 y_train, y_test) = train_test_split(
    X_bow, X_tfidf, X_word2vec, df['score'], test_size=0.2, random_state=42
)

Training and evaluating the models

In [12]:
# Regressors to compare, keyed by the label used in the results report.
# NOTE(review): in the recorded results plain LinearRegression produces
# astronomically large errors on the BOW and TF-IDF features; a regularised
# linear model is probably a better fit for those inputs - confirm.
models = {
    'SVR': SVR(),
    'Linear Regression': LinearRegression(),
    # Fixed random_state: without it the tree's results vary between runs.
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}

def evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit `model` on the training split and score its test-split predictions.

    The estimator passed in is fitted in place.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)`` computed on ``y_test`` versus the predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    squared_error = mean_squared_error(y_test, y_pred)
    root_error = np.sqrt(squared_error)
    determination = r2_score(y_test, y_pred)

    return squared_error, root_error, determination

In [13]:
# Train/test matrices per vectorization, keyed by the label used in the report.
feature_sets = {
    'BOW': (X_train_bow, X_test_bow),
    'TFIDF': (X_train_tfidf, X_test_tfidf),
    'Word2Vec': (X_train_w2v, X_test_w2v),
}

# results[model label][vectorization label] -> {'MSE': ..., 'RMSE': ..., 'R2': ...}
# Looping over feature_sets replaces three copy-pasted evaluate_model calls,
# so a new vectorization only needs one new dictionary entry.
results = {}
for name, model in models.items():
    results[name] = {}
    for vector_type, (X_train, X_test) in feature_sets.items():
        mse, rmse, r2 = evaluate_model(X_train, X_test, y_train, y_test, model)
        results[name][vector_type] = {'MSE': mse, 'RMSE': rmse, 'R2': r2}


In [15]:
# Print an indented report: model -> vectorization -> metric, four decimals each.
for estimator_label, per_vectorizer in results.items():
    print(f"Results for {estimator_label}:")
    for vectorizer_label, metric_values in per_vectorizer.items():
        print(f"  {vectorizer_label}:")
        for metric_label in metric_values:
            print(f"    {metric_label}: {metric_values[metric_label]:.4f}")

Results for SVR:
  BOW:
    MSE: 1.0897
    RMSE: 1.0439
    R2: 0.1480
  TFIDF:
    MSE: 0.9442
    RMSE: 0.9717
    R2: 0.2618
  Word2Vec:
    MSE: 1.6770
    RMSE: 1.2950
    R2: -0.3111
Results for Linear Regression:
  BOW:
    MSE: 83975996787684403480363008.0000
    RMSE: 9163841813763.7227
    R2: -65655209789769237286354944.0000
  TFIDF:
    MSE: 1729860309208751639954980864.0000
    RMSE: 41591589404695.1719
    R2: -1352461963568523537392074752.0000
  Word2Vec:
    MSE: 1.1718
    RMSE: 1.0825
    R2: 0.0838
Results for Decision Tree:
  BOW:
    MSE: 1.5751
    RMSE: 1.2550
    R2: -0.2315
  TFIDF:
    MSE: 1.7036
    RMSE: 1.3052
    R2: -0.3319
  Word2Vec:
    MSE: 1.7653
    RMSE: 1.3286
    R2: -0.3802


Choosing the right Model

In [17]:
# Flatten the nested results into (model, vectorization, RMSE) candidates.
candidates = [
    (estimator_label, vectorizer_label, metric_values['RMSE'])
    for estimator_label, per_vectorizer in results.items()
    for vectorizer_label, metric_values in per_vectorizer.items()
]

# The lowest RMSE wins; the strict comparison keeps the earliest candidate on ties.
best_model, best_vector, best_score = None, None, float('inf')
for estimator_label, vectorizer_label, rmse in candidates:
    if rmse < best_score:
        best_model, best_vector, best_score = estimator_label, vectorizer_label, rmse

print(f"\nBest Model: {best_model} with {best_vector} vectorization (RMSE: {best_score:.4f})")


Best Model: SVR with TFIDF vectorization (RMSE: 0.9717)
