## 0- Importing the necessary libraries:

In [89]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.svm import SVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.preprocessing import LabelEncoder

In [16]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Part 1: Language Modeling / Regression

In [17]:
data = pd.read_csv("answers.csv")

In [18]:
data

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0
...,...,...,...,...
2437,12.1,log n,5.0,1.0
2438,12.1,minus 1 divided by 2,1.5,0.0
2439,12.1,2n-1,2.5,0.0
2440,12.1,"it takes at most h steps, where h is the heigh...",5.0,1.0


### 1. Establish a preprocessing NLP pipeline (Tokenization stemming lemmatization, Stop words,Discretization, etc) off the collected Dataset.

In [19]:
# Preprocessing functions
def preprocess(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Stop words removal
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing to the dataset
data['processed_answer'] = data['answer'].apply(preprocess)

In [20]:
data

Unnamed: 0,id,answer,score,correct,processed_answer
0,1.1,High risk problems are address in the prototyp...,3.5,0.0,high risk problem address prototyp program mak...
1,1.1,To simulate portions of the desired final prod...,5.0,1.0,simul portion desir final product quick easi p...
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0,prototyp program simul behavior portion desir ...
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0,defin specif phase prototyp stimul behavior po...
4,1.1,It is used to let the users have a first idea ...,3.0,0.0,use let user first idea complet program allow ...
...,...,...,...,...,...
2437,12.1,log n,5.0,1.0,log n
2438,12.1,minus 1 divided by 2,1.5,0.0,minu 1 divid 2
2439,12.1,2n-1,2.5,0.0,2n-1
2440,12.1,"it takes at most h steps, where h is the heigh...",5.0,1.0,"take h step , h height tree ."


### 2. Encode your Data vectors By using Word2vec (CBOW, Skip Gram), Bag Of words, TF-IDF.

In [21]:
# Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(data['processed_answer'])

In [22]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['processed_answer'])

In [23]:
# Word2Vec
tokenized_sentences = [sentence.split() for sentence in data['processed_answer']]
w2v_model_cbow = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0)
w2v_model_sg = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=1)

In [24]:
def get_w2v_embeddings(model, sentences):
    return np.array([np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(model.vector_size)], axis=0) for sentence in sentences])

X_w2v_cbow = get_w2v_embeddings(w2v_model_cbow, tokenized_sentences)
X_w2v_sg = get_w2v_embeddings(w2v_model_sg, tokenized_sentences)

In [25]:
X_w2v_cbow

array([[-1.19785336e-03,  2.73047864e-01, -2.39131935e-02, ...,
        -4.36109185e-01,  3.35062854e-02,  2.16538310e-02],
       [-9.06412824e-05,  2.26658508e-01, -1.75846610e-02, ...,
        -3.64665151e-01,  2.92883366e-02,  1.58888213e-02],
       [ 1.65670167e-03,  2.58940816e-01, -1.89552307e-02, ...,
        -4.11310345e-01,  3.01752444e-02,  2.26086620e-02],
       ...,
       [ 9.49640945e-03, -4.15351847e-03,  9.47525259e-03, ...,
        -3.25483805e-03, -4.43130499e-03, -9.75177530e-03],
       [ 2.63799424e-03,  3.05080116e-01, -2.35623121e-02, ...,
        -4.87523764e-01,  3.33693251e-02,  2.36712657e-02],
       [ 4.71543474e-03,  2.43698835e-01, -2.06081383e-02, ...,
        -3.92173409e-01,  2.60883942e-02,  1.86806507e-02]])

In [26]:
X_w2v_sg

array([[ 0.0663745 ,  0.07068023,  0.0333822 , ..., -0.26407805,
         0.03448671,  0.06719125],
       [ 0.08701642,  0.05700462,  0.04260639, ..., -0.3011708 ,
         0.04454111,  0.0663107 ],
       [ 0.08238015,  0.07518212,  0.04479032, ..., -0.29147175,
         0.02598211,  0.08104239],
       ...,
       [ 0.00949641, -0.00415352,  0.00947525, ..., -0.00325484,
        -0.0044313 , -0.00975178],
       [ 0.14804503,  0.00846315,  0.0711785 , ..., -0.28758466,
         0.01871211,  0.10465664],
       [ 0.14307608,  0.00509189,  0.06607483, ..., -0.29411992,
         0.01885466,  0.08715545]])

### 3. Train your models by using SVR, Naive Bayes, Linear Regression , Decision Tree Algorithms (The embedding will be done by Word2Vec).

In [29]:
# Define target variable
y = data['score']

# Split the data
X_train_w2v_cbow, X_test_w2v_cbow, y_train, y_test = train_test_split(X_w2v_cbow, y, test_size=0.2, random_state=42)
X_train_w2v_sg, X_test_w2v_sg, y_train, y_test = train_test_split(X_w2v_sg, y, test_size=0.2, random_state=42)


In [33]:
# Initialize models
models = {
    'SVR': SVR(),
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor()
}

In [34]:
# Function to train and evaluate models
def train_evaluate(models, X_train, X_test, y_train, y_test):
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        results[name] = mse
    return results

In [35]:
# Evaluate models
results_cbow = train_evaluate(models, X_train_w2v_cbow, X_test_w2v_cbow, y_train, y_test)
results_sg = train_evaluate(models, X_train_w2v_sg, X_test_w2v_sg, y_train, y_test)

In [36]:
# Print results
print("Results using CBOW Word2Vec:")
print(results_cbow)
print("\nResults using Skip Gram Word2Vec:")
print(results_sg)

Results using CBOW Word2Vec:
{'SVR': 1.700611231571335, 'Linear Regression': 1.1131646775439066, 'Decision Tree': 2.0311618098159507}

Results using Skip Gram Word2Vec:
{'SVR': 1.4915537351871, 'Linear Regression': 1.0848127191439936, 'Decision Tree': 1.656768916155419}


### 4. Evaluate the four languages models by using standards metrics (MSE , RMSE, etc), choose the best model then argument your choice.

In [37]:
# Function to evaluate the best model
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    return mse, rmse

In [38]:
# best model using CBOW
best_model_name_cbow = min(results_cbow, key=results_cbow.get)
best_model_cbow = models[best_model_name_cbow]
mse_cbow, rmse_cbow = evaluate_model(best_model_cbow, X_test_w2v_cbow, y_test)
print(f"Best Model using CBOW: {best_model_name_cbow}")
print(f"MSE: {mse_cbow}, RMSE: {rmse_cbow}")

Best Model using CBOW: Linear Regression
MSE: 222.27310097328848, RMSE: 14.90882627752059


In [39]:
#  best model using Skip Gram
best_model_name_sg = min(results_sg, key=results_sg.get)
best_model_sg = models[best_model_name_sg]
mse_sg, rmse_sg = evaluate_model(best_model_sg, X_test_w2v_sg, y_test)
print(f"Best Model using Skip Gram: {best_model_name_sg}")
print(f"MSE: {mse_sg}, RMSE: {rmse_sg}")

Best Model using Skip Gram: Linear Regression
MSE: 1.0848127191439936, RMSE: 1.0415434312327037


Linear Regression demonstrated the lowest MSE and RMSE values across both CBOW and Skip Gram embeddings. Lower MSE and RMSE values indicate that the predictions made by Linear Regression are closer to the actual scores, signifying higher accuracy.

### 5. Interpret the Obtained Results.

Linear Regression stands out as the best model for this task due to its superior performance metrics, simplicity, and interpretability. The low MSE and RMSE values indicate accurate and reliable predictions, making it an excellent choice for automating the scoring of short answer questions. The insights gained from the model can also aid in enhancing the educational feedback process, providing valuable information for both educators and students.

## Part 2: Language Modeling / Classification

In [47]:
# Load the dataset
data2 = pd.read_csv("twitter_training.csv")


In [48]:
data2

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [55]:
# Provide column names manually (replace with actual column names)
columns = ["Tweet ID", "Entity", "Sentiment", "Tweet_content"]
data2.columns = columns

In [56]:
data2

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet_content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


### 1. Establish a preprocessing NLP pipeline (Tokenization stemming lemmatization, Stop words,Discretization, etc) off the collected Dataset

In [65]:
def preprocess(text):
    if isinstance(text, str):
        # Tokenization
        tokens = word_tokenize(text.lower())
        
        # Stop words removal
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        
        # Stemming and Lemmatization
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
        
        return ' '.join(tokens)
    else:
        return ''

# Apply preprocessing
data2['processed_content'] = data2['Tweet_content'].apply(preprocess)

In [66]:
data2


Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet_content,processed_content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"come border kill ,"
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"im get borderland kill ,"
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"im come borderland murder ,"
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"im get borderland 2 murder ,"
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"im get borderland murder ,"
...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realiz window partit mac like 6 year behind nv...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realiz mac window partit 6 year behind nvidia ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realiz window partit mac 6 year behind nvidia ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,realiz window partit mac like 6 year behind nv...


In [67]:
# Define the clean_and_lowercase function
def clean(text):
    # Remove special characters, punctuation, and unnecessary symbols
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Return the cleaned text
    return cleaned_text

# Apply cleaning
data2['processed_content'] = data2['processed_content'].apply(clean)

In [68]:
data2

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet_content,processed_content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get borderland kill
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im come borderland murder
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get borderland murder
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im get borderland murder
...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realiz window partit mac like year behind nvi...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realiz mac window partit year behind nvidia d...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realiz window partit mac year behind nvidia d...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,realiz window partit mac like year behind nvi...


### 2. Encode your Data vectors By using Word2vec (CBOW, Skip Gram), Bag Of words, TF-IDF.

In [71]:
# Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(data2['processed_content'])

In [73]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data2['processed_content'])

In [74]:
# Word2Vec
tokenized_sentences = [sentence.split() for sentence in data2['processed_content']]
w2v_model_cbow = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0)
w2v_model_sg = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=1)

In [75]:
def get_w2v_embeddings(model, sentences):
    return np.array([np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(model.vector_size)], axis=0) for sentence in sentences])

In [76]:
X_w2v_cbow2 = get_w2v_embeddings(w2v_model_cbow, tokenized_sentences)
X_w2v_sg2 = get_w2v_embeddings(w2v_model_sg, tokenized_sentences)

In [77]:
X_w2v_cbow2

array([[-0.15772794,  0.17184432, -0.16873311, ..., -0.04625983,
         0.06885529,  0.20768587],
       [ 0.20528355, -0.11733584,  0.2324743 , ...,  0.64664876,
         0.02750003,  0.42811283],
       [ 0.26824495,  0.10146496,  0.11087167, ...,  0.16287251,
         0.01613133,  0.60663819],
       ...,
       [-0.32512957,  0.23342772,  0.06513016, ..., -0.35964876,
         0.43479013, -0.22112891],
       [-0.182026  ,  0.19504462,  0.09219808, ..., -0.16056563,
         0.40073925, -0.08028491],
       [-0.13186988,  0.00783164, -0.0214451 , ..., -0.28512508,
         0.41052717, -0.23579676]])

In [78]:
X_w2v_sg2

array([[-0.09454937,  0.14258991, -0.28380325, ..., -0.59560746,
         0.14900042, -0.24600224],
       [ 0.03235743, -0.06214147, -0.1253245 , ..., -0.23732491,
         0.2931959 , -0.25063777],
       [ 0.02390607, -0.12133534, -0.13656086, ..., -0.46497563,
         0.17607211, -0.13238272],
       ...,
       [ 0.02376175,  0.2764942 ,  0.01441227, ..., -0.00140629,
         0.108045  , -0.15695462],
       [ 0.06266104,  0.21679495,  0.00082671, ..., -0.01905343,
         0.03295258, -0.12132229],
       [-0.08128785,  0.15623444, -0.05475162, ..., -0.1111027 ,
        -0.06731769, -0.27045125]])

### 3. Train your models by using SVM, Naive Bayes, Logistic Regression, Ada Boosting Algorithms (The embedding will be done by Word2Vec).

In [79]:
y = data2['Sentiment']

X_train_w2v_cbow, X_test_w2v_cbow, y_train, y_test = train_test_split(X_w2v_cbow2, y, test_size=0.2, random_state=42)
X_train_w2v_sg, X_test_w2v_sg, y_train, y_test = train_test_split(X_w2v_sg2, y, test_size=0.2, random_state=42)

In [90]:
# Encode target labels with value between 0 and n_classes-1
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [86]:
# Define models
models = {
    'SVM': SVC(probability=True),  # probability=True to enable predict_proba
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'AdaBoost': AdaBoostClassifier(),
    'GaussianNB': GaussianNB()
}

In [91]:
def train_evaluate(models, X_train, X_test, y_train, y_test):
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            proba_predictions = model.predict_proba(X_test)
            loss = log_loss(y_test, proba_predictions)
        else:
            loss = 'N/A'  # log_loss is not applicable if predict_proba is not available
        
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='weighted')
        
        results[name] = {
            'accuracy': accuracy,
            'f1_score': f1,
            'log_loss': loss
        }
    return results

In [92]:
results_w2v_cbow = train_evaluate(models, X_train_w2v_cbow, X_test_w2v_cbow, y_train, y_test)
results_w2v_sg = train_evaluate(models, X_train_w2v_sg, X_test_w2v_sg, y_train, y_test)



### 4. Evaluate the four languages models by using standards metrics (Accuracy, Loss, F1 Score, etc) and other metrics like blue score, choose the best model then argument your choice .

In [94]:
print("Results using CBOW Word2Vec:")
for model_name, metrics in results_w2v_cbow.items():
    print(f"{model_name}: Accuracy = {metrics['accuracy']}, F1 Score = {metrics['f1_score']}, Log Loss = {metrics['log_loss']}")

Results using CBOW Word2Vec:
SVM: Accuracy = 0.5556001874539733, F1 Score = 0.5355931436614982, Log Loss = N/A
Logistic Regression: Accuracy = 0.5196491932784361, F1 Score = 0.49936412831160126, Log Loss = 1.134806378331216
AdaBoost: Accuracy = 0.48845149628439444, F1 Score = 0.4693316588668781, Log Loss = 1.376027518226499
GaussianNB: Accuracy = 0.45645042511883244, F1 Score = 0.459084092042814, Log Loss = 5.615844078866015


In [95]:
print("\nResults using Skip Gram Word2Vec:")
for model_name, metrics in results_w2v_sg.items():
    print(f"{model_name}: Accuracy = {metrics['accuracy']}, F1 Score = {metrics['f1_score']}, Log Loss = {metrics['log_loss']}")


Results using Skip Gram Word2Vec:
SVM: Accuracy = 0.6050746468501038, F1 Score = 0.5951686037001309, Log Loss = N/A
Logistic Regression: Accuracy = 0.5292896833366807, F1 Score = 0.5142167952155778, Log Loss = 1.109191432778566
AdaBoost: Accuracy = 0.4905938274084488, F1 Score = 0.47663146696554326, Log Loss = 1.3749419409901256
GaussianNB: Accuracy = 0.46475195822454307, F1 Score = 0.46450385589279747, Log Loss = 3.105669744941258


The SVM model with Skip Gram Word2Vec embeddings has the highest accuracy and F1 score among all tested models and configurations. This indicates that the SVM model is better at correctly classifying tweets and maintaining a balance between precision and recall.

Although Logistic Regression has a slightly lower accuracy and F1 score compared to SVM, it provides a low log loss value. Log loss is a crucial metric for evaluating the confidence of predictions. The lower the log loss, the better the model is at providing accurate probability estimates. This is important in scenarios where not only the classification but also the probability of belonging to each class is important

### 5. Interpret the Obtained Results.

SVM with Skipgram Word2Vec achieves the best overall performance in terms of accuracy and F1 score. The higher accuracy and F1 score suggest that the model is effective in handling the data's complexity and nuances.Skip Gram tends to work better in scenarios where the relationship between words (context) is more complex, possibly leading to better feature representations for SVM.

Logistic Regression provides probability estimates for each class, which is useful for understanding the model's confidence in its predictions. The relatively low log loss value indicates that the model's probability predictions are reliable.Logistic Regression is straightforward to interpret, making it a suitable choice when model transparency is required.

Both AdaBoost and GaussianNB show lower accuracy and F1 scores compared to SVM and Logistic Regression. GaussianNB particularly performs poorly with a very high log loss value in the CBOW setup, indicating it struggles with the given dataset and embeddings.