## NLP LAB 3

### Objective : The main purpose behind this lab is to get familiar with NLP language models using Sklearn library.


#### Part 2 Language Modeling / Classification

Importing the necessary librarys

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from gensim.models import Word2Vec


Downloading the necessary nltk data

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

As the data don't have a column name we have given them a one 

In [3]:
train_df = pd.read_csv('twitter_training.csv', header=None, names=['ID', 'Platform', 'Sentiment', 'Text'])  # Update with your train dataset path
valid_df = pd.read_csv('twitter_validation.csv', header=None, names=['ID', 'Platform', 'Sentiment', 'Text'])  # Update with your validation dataset path


Droping the NAN values

In [4]:
train_df.dropna(inplace=True)
valid_df.dropna(inplace=True)

Defining the process Function 

In [5]:
def preprocess(text):
    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    text = re.sub(r'^b\s+', '', text)
    text = text.lower()
    
    # Tokenization
    text = text.split()
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]
    text = ' '.join(text)
    
    return text

In [6]:
train_df['cleaned_text'] = train_df['Text'].apply(preprocess)
valid_df['cleaned_text'] = valid_df['Text'].apply(preprocess)

Splliting the data into test and training 

In [7]:
X_train_full = train_df['cleaned_text']
y_train_full = train_df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

Validation Data

In [8]:
X_valid = valid_df['cleaned_text']
y_valid = valid_df['Sentiment']

Feature Extraction using TF-IDF

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Feature Extraction Using BOF

In [10]:
count_vectorizer = CountVectorizer(max_features=5000)
X_train_bow = count_vectorizer.fit_transform(X_train)
X_valid_bow = count_vectorizer.transform(X_valid)
X_test_bow = count_vectorizer.transform(X_test)

Feature Extraction Using W2V

In [11]:
X_train_word2vec = [text.split() for text in X_train]
X_valid_word2vec = [text.split() for text in X_valid]
X_test_word2vec = [text.split() for text in X_test]
word2vec_model = Word2Vec(X_train_word2vec, vector_size=100, window=5, min_count=2, workers=4)
X_train_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in X_train_word2vec])
X_valid_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in X_valid_word2vec])
X_test_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in X_test_word2vec])


Model Training & Evaluation 

In [12]:
def train_and_evaluate_model(model, X_train, X_valid, X_test, y_train, y_valid, y_test):
    model.fit(X_train, y_train)
    y_pred_valid = model.predict(X_valid)
    y_pred_test = model.predict(X_test)
    
    accuracy_valid = accuracy_score(y_valid, y_pred_valid)
    f1_valid = f1_score(y_valid, y_pred_valid, average='weighted')
    
    accuracy_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test, average='weighted')
    
    print(f'Validation Accuracy: {accuracy_valid}')
    print(f'Validation F1 Score: {f1_valid}')
    print(confusion_matrix(y_valid, y_pred_valid))
    print(classification_report(y_valid, y_pred_valid))
    
    print(f'Test Accuracy: {accuracy_test}')
    print(f'Test F1 Score: {f1_test}')
    print(confusion_matrix(y_test, y_pred_test))
    print(classification_report(y_test, y_pred_test))

In [13]:
print("Naive Bayes with TF-IDF:")
train_and_evaluate_model(MultinomialNB(), X_train_tfidf, X_valid_tfidf, X_test_tfidf, y_train, y_valid, y_test)

Naive Bayes with TF-IDF:
Validation Accuracy: 0.707
Validation F1 Score: 0.7020818765440422
[[ 88  34  12  38]
 [  1 221  20  24]
 [ 12  57 173  43]
 [  3  29  20 225]]
              precision    recall  f1-score   support

  Irrelevant       0.85      0.51      0.64       172
    Negative       0.65      0.83      0.73       266
     Neutral       0.77      0.61      0.68       285
    Positive       0.68      0.81      0.74       277

    accuracy                           0.71      1000
   macro avg       0.74      0.69      0.70      1000
weighted avg       0.73      0.71      0.70      1000

Test Accuracy: 0.6331081081081081
Test F1 Score: 0.6191323675481166
[[ 886  735  316  759]
 [  71 3519  308  482]
 [ 145  797 1916  747]
 [  89  641  340 3049]]
              precision    recall  f1-score   support

  Irrelevant       0.74      0.33      0.46      2696
    Negative       0.62      0.80      0.70      4380
     Neutral       0.67      0.53      0.59      3605
    Positive      

In [14]:
print("Logistic Regression with TF-IDF:")
train_and_evaluate_model(LogisticRegression(), X_train_tfidf, X_valid_tfidf, X_test_tfidf, y_train, y_valid, y_test)


Logistic Regression with TF-IDF:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.81
Validation F1 Score: 0.8091765188148115
[[121  19  10  22]
 [  4 235  14  13]
 [ 12  35 223  15]
 [ 11  19  16 231]]
              precision    recall  f1-score   support

  Irrelevant       0.82      0.70      0.76       172
    Negative       0.76      0.88      0.82       266
     Neutral       0.85      0.78      0.81       285
    Positive       0.82      0.83      0.83       277

    accuracy                           0.81      1000
   macro avg       0.81      0.80      0.80      1000
weighted avg       0.81      0.81      0.81      1000

Test Accuracy: 0.6831081081081081
Test F1 Score: 0.6785814782219962
[[1353  522  349  472]
 [ 160 3519  326  375]
 [ 212  608 2240  545]
 [ 210  526  385 2998]]
              precision    recall  f1-score   support

  Irrelevant       0.70      0.50      0.58      2696
    Negative       0.68      0.80      0.74      4380
     Neutral       0.68      0.62      0.65      3605
    Positive       0.68      0.73      0.70 

In [15]:
print("AdaBoost with TF-IDF:")
train_and_evaluate_model(AdaBoostClassifier(), X_train_tfidf, X_valid_tfidf, X_test_tfidf, y_train, y_valid, y_test)


AdaBoost with TF-IDF:
Validation Accuracy: 0.483
Validation F1 Score: 0.450813740497023
[[ 12  92  32  36]
 [  6 211  31  18]
 [  3 113 123  46]
 [  4 104  32 137]]
              precision    recall  f1-score   support

  Irrelevant       0.48      0.07      0.12       172
    Negative       0.41      0.79      0.54       266
     Neutral       0.56      0.43      0.49       285
    Positive       0.58      0.49      0.53       277

    accuracy                           0.48      1000
   macro avg       0.51      0.45      0.42      1000
weighted avg       0.51      0.48      0.45      1000

Test Accuracy: 0.4685135135135135
Test F1 Score: 0.4314811494333045
[[ 234 1593  293  576]
 [  88 3605  365  322]
 [  48 1740 1269  548]
 [  53 1886  354 1826]]
              precision    recall  f1-score   support

  Irrelevant       0.55      0.09      0.15      2696
    Negative       0.41      0.82      0.55      4380
     Neutral       0.56      0.35      0.43      3605
    Positive       0.5

In [16]:
print("Naive Bayes with Bag of Words:")
train_and_evaluate_model(MultinomialNB(), X_train_bow, X_valid_bow, X_test_bow, y_train, y_valid, y_test)


Naive Bayes with Bag of Words:
Validation Accuracy: 0.705
Validation F1 Score: 0.7021519169685826
[[104  27  12  29]
 [ 11 207  24  24]
 [ 21  45 172  47]
 [  6  27  22 222]]
              precision    recall  f1-score   support

  Irrelevant       0.73      0.60      0.66       172
    Negative       0.68      0.78      0.72       266
     Neutral       0.75      0.60      0.67       285
    Positive       0.69      0.80      0.74       277

    accuracy                           0.70      1000
   macro avg       0.71      0.70      0.70      1000
weighted avg       0.71      0.70      0.70      1000

Test Accuracy: 0.6288513513513514
Test F1 Score: 0.6223075619154208
[[1199  550  304  643]
 [ 214 3287  374  505]
 [ 318  691 1860  736]
 [ 223  567  368 2961]]
              precision    recall  f1-score   support

  Irrelevant       0.61      0.44      0.52      2696
    Negative       0.65      0.75      0.69      4380
     Neutral       0.64      0.52      0.57      3605
    Positive

In [17]:
print("Logistic Regression with Bag of Words:")
train_and_evaluate_model(LogisticRegression(), X_train_bow, X_valid_bow, X_test_bow, y_train, y_valid, y_test)


Logistic Regression with Bag of Words:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.838
Validation F1 Score: 0.8377441738960172
[[132  11   6  23]
 [  5 237   9  15]
 [ 11  22 233  19]
 [ 12  14  15 236]]
              precision    recall  f1-score   support

  Irrelevant       0.82      0.77      0.80       172
    Negative       0.83      0.89      0.86       266
     Neutral       0.89      0.82      0.85       285
    Positive       0.81      0.85      0.83       277

    accuracy                           0.84      1000
   macro avg       0.84      0.83      0.83      1000
weighted avg       0.84      0.84      0.84      1000

Test Accuracy: 0.7066891891891892
Test F1 Score: 0.7040570919399934
[[1504  360  300  532]
 [ 152 3446  296  486]
 [ 226  416 2289  674]
 [ 219  339  341 3220]]
              precision    recall  f1-score   support

  Irrelevant       0.72      0.56      0.63      2696
    Negative       0.76      0.79      0.77      4380
     Neutral       0.71      0.63      0.67      3605
    Positive       0.66      0.78      0.71

In [18]:
print("AdaBoost with Bag of Words:")
train_and_evaluate_model(AdaBoostClassifier(), X_train_bow, X_valid_bow, X_test_bow, y_train, y_valid, y_test)


AdaBoost with Bag of Words:
Validation Accuracy: 0.483
Validation F1 Score: 0.44756014772991
[[ 11  85  34  42]
 [  4 210  27  25]
 [  4 101 121  59]
 [  2 101  33 141]]
              precision    recall  f1-score   support

  Irrelevant       0.52      0.06      0.11       172
    Negative       0.42      0.79      0.55       266
     Neutral       0.56      0.42      0.48       285
    Positive       0.53      0.51      0.52       277

    accuracy                           0.48      1000
   macro avg       0.51      0.45      0.42      1000
weighted avg       0.51      0.48      0.45      1000

Test Accuracy: 0.47047297297297297
Test F1 Score: 0.4320028484998988
[[ 218 1542  310  626]
 [  79 3569  313  419]
 [  64 1655 1242  644]
 [  41 1814  330 1934]]
              precision    recall  f1-score   support

  Irrelevant       0.54      0.08      0.14      2696
    Negative       0.42      0.81      0.55      4380
     Neutral       0.57      0.34      0.43      3605
    Positive    

In [19]:
print("Logistic Regression with Word2Vec:")
train_and_evaluate_model(LogisticRegression(), X_train_word2vec, X_valid_word2vec, X_test_word2vec, y_train, y_valid, y_test)


Logistic Regression with Word2Vec:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.55
Validation F1 Score: 0.5323963836775917
[[ 33  51  36  52]
 [ 11 187  36  32]
 [ 13  68 151  53]
 [ 11  50  37 179]]
              precision    recall  f1-score   support

  Irrelevant       0.49      0.19      0.28       172
    Negative       0.53      0.70      0.60       266
     Neutral       0.58      0.53      0.55       285
    Positive       0.57      0.65      0.60       277

    accuracy                           0.55      1000
   macro avg       0.54      0.52      0.51      1000
weighted avg       0.55      0.55      0.53      1000

Test Accuracy: 0.519054054054054
Test F1 Score: 0.5043926950982146
[[ 540  790  543  823]
 [ 303 3003  475  599]
 [ 320  791 1717  777]
 [ 326  755  616 2422]]
              precision    recall  f1-score   support

  Irrelevant       0.36      0.20      0.26      2696
    Negative       0.56      0.69      0.62      4380
     Neutral       0.51      0.48      0.49      3605
    Positive       0.52      0.59      0.55  

In [20]:
print("AdaBoost with Word2Vec:")
train_and_evaluate_model(AdaBoostClassifier(), X_train_word2vec, X_valid_word2vec, X_test_word2vec, y_train, y_valid, y_test)


AdaBoost with Word2Vec:
Validation Accuracy: 0.509
Validation F1 Score: 0.4986764981442086
[[ 42  50  36  44]
 [ 18 178  34  36]
 [ 21  68 141  55]
 [ 14  62  53 148]]
              precision    recall  f1-score   support

  Irrelevant       0.44      0.24      0.31       172
    Negative       0.50      0.67      0.57       266
     Neutral       0.53      0.49      0.51       285
    Positive       0.52      0.53      0.53       277

    accuracy                           0.51      1000
   macro avg       0.50      0.49      0.48      1000
weighted avg       0.51      0.51      0.50      1000

Test Accuracy: 0.4983108108108108
Test F1 Score: 0.4850975271239729
[[ 581  808  478  829]
 [ 255 2824  523  778]
 [ 286  792 1579  948]
 [ 285  805  638 2391]]
              precision    recall  f1-score   support

  Irrelevant       0.41      0.22      0.28      2696
    Negative       0.54      0.64      0.59      4380
     Neutral       0.49      0.44      0.46      3605
    Positive       

In [21]:
print("SVM with TF-IDF:")
train_and_evaluate_model(SVC(), X_train_tfidf, X_valid_tfidf, X_test_tfidf, y_train, y_valid, y_test)

SVM with TF-IDF:
Validation Accuracy: 0.966
Validation F1 Score: 0.9660746179549231
[[165   2   1   4]
 [  0 260   1   5]
 [  1   2 273   9]
 [  4   1   4 268]]
              precision    recall  f1-score   support

  Irrelevant       0.97      0.96      0.96       172
    Negative       0.98      0.98      0.98       266
     Neutral       0.98      0.96      0.97       285
    Positive       0.94      0.97      0.95       277

    accuracy                           0.97      1000
   macro avg       0.97      0.97      0.97      1000
weighted avg       0.97      0.97      0.97      1000

Test Accuracy: 0.895472972972973
Test F1 Score: 0.8953878241992632
[[2204  133  112  247]
 [  49 4039   75  217]
 [  46  120 3145  294]
 [  51   93  110 3865]]
              precision    recall  f1-score   support

  Irrelevant       0.94      0.82      0.87      2696
    Negative       0.92      0.92      0.92      4380
     Neutral       0.91      0.87      0.89      3605
    Positive       0.84    

In [None]:
print("SVM with Bag of Words:")
train_and_evaluate_model(SVC(), X_train_bow, X_valid_bow, X_test_bow, y_train, y_valid, y_test)

SVM with Bag of Words:
