# NLP - Text Classification Lab



## Basic Level

## Necessary Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
import re
from tqdm import tqdm

### Reading the Dataset

In [4]:
dataset=pd.read_csv("dataset/reviews.csv")
dataset=dataset[['sentiment', 'text']]
dataset = dataset.rename(columns={'sentiment': 'class'})
dataset


Unnamed: 0,class,text
0,neg,"Now, I won't deny that when I purchased this o..."
1,neg,"The saddest thing about this ""tribute"" is that..."
2,neg,Last night I decided to watch the prequel or s...
3,neg,I have to admit that i liked the first half of...
4,neg,I was not impressed about this film especially...
...,...,...
24995,pos,"This film is fun, if your a person who likes a..."
24996,pos,After seeing this film I feel like I know just...
24997,neg,first this deserves about 5 stars due to actin...
24998,neg,If you like films that ramble with little plot...


### Exercise 1
Split the dataset into two data structures (pandas frames), one for the reviews (our documents) (X) and one for the class (y).
Check that the lengths of both dataframes are equal

In [52]:
X = dataset['text']
Y = dataset['class']
print("Size of X is: " + str(len(X)) + " , size of y is: " + str(len(y)))

Size of X is: 25000 , size of y is: 25000


### Preprocessing
We now begin our preprocessing task, by lowercasing all of the documents, removing special characters and numerical values. Then we tokenize and stem our documents. 
### Exercise 2
Write a function to_lower(X) that takes a dataframe and returns its content in lower case.

In [53]:
def to_lower(X): 
    df = pd.DataFrame(X)
    df2 = df['text']
    df2 = df2.str.lower()

    return df2

In [54]:
X=to_lower(X)
X.head()

0    now, i won't deny that when i purchased this o...
1    the saddest thing about this "tribute" is that...
2    last night i decided to watch the prequel or s...
3    i have to admit that i liked the first half of...
4    i was not impressed about this film especially...
Name: text, dtype: object

### Exercise 3 
Write a function clean_text(X), that takes the dataset as an input and removes all special characters and numerical values from it.

In [55]:
import re
def clean_text(X):
    df = pd.DataFrame(X)
    df2 = df['text'].apply(lambda x: re.sub("[^a-z\s]", "", x))
    return df2

In [56]:
X=clean_text(X)

In [57]:
X

0        now i wont deny that when i purchased this off...
1        the saddest thing about this tribute is that a...
2        last night i decided to watch the prequel or s...
3        i have to admit that i liked the first half of...
4        i was not impressed about this film especially...
                               ...                        
24995    this film is fun if your a person who likes a ...
24996    after seeing this film i feel like i know just...
24997    first this deserves about  stars due to acting...
24998    if you like films that ramble with little plot...
24999    as interesting as a sheet of cardboard this di...
Name: text, Length: 25000, dtype: object

### Exercise 4
Write a function tokenize(X) that takes a dataframe and returns the tokens in each row (document).

In [58]:
!pip install nltk



In [59]:
import nltk

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\XPS9360\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
from nltk.tokenize import word_tokenize
def tokenize(X):
    for i in range(len(X)):
        if isinstance(X[i], str):  # Check if the element is already a string
            X[i] = word_tokenize(X[i])
    return X

In [61]:
X = tokenize(X)
print(X)


0        [now, i, wont, deny, that, when, i, purchased,...
1        [the, saddest, thing, about, this, tribute, is...
2        [last, night, i, decided, to, watch, the, preq...
3        [i, have, to, admit, that, i, liked, the, firs...
4        [i, was, not, impressed, about, this, film, es...
                               ...                        
24995    [this, film, is, fun, if, your, a, person, who...
24996    [after, seeing, this, film, i, feel, like, i, ...
24997    [first, this, deserves, about, stars, due, to,...
24998    [if, you, like, films, that, ramble, with, lit...
24999    [as, interesting, as, a, sheet, of, cardboard,...
Name: text, Length: 25000, dtype: object


### Exercise 5
Write a function remove_stop_words(X, stop_words), that takes a dataset X and the set of stop words you want removed from it. Your function should return the dataset, free of any common words.

In [62]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
def remove_stop_words(X, stop_words):
    X = pd.Series(X)
    X = X.apply(lambda words_list: [word for word in words_list if word not in stop_words])
    return X





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XPS9360\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
X = remove_stop_words(X, stopwords.words('english'))
X




0        [wont, deny, purchased, ebay, high, expectatio...
1        [saddest, thing, tribute, almost, singers, inc...
2        [last, night, decided, watch, prequel, shall, ...
3        [admit, liked, first, half, sleepers, looked, ...
4        [impressed, film, especially, fact, went, cine...
                               ...                        
24995    [film, fun, person, likes, good, campy, featur...
24996    [seeing, film, feel, like, know, little, bit, ...
24997    [first, deserves, stars, due, acting, would, g...
24998    [like, films, ramble, little, plot, exposition...
24999    [interesting, sheet, cardboard, dispensable, p...
Name: text, Length: 25000, dtype: object

### Exercise 6
Write a function stem(X), that takes a dataframe X and returns the stems of all the words in it.

You're free to choose any stemmer you want.

It's also possible to use a lemmatizer (lemmatization will be a lot slower!).

In [64]:
from nltk.stem import PorterStemmer
import nltk
def stem(X):
    stemmer = PorterStemmer()
    X = X.apply(lambda words_list: [stemmer.stem(word) for word in words_list])
    return X

In [65]:
X = stem(X)



In [66]:
X


0        [wont, deni, purchas, ebay, high, expect, incr...
1        [saddest, thing, tribut, almost, singer, inclu...
2        [last, night, decid, watch, prequel, shall, sa...
3        [admit, like, first, half, sleeper, look, good...
4        [impress, film, especi, fact, went, cinema, fa...
                               ...                        
24995    [film, fun, person, like, good, campi, featur,...
24996    [see, film, feel, like, know, littl, bit, usa,...
24997    [first, deserv, star, due, act, would, give, b...
24998    [like, film, rambl, littl, plot, exposit, spic...
24999    [interest, sheet, cardboard, dispens, period, ...
Name: text, Length: 25000, dtype: object

### Exercise 7
Having called a tokenizer and a stemmer on our dataset, the resulting rows are now of type list.
We need to convert them back to str, as our CountVectorizer expects a dataset where every document is a string. 
Define a function to_String(X), that takes your dataset and stitches back its rows back to the str format.

In [67]:
def to_String(X):
    X = X.apply(lambda x: ' '.join(x))
    return X

In [68]:
X = to_String(X)



In [69]:
X


0        wont deni purchas ebay high expect incred outo...
1        saddest thing tribut almost singer includ othe...
2        last night decid watch prequel shall say call ...
3        admit like first half sleeper look good act ev...
4        impress film especi fact went cinema famili go...
                               ...                        
24995    film fun person like good campi featur film ev...
24996    see film feel like know littl bit usa david ly...
24997    first deserv star due act would give better su...
24998    like film rambl littl plot exposit spice kinki...
24999    interest sheet cardboard dispens period piec l...
Name: text, Length: 25000, dtype: object

In [70]:
a = X #a is used for exercise 16

### Vector Space Model
Now that we preprocessed our corpus, we can proceed to vectorize it.
### Exercise 8
We can now use the [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) from sklearn to create our document-term-matrix.

a. Create a document-term matrix from your dataset X, use min_df and max_df parameters to exclude words that appear in less than 10 documents, and words that appear in more than 99.5% of the documents. We want to keep only words of medium frequency, as stated in the lecture.

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

def create_document_term_matrix(X):
    vectorizer = CountVectorizer(min_df=10, max_df=0.995) 
    dtm = vectorizer.fit_transform(X)
    pd.DataFrame(vectorizer.transform(X).toarray(), columns= vectorizer.get_feature_names_out())
    print(dtm)
    return vectorizer, dtm



In [72]:
vectorizer, X = create_document_term_matrix(X)




  (0, 13623)	1
  (0, 3078)	1
  (0, 9608)	1
  (0, 3696)	2
  (0, 5675)	1
  (0, 4134)	1
  (0, 6100)	1
  (0, 13638)	1
  (0, 7504)	1
  (0, 2326)	1
  (0, 3902)	2
  (0, 5868)	1
  (0, 11318)	1
  (0, 3287)	1
  (0, 521)	1
  (0, 4668)	1
  (0, 243)	3
  (0, 3236)	1
  (0, 13347)	1
  (0, 5145)	2
  (0, 11189)	1
  (0, 11896)	1
  (0, 7364)	1
  (0, 4805)	1
  (0, 1688)	1
  :	:
  (24999, 1947)	1
  (24999, 4358)	1
  (24999, 7097)	1
  (24999, 4226)	1
  (24999, 4070)	1
  (24999, 5072)	1
  (24999, 9110)	1
  (24999, 5408)	1
  (24999, 9011)	1
  (24999, 4728)	1
  (24999, 8087)	1
  (24999, 12163)	1
  (24999, 10866)	1
  (24999, 9812)	1
  (24999, 3971)	1
  (24999, 4327)	1
  (24999, 10131)	1
  (24999, 8732)	1
  (24999, 3424)	1
  (24999, 11402)	1
  (24999, 1752)	1
  (24999, 6587)	1
  (24999, 13634)	1
  (24999, 10273)	1
  (24999, 3357)	1


b. Print the size and the contents of your vocab (feature space)

In [73]:
vocabulary = vectorizer.get_feature_names_out()
vocabulary_size = len(vocabulary)
print("Vocabulary Size:", vocabulary_size)
print("Vocabulary Contents:")
print("First 10 Words in Vocabulary:")
print(vocabulary[:10])



Vocabulary Size: 13823
Vocabulary Contents:
First 10 Words in Vocabulary:
['aag' 'aam' 'aaron' 'ab' 'abandon' 'abba' 'abbey' 'abbi' 'abbot' 'abbott']


### Training our Logistic Regressor

### Exercise 9:
Before we dive into training our model, let's get our vector of true labels **y** into the right format.
Notice that by printing the contents of **y** below, what we get are the labels **neg** and **pos**. 
The model works only with **1 and 0**.
Let's convert the labels accordingly.

In [74]:
y = pd.DataFrame(Y)
y[:10]


Unnamed: 0,class
0,neg
1,neg
2,neg
3,neg
4,neg
5,pos
6,pos
7,neg
8,pos
9,neg


a. Use the [label_ encoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) from Sklearn, to transform the labels in the vector **y** accordingly.

In [75]:
from sklearn.preprocessing import LabelEncoder
num_rows = len(y)
print(num_rows)
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

25000


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [76]:
y[:10]


array([0, 0, 0, 0, 0, 1, 1, 0, 1, 0])

We also create a test and train set from our DTM and our vector y.

In [77]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25)

### Exercise 10
Use the [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) classifier from Sklearn to train your model, and check its accuracy on the test set

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(random_state=100)
model.fit(train_X, train_y)
train_predictions = model.predict(train_X)
test_predictions = model.predict(test_X)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [79]:
train_accuracy = accuracy_score(train_y, train_predictions)
print("Train Accuracy:", train_accuracy)
test_accuracy = accuracy_score(test_y, test_predictions)
print("Test Accuracy:", test_accuracy)



Train Accuracy: 0.9942933333333334
Test Accuracy: 0.85888


**Our results show a clear sign of overfitting**

## Regular 

Our main goal in the regular exercises is to provide a basic implementation for the logistic regression model.


We'll first define a function **initialize(X)** to get the initial vector of weights W and bias b.

We will then do our **forward_pass(X, W, b)** to get a vector of predictions.

We then write a function **gradient_descent(X, W, b, y, lr)** to get the updated vector of weights W and bias b.


We conclude this part by writing the model function, which calls all of the functions we defined, and proceed with the learning.

### Exercise 11
Write a function **initialize(X)**, which takes a DTM as an input and returns a vector of weights W and a scalar b for the bias. Both are initialized with some random values.

In [80]:
train_X=train_X.toarray()
train_y=np.array(train_y)
train_y=train_y.reshape(train_y.shape[0],1)

In [81]:
def initialize(X):
    num_features = X.shape[1] 
    # Initialize weights with random values
    W = np.random.uniform(-1,1,[num_features,1])  
    b = np.random.uniform(-1,1)
    return (W, b)

In [82]:
W,b=initialize(train_X)
print("Shape of the vector W is:",W.shape)

Shape of the vector W is: (13823, 1)


### Exercise 12
Write a function **forward_pass(X, W, b)** which takes the DTM X, the vector W and the bias b as inputs and returns a vector of predictions P.

Your function should implement the following equations

$$Z=X.W + b$$
$$P=\sigma{(Z)}=\frac{1}{1+e^{-Z}}$$

You can also implement the sigmoid as a separate function, 

In [83]:
import numpy as np
from scipy.sparse import issparse
def forward_pass(X, W, b):
    if issparse(X):
        X = X.toarray()
    Z = np.dot(X, W) + b
    P = 1 / (1 + np.exp(-Z))
    return P

In [84]:
P=forward_pass(train_X, W, b)
print("The vector of predictions shape is:",P.shape)


The vector of predictions shape is: (18750, 1)


### Exercise 13
Calculating the loss/cost function
You can use the [implementation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html) by Sklearn, but feel free to also implement your own.

In [85]:
from sklearn.metrics import log_loss

# predict the probabilities of the positive class
y_prob = model.predict_proba(test_X)[:, 1]
loss = log_loss(test_y, y_prob)

print("Log Loss:", loss)


Log Loss: 0.46933045662909223


### Exercise 14
Write a function **gradient_descent(X, W, b , P, y, lr)** and returns the updated weight vector W, and bias b.
Your function needs to implement the following equations:

$$dW=\frac{1}{ne}X^T . (P-y)$$

$$db=\frac{1}{ne} \sum(P-y)$$

$$W=W-\alpha dW$$

$$W=b-\alpha db$$

In [86]:
def gradient_descent(X, W, b, P, y, lr):
    # Number of samples
    n = len(y)
    dW = (1 / n) * np.dot(X.T, (P - y))
    db = (1 / n) * np.sum(P - y)
    W = W - lr * dW
    b = b - lr * db
    return W, b

In [87]:
W, b = gradient_descent(train_X, W, b, P, train_y, lr=0.2)
print("shape of the updated W is:", W.shape)



shape of the updated W is: (13823, 1)


### Exercise 15
Now we can implement our logistic regression model in 3 simple steps
1. initialize the vector W and the bias b

2. repeat until number of iterations is reached   
    2.1. get a vector of predictions P  
    2.2. update the weights W and bias b using gradient descent  
    
3. return the final vector W and bias b

**logistic_regression(X, y, lr, iters)**, takes the matrix X as an input, the vector of true labels y, a learning rate, and the number of iterations iters. The function returns the learned parameters of the model, namely W and b.

In [88]:
from tqdm import tqdm
def logistic_regression(X, y, lr, iters):
    W,b = initialize(X)
    print(W)
    for i in tqdm(range(iters)):
        P = forward_pass(X, W, b)
        W, b = gradient_descent(train_X, W, b, P, y, lr)
        
    return W, b

In [89]:
W, b= logistic_regression(train_X, train_y, lr=1.5, iters=400)

[[ 0.66413928]
 [-0.5374101 ]
 [ 0.97096081]
 ...
 [-0.69652637]
 [ 0.4793068 ]
 [-0.73477369]]


100%|██████████| 400/400 [13:21<00:00,  2.00s/it]


### Testing our model
We have been able to implement the model and run it on our training set. It's time to see how well it does. 
We'll first make a function **predict(X, W, b)**, that takes the dataset and the learned parameters and returns an array of predictions. Our threshold is 0.5, any prediction below that is returned as 0, and any above it are returned as 1.

In [90]:
def predict(X, W, b):
    P = forward_pass(X, W, b)
    P=1*(P >= 0.5)
    return P

In [91]:
P_train=predict(train_X, W, b)

In [92]:
print("Accuracy on our train set is, ",accuracy_score( train_y, P_train)*100, "%")

Accuracy on our train set is,  87.94133333333333 %


In [93]:
P_test=predict(test_X, W, b)

In [94]:
print("Accuracy on our test set is", accuracy_score(test_y, P_test)*100, "%")

Accuracy on our test set is 82.896 %


## Advanced

In this part, we set to understand what did the model actually learn.

### Exercise 16
Using the CountVectorizer of Sklearn, recreate a pandas frame where the rows contain the documents and the columns contain the features. 



In [95]:
#CountVectorizer expects a list of text document as input, not a DataFrame
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
def create_document_term_matrix1(X):
    vectorizer = CountVectorizer(min_df=10, max_df=0.995) 
    dtm = vectorizer.fit_transform(a)
    dtm_df = pd.DataFrame(vectorizer.transform(a).toarray(), columns= vectorizer.get_feature_names_out())
    print(dtm_df)
    return vectorizer, dtm_df

In [96]:
vectorizer, dtm_df = create_document_term_matrix1(X)

       aag  aam  aaron  ab  abandon  abba  abbey  abbi  abbot  abbott  ...  \
0        0    0      0   0        0     0      0     0      0       0  ...   
1        0    0      0   0        0     0      0     0      0       0  ...   
2        0    0      0   0        0     0      0     0      0       0  ...   
3        0    0      0   0        0     0      0     0      0       0  ...   
4        0    0      0   0        1     0      0     0      0       0  ...   
...    ...  ...    ...  ..      ...   ...    ...   ...    ...     ...  ...   
24995    0    0      0   0        0     0      0     0      0       0  ...   
24996    0    0      0   0        0     0      0     0      0       0  ...   
24997    0    0      0   0        0     0      0     0      0       0  ...   
24998    0    0      0   0        0     0      0     0      0       0  ...   
24999    0    0      0   0        0     0      0     0      0       0  ...   

       zombi  zombiesbr  zone  zoo  zoom  zorro  zu  zucker  zu

### Exercise 18
Knowing that our logistic regressor learns a weight for each feature (word in the vocab),  return the words with the highest weights (5 highest), and the words with lowest weights (5 lowest).

In [97]:
coefficients = model.coef_[0]
word_weight_pairs = [(word, weight) for word, weight in zip(vectorizer.get_feature_names_out(), coefficients)]
sorted_word_weight_pairs = sorted(word_weight_pairs, key=lambda x: x[1])
top_5_words = sorted_word_weight_pairs[-5:]
bottom_5_words = sorted_word_weight_pairs[:5]
print("Top 5 words with highest weights:")
for word, weight in top_5_words:
    print(f"{word}: {weight}")

print("\nBottom 5 words with lowest weights:")
for word, weight in bottom_5_words:
    print(f"{word}: {weight}")


    

Top 5 words with highest weights:
mj: 1.422636687043491
kurosawa: 1.4536169298882846
flawless: 1.616274122921176
excel: 1.660043469452013
refresh: 1.7238773820786502

Bottom 5 words with lowest weights:
worst: -2.3043585428276634
poorli: -2.1275280847292293
wast: -2.030525269106648
alright: -1.7525070007953822
mstk: -1.6719868518860082


### Exercise 19
Print the weights of the words "good" and "bad"

In [98]:
good_weight = None
bad_weight = None
for word, weight in word_weight_pairs:
    if word == "good":
        good_weight = weight
    elif word == "bad":
        bad_weight = weight

print("Weight of 'good':", good_weight)
print("Weight of 'bad':", bad_weight)

Weight of 'good': 0.2337061156360339
Weight of 'bad': -0.7694501741865307
