In [1]:
import nltk
import jupyter
import pandas as pd
import sklearn
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import re
import tabulate

## Exercise 1: Lowercase


In [2]:
list_ = ["This is my first NLP exercise", "wtf!!!!!"]
series_data = pd.Series(list_, name='text')

# Convert the series data to lowercase and uppercase
lowercase_texts = series_data.str.lower()

lowercase_texts

0    this is my first nlp exercise
1                         wtf!!!!!
Name: text, dtype: object

In [3]:
uppercase_texts = series_data.str.upper()

uppercase_texts

0    THIS IS MY FIRST NLP EXERCISE
1                         WTF!!!!!
Name: text, dtype: object

## Exercise 2: Punctuation

In [4]:
# Given sentence
sentence = "Remove, this from .? the sentence !!!! !\"#&'()*+,-./:;<=>_"

# Remove punctuation
sentence_no_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))

sentence_no_punctuation


'Remove this from  the sentence  '

## Exercise 3: Tokenization

In [5]:

# Given text
text = """Bitcoin is a cryptocurrency invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto. The currency began use in 2009 when its implementation was released as open-source software."""

# Tokenize the text by sentences
sentences = sent_tokenize(text)

# Tokenize the text by words


sentences



['Bitcoin is a cryptocurrency invented in 2008 by an unknown person or group of people using the name Satoshi Nakamoto.',
 'The currency began use in 2009 when its implementation was released as open-source software.']

In [6]:
words = word_tokenize(text)

words


['Bitcoin',
 'is',
 'a',
 'cryptocurrency',
 'invented',
 'in',
 '2008',
 'by',
 'an',
 'unknown',
 'person',
 'or',
 'group',
 'of',
 'people',
 'using',
 'the',
 'name',
 'Satoshi',
 'Nakamoto',
 '.',
 'The',
 'currency',
 'began',
 'use',
 'in',
 '2009',
 'when',
 'its',
 'implementation',
 'was',
 'released',
 'as',
 'open-source',
 'software',
 '.']

## Exercise 4: Stop words

In [7]:

# Given text
text = """
The goal of this exercise is to learn to remove stop words with NLTK. Stop words usually refers to the most common words in a language.
"""

# Tokenize the text
word_tokens = word_tokenize(text)

# Load stop words
stop_words = set(stopwords.words('english'))

# Remove stop words from the tokenized words
filtered_sentence = [word for word in word_tokens if word.lower() not in stop_words]

print(filtered_sentence)

['goal', 'exercise', 'learn', 'remove', 'stop', 'words', 'NLTK', '.', 'Stop', 'words', 'usually', 'refers', 'common', 'words', 'language', '.']


## Exercise 5: Stemming


In [8]:
text = "The interviewer interviews the president in an interview"

# Initialize the Porter Stemmer
ps = PorterStemmer()

words = word_tokenize(text)

stemmed_words = [ps.stem(word) for word in words]

stemmed_words


['the', 'interview', 'interview', 'the', 'presid', 'in', 'an', 'interview']

## Exercise 6: Text preprocessing

In [9]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Filter out stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    
    return stemmed_tokens

text = "01 Edu System presents an innovative curriculum in software engineering and programming. With a renowned industry-leading reputation, the curriculum has been rigorously designed for learning skills of the digital world and technology industry. Taking a different approach than the classic teaching methods today, learning is facilitated through a collective and co-creative process in a professional environment."
proccessed_text = preprocess_text(text)
proccessed_text

['01',
 'edu',
 'system',
 'present',
 'innov',
 'curriculum',
 'softwar',
 'engin',
 'program',
 'renown',
 'industrylead',
 'reput',
 'curriculum',
 'rigor',
 'design',
 'learn',
 'skill',
 'digit',
 'world',
 'technolog',
 'industri',
 'take',
 'differ',
 'approach',
 'classic',
 'teach',
 'method',
 'today',
 'learn',
 'facilit',
 'collect',
 'cocr',
 'process',
 'profession',
 'environ']

## Exercise 7: Bag of Word representation

In [10]:
tweets = []
with open("tweets_train.txt", 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is a tweet, so we'll preprocess each one and append it to our tweets list.
        # The file format includes the sentiment at the beginning, so we'll split on the first comma to separate it.
        sentiment, tweet_text = line.split(',', 1)
        tweets.append((sentiment.strip(), preprocess_text(tweet_text.strip())))

tweets_text = [" ".join(tweet[1]) for tweet in tweets]
sentiments = [tweet[0] for tweet in tweets]

# Initialize CountVectorizer with max_features=500
vectorizer = CountVectorizer(max_features=500)

# Fit the vectorizer to the tweets and transform the data
transformed_tweets = vectorizer.fit_transform(tweets_text)

# Check the shape of the word count matrix
transformed_tweets

<6588x500 sparse matrix of type '<class 'numpy.int64'>'
	with 37334 stored elements in Compressed Sparse Row format>

In [11]:
count_vectorized_df = pd.DataFrame.sparse.from_spmatrix(transformed_tweets)

# Retrieve feature names from the vectorizer and set them as column names
count_vectorized_df.columns = vectorizer.get_feature_names_out()

# Optionally, if you have the sentiments and wish to include them as a column in the DataFrame
count_vectorized_df['sentiment'] = sentiments

print(count_vectorized_df.iloc[:3,400:403].to_markdown())


|    |   someth |   son |   song |
|---:|---------:|------:|-------:|
|  0 |        0 |     0 |      0 |
|  1 |        0 |     0 |      0 |
|  2 |        0 |     0 |      0 |


In [12]:
fourth_tweet_token_counts = count_vectorized_df.iloc[3]


fourth_tweet_token_counts_1 = fourth_tweet_token_counts[fourth_tweet_token_counts == 1]

# Display the filtered token counts
print(fourth_tweet_token_counts_1)

cant    1
deal    1
end     1
find    1
keep    1
like    1
may     1
say     1
talk    1
Name: 3, dtype: Sparse[object, 0]


In [13]:
# Sort the sums in descending order to get the most frequent tokens


token_frequencies = count_vectorized_df.drop(columns=['sentiment']).sum().sort_values(ascending=False)

# Select the 15 most used tokens
top_15_tokens = token_frequencies.head(15)

top_15_tokens

tomorrow    1126
go           733
day          667
night        641
may          533
tonight      501
see          439
time         429
im           422
get          398
today        389
game         382
saturday     379
friday       375
sunday       368
dtype: int64

In [15]:
count_vectorized_df['label'] = sentiments
sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
count_vectorized_df['label'] = count_vectorized_df['label'].map(sentiment_mapping)

count_vectorized_df
print(count_vectorized_df.iloc[350:354,499:502].to_markdown())

|     |   your | sentiment   |   label |
|----:|-------:|:------------|--------:|
| 350 |      0 | positive    |       1 |
| 351 |      1 | negative    |      -1 |
| 352 |      0 | positive    |       1 |
| 353 |      0 | neutral     |       0 |
