# Exercise 7: Bag of words

In [1]:
import string
import warnings

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Preprocessing function from Exercise 6
def preprocess_text(text):
    """Turn raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop tokens found
    in NLTK's English stop-word list, and Porter-stem what is left.

    Parameters
    ----------
    text : str
        The raw text to normalise.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    # Punctuation is removed before tokenizing, so "I'm" becomes "im".
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)

    words = word_tokenize(cleaned)
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    return ' '.join(
        porter.stem(word) for word in words if word not in english_stop_words
    )


In [6]:
# Load tweets from file
def load_tweets(filename):
    """
    Load labelled tweets from a text file such as tweets_train.txt.

    Format: sentiment, tweet_text  (one tweet per line)

    Blank lines are ignored. Only the first ``', '`` on a line is treated as
    the separator, so commas inside the tweet text are preserved.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    tuple[list[str], list[int]]
        ``(tweets, labels)`` in file order. A label is 1 for positive,
        -1 for negative and 0 for neutral or any unrecognised sentiment.

    Warns
    -----
    UserWarning
        Once if any non-blank line had no ``', '`` separator (such lines are
        skipped), and once if any sentiment value was not one of
        positive / negative / neutral (such tweets are labelled 0).
    """
    label_by_sentiment = {'positive': 1, 'negative': -1, 'neutral': 0}
    tweets = []
    labels = []
    skipped_lines = 0
    unknown_sentiments = set()

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Split only on first comma to preserve commas in tweet text
            parts = line.split(', ', 1)
            if len(parts) != 2:
                # Keep the best-effort behaviour (skip), but count it so the
                # caller is told that rows were lost.
                skipped_lines += 1
                continue

            sentiment, tweet = parts
            # Match case- and whitespace-insensitively so that "Positive" or
            # "positive " is not silently counted as neutral.
            sentiment_key = sentiment.strip().lower()
            if sentiment_key not in label_by_sentiment:
                unknown_sentiments.add(sentiment)

            tweets.append(tweet)
            # Convert sentiment to numeric label; unknown values fall back to 0
            labels.append(label_by_sentiment.get(sentiment_key, 0))

    if skipped_lines:
        warnings.warn(
            f"{filename}: skipped {skipped_lines} line(s) without a ', ' separator",
            stacklevel=2,
        )
    if unknown_sentiments:
        warnings.warn(
            f"{filename}: unrecognised sentiment value(s) labelled as neutral (0): "
            f"{sorted(unknown_sentiments)}",
            stacklevel=2,
        )

    return tweets, labels
# Presumed download location of tweets_train.txt (kept for reference):
# data_url = "https://learn.zone01kisumu.ke/api/content/root/public/subjects/ai/nlp/resources/tweets_train.txt"
# Load the data
tweets, labels = load_tweets('tweets_train.txt')

print(f"Loaded {len(tweets)} tweets")
print(f"\nFirst 3 tweets:")
# Slice rather than index 0..2 so a file with fewer than three tweets shows
# what it has instead of raising IndexError.
for i, (label, tweet) in enumerate(zip(labels[:3], tweets[:3]), start=1):
    print(f"{i}. [{label}] {tweet[:60]}...")

# Preprocess all tweets
preprocessed_tweets = [preprocess_text(tweet) for tweet in tweets]

print(f"\nExample preprocessing:")
if tweets:
    print(f"Original: {tweets[0]}")
    print(f"Preprocessed: {preprocessed_tweets[0]}")
else:
    # Nothing was loaded, so there is no first tweet to show.
    print("(no tweets loaded)")

Loaded 6588 tweets

First 3 tweets:
1. [1] Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sa...
2. [-1] Theo Walcott is still shit, watch Rafa and Johnny deal with ...
3. [-1] its not that I'm a GSP fan, i just hate Nick Diaz. can't wai...

Example preprocessing:
Original: Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)
Preprocessed: ga hous hit 339 im go chapel hill sat


In [7]:
# Bag-of-words counts, with the vocabulary capped at 500 features
vectorizer = CountVectorizer(max_features=500)
X = vectorizer.fit_transform(preprocessed_tweets)

# One row per tweet, one column per vocabulary term
n_tweets, n_features = X.shape
print(f"\nShape of word count matrix: {X.shape}")
print(f"(rows: {n_tweets} tweets, columns: {n_features} features)")


Shape of word count matrix: (6588, 500)
(rows: 6588 tweets, columns: 500 features)


In [8]:
 # Create DataFrame from sparse matrix
feature_names = vectorizer.get_feature_names_out()
count_vectorized_df = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names)

# Preview only the top-left corner of the frame
preview = count_vectorized_df.iloc[:5, :10]
print("\nWord count DataFrame (first 5 rows, first 10 columns):")
print(preview)



Word count DataFrame (first 5 rows, first 10 columns):
   10  10th  12  15  15th  16  19th  1st  2012  22
0   0     0   0   0     0   0     0    0     0   0
1   0     0   0   0     0   0     0    0     0   0
2   0     0   0   0     0   0     0    0     0   0
3   0     0   0   0     0   0     0    0     0   0
4   0     0   0   0     0   0     0    0     0   0


In [9]:
# Non-zero token counts for the 4th tweet (row index 3)
print("\n--- Token counts of the 4th tweet ---")
fourth_row = count_vectorized_df.iloc[3]
fourth_tweet_counts = fourth_row[fourth_row > 0]
print(f"Original tweet: {tweets[3]}")
print(f"\nWord counts:")
ranked_counts = fourth_tweet_counts.sort_values(ascending=False)
print(ranked_counts)


--- Token counts of the 4th tweet ---
Original tweet: Iranian general says Israel's Iron Dome can't deal with their missiles (keep talking like that and we may end up finding out)

Word counts:
cant    1
deal    1
end     1
find    1
keep    1
like    1
may     1
say     1
talk    1
Name: 3, dtype: Sparse[int64, 0]


In [10]:
# Column totals give each term's frequency over all tweets; list the top 15
column_totals = count_vectorized_df.sum(axis=0)
word_counts = column_totals.sort_values(ascending=False)
print("\n--- 15 most used tokenized words ---")
print(word_counts.iloc[:15])


--- 15 most used tokenized words ---
tomorrow    1126
go           733
day          667
night        641
may          533
tonight      501
see          439
time         429
im           422
get          398
today        389
game         382
saturday     379
friday       375
sunday       368
dtype: Sparse[int64, 0]


In [11]:
# Attach each tweet's sentiment label as an extra 'label' column
count_vectorized_df = count_vectorized_df.assign(label=labels)

print("\n--- Final DataFrame with labels ---")
print(count_vectorized_df.iloc[:10])


--- Final DataFrame with labels ---
   10  10th  12  15  15th  16  19th  1st  2012  22  ...  would  yall  ye  \
0   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
1   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
2   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
3   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
4   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
5   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
6   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
7   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
8   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   
9   0     0   0   0     0   0     0    0     0   0  ...      0     0   0   

   year  yesterday  yet  york  young  your  label  
0     0          0    0     0      0     0      1  
1     0          0    

In [12]:
print("\n--- Label distribution ---")
label_dist = count_vectorized_df['label'].value_counts()
print(label_dist)

# Per-class totals; .get falls back to 0 for a class that never occurs
print()
for class_name, class_code in (("Positive", 1), ("Neutral", 0), ("Negative", -1)):
    print(f"{class_name}: {label_dist.get(class_code, 0)}")

# Show sample of final structure, with the label column placed first
print("\n--- Sample rows from final dataset ---")
label_column = count_vectorized_df[['label']]
leading_features = count_vectorized_df.iloc[:, :5]
print(label_column.join(leading_features).head())


--- Label distribution ---
label
 0    3236
 1    2413
-1     939
Name: count, dtype: int64

Positive: 2413
Neutral: 3236
Negative: 939

--- Sample rows from final dataset ---
   label  10  10th  12  15  15th
0      1   0     0   0   0     0
1     -1   0     0   0   0     0
2     -1   0     0   0   0     0
3     -1   0     0   0   0     0
4      1   0     0   0   0     0
