In [34]:
# Mount Google Drive at /content/drive so the dataset stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import shutil
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
# Install and import chardet, which is used only to detect the CSV's text encoding.
!pip install chardet
import chardet

# Detect the encoding from the file's raw bytes (the whole file is read into memory once).
with open('/content/drive/My Drive/NLP/lab 3/bbc-text.csv', 'rb') as f:
  result = chardet.detect(f.read())

# Load the dataset into a DataFrame, decoding with the detected encoding.
df = pd.read_csv('/content/drive/My Drive/NLP/lab 3/bbc-text.csv', encoding=result['encoding'])



In [37]:
# Select the 'text' column of the dataset; the trailing `#[:20]` is a disabled slice
# that restricts the run to the first 20 rows when re-enabled.
text_column = df['text']#[:20]
print(text_column)

0       tv future in the hands of viewers with home th...
1       worldcom boss  left books alone  former worldc...
2       tigers wary of farrell  gamble  leicester say ...
3       yeading face newcastle in fa cup premiership s...
4       ocean s twelve raids box office ocean s twelve...
                              ...                        
2220    cars pull down us retail figures us retail sal...
2221    kilroy unveils immigration policy ex-chatshow ...
2222    rem announce new glasgow concert us band rem h...
2223    how political squabbles snowball it s become c...
2224    souness delight at euro progress boss graeme s...
Name: text, Length: 2225, dtype: object


In [38]:
def remove_special_characters(text):
  """Return `text` with every listed punctuation/symbol character deleted.

  Characters are removed outright (not replaced by a space), so the pieces
  around them are joined: "ex-chatshow" becomes "exchatshow". Characters
  that are not in the list (for example '@' or '^') are left untouched.
  """
  unwanted = '.,!?#$%&*()+=-_[]{};:\'"/\\|<>`~'
  # A translation table that maps each unwanted character to None deletes
  # all of them in a single pass over the string.
  deletion_table = str.maketrans('', '', unwanted)
  return text.translate(deletion_table)

# text_without_specific_characters = []
# for text in text_column:
#   text_without_specific_characters.append(
#       remove_special_characters(str(text)))

# print(text_without_specific_characters)

In [39]:
def to_lower(text):
  """Return a lower-cased copy of `text`."""
  lowered = text.lower()
  return lowered

# text_lower = []
# for text in text_column:
#   text_lower.append(to_lower(str(text)))

# print(text_lower)

In [40]:
def remove_stop_words(text, stop_words=None):
  """Return `text` with stop words removed and the remaining words joined by single spaces.

  Args:
    text: Input string; it is split on whitespace.
    stop_words: Optional collection of lower-case words to drop. When omitted,
      NLTK's English stop-word list is used.

  Returns:
    The surviving words, in their original order and casing, joined by " ".
    A word is dropped when its lower-cased form is in `stop_words`.
  """
  if stop_words is None:
    # Loading the NLTK list reads the corpus file, so do it once and keep the
    # resulting set on the function object instead of rebuilding it per call.
    stop_words = getattr(remove_stop_words, "_english_stop_words", None)
    if stop_words is None:
      stop_words = frozenset(stopwords.words('english'))
      remove_stop_words._english_stop_words = stop_words
  return " ".join(word for word in text.split() if word.lower() not in stop_words)

# text_without_stop_words = []
# for text in text_column:
#   text_without_stop_words.append(remove_stop_words(str(text)))

# print(text_without_stop_words)

In [41]:
# Run the cleaning pipeline on every document:
# lower-case -> strip punctuation/symbols -> drop English stop words.
new_text = [
  remove_stop_words(remove_special_characters(to_lower(str(document))))
  for document in text_column
]

# NOTE: this prints every cleaned document, which is a very large output.
print(new_text)

Output hidden; open in https://colab.research.google.com to view.

In [42]:
# Split each cleaned document into a list of word tokens with NLTK's tokenizer.
tokenized_text = [nltk.word_tokenize(document) for document in new_text]

#print(tokenized_text)

In [43]:
def create_vocabulary(tokenized_text):
  """Return the set of distinct tokens that occur in any document.

  `tokenized_text` is an iterable of token sequences (one per document).
  """
  return {token for document_tokens in tokenized_text for token in document_tokens}

#print(create_vocabulary(tokenized_text))
# The vocabulary is built from every document in tokenized_text, i.e. from the
# whole corpus before it is split into train and test sets further down.
vocabulary = create_vocabulary(tokenized_text)

In [47]:
def document_to_bow(tokens, vocabulary):
    """Convert one tokenized document into a dense bag-of-words count vector.

    Args:
        tokens: Iterable of tokens for a single document.
        vocabulary: Either a sized collection of tokens (set, list, ...), whose
            iteration order defines the column order, or a ready-made
            ``{token: column_index}`` dict whose indices cover
            ``0 .. len(vocabulary) - 1``. Passing the dict lets a caller build
            the index once and reuse it for every document instead of having
            it rebuilt on each call.

    Returns:
        A 1-D float NumPy array of length ``len(vocabulary)`` holding the
        number of occurrences of each vocabulary token in `tokens`. Tokens
        that are not in the vocabulary are ignored.
    """
    if isinstance(vocabulary, dict):
        token_to_index = vocabulary
    else:
        # Enumerate the collection directly; no intermediate list copy is needed.
        token_to_index = {token: index for index, token in enumerate(vocabulary)}

    bow_vector = np.zeros(len(vocabulary))
    for token in tokens:
        index = token_to_index.get(token)
        if index is not None:
            bow_vector[index] += 1
    return bow_vector

# Hold out 20% of the documents for testing; random_state fixes the shuffle.
train_text, test_text, train_label, test_label = train_test_split(
    tokenized_text, df['category'], test_size=0.2, random_state=42
)

# Fix the token -> column assignment once, in the vocabulary's own iteration
# order, so the train and test matrices explicitly share one column layout and
# the mapping stays available for relating columns back to tokens.
# Note: `vocabulary` is a set of strings, so this order is stable within one
# interpreter session but is not guaranteed to be the same after a restart.
vocabulary_index = {token: index for index, token in enumerate(vocabulary)}

bow_train = np.array([document_to_bow(tokens, vocabulary_index) for tokens in train_text])
bow_test = np.array([document_to_bow(tokens, vocabulary_index) for tokens in test_text])

print(bow_train[0])
print(bow_test[0])

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [52]:
def calculate_tf(tokens):
    """Compute the relative term frequency of each token in one document.

    Args:
        tokens: Sequence of tokens for a single document.

    Returns:
        Dict mapping each distinct token to ``count / len(tokens)``, with keys
        in order of first occurrence. An empty document yields an empty dict.
    """
    occurrences = {}
    for term in tokens:
        occurrences[term] = occurrences.get(term, 0) + 1
    document_length = len(tokens)
    return {term: count / document_length for term, count in occurrences.items()}

In [53]:
def calculate_idf(corpus):
    """Compute the inverse document frequency of every token in the corpus.

    Args:
        corpus: Sequence of token sequences, one per document.

    Returns:
        Dict mapping each token to ``log(N / df)``, where ``N`` is the number
        of documents and ``df`` the number of documents containing the token.
        A token present in every document therefore gets 0.
    """
    document_counts = {}
    for document in corpus:
        # set() ensures a token repeated inside one document is counted once.
        for term in set(document):
            document_counts[term] = document_counts.get(term, 0) + 1
    corpus_size = len(corpus)
    return {term: np.log(corpus_size / count) for term, count in document_counts.items()}

In [54]:
def calculate_tfidf(corpus):
    """Compute a sparse TF-IDF vector for every document in the corpus.

    Args:
        corpus: Sequence of token sequences, one per document.

    Returns:
        List with one dict per document (same order as `corpus`), mapping each
        token of that document to its term frequency times the corpus-wide IDF.
    """
    idf_by_token = calculate_idf(corpus)
    return [
        {
            token: tf_value * idf_by_token.get(token, 0)
            for token, tf_value in calculate_tf(document).items()
        }
        for document in corpus
    ]

In [55]:
def analyze_tfidf(tfidf_vectors, labels, top_n=10, tf_threshold=0.1, idf_threshold=1):
    """Print the top TF-IDF words per category and flag TF/IDF outlier tokens.

    Two reports are printed:

    1. For each category, the `top_n` tokens with the highest TF-IDF score
       averaged over the category's documents that contain the token.
    2. For each category, up to `top_n` tokens whose mean term frequency is
       above `tf_threshold` while their IDF is below `idf_threshold`
       ("High TF, Low IDF"), and up to `top_n` tokens in the opposite situation
       ("Low TF, High IDF").

    Args:
        tfidf_vectors: List of ``{token: tf * idf}`` dicts, one per document.
            Assumes the vectors cover the whole corpus the IDF was computed on
            and hold an entry for every token of their document; document
            frequencies and IDF values are reconstructed from them on that basis.
        labels: Category of each document, aligned by position with
            `tfidf_vectors` (a pandas Series, list, or any other iterable).
        top_n: Maximum number of tokens listed per report line or group.
        tf_threshold: Boundary between "low" and "high" mean term frequency.
        idf_threshold: Boundary between "low" and "high" IDF.

    Returns:
        None. The results are printed.
    """
    # Materialise the labels so they can be indexed by position whatever their type.
    labels = list(labels)

    # Collect, per category and per token, the TF-IDF scores of the category's documents.
    category_scores = {}
    for i, vector in enumerate(tfidf_vectors):
        scores_by_token = category_scores.setdefault(labels[i], {})
        for token, score in vector.items():
            scores_by_token.setdefault(token, []).append(score)

    for category, scores_by_token in category_scores.items():
        avg_tfidf_scores = {token: np.mean(scores) for token, scores in scores_by_token.items()}
        top_words = sorted(avg_tfidf_scores.items(), key=lambda item: item[1], reverse=True)[:top_n]
        print(f"Top {top_n} words for category '{category}': {top_words}")

    # Corpus-wide document frequency, computed once instead of re-scanning the
    # vectors for every token occurrence.
    total_documents = len(tfidf_vectors)
    document_frequency = {}
    for vector in tfidf_vectors:
        for token in vector:
            document_frequency[token] = document_frequency.get(token, 0) + 1

    for category, scores_by_token in category_scores.items():
        high_tf_low_idf = []
        low_tf_high_idf = []
        for token, scores in scores_by_token.items():
            idf_value = np.log(total_documents / document_frequency[token])
            if idf_value <= 0:
                # The token occurs in every document, so all its scores are 0
                # and the term frequency cannot be recovered from them.
                continue
            # score = tf * idf with one idf per token, so mean(score) / idf is
            # the mean term frequency over the category's documents with the token.
            tf_value = np.mean(scores) / idf_value
            if tf_value > tf_threshold and idf_value < idf_threshold:
                high_tf_low_idf.append((token, tf_value, idf_value))
            elif tf_value < tf_threshold and idf_value > idf_threshold:
                low_tf_high_idf.append((token, tf_value, idf_value))

        # Show the most extreme cases first and cap the listing at top_n rows each.
        high_tf_low_idf.sort(key=lambda row: row[1], reverse=True)
        low_tf_high_idf.sort(key=lambda row: row[2], reverse=True)
        if high_tf_low_idf or low_tf_high_idf:
            print(f"TF/IDF outliers for category '{category}':")
        for token, tf_value, idf_value in high_tf_low_idf[:top_n]:
            print(f"High TF, Low IDF: Token: {token}, TF: {tf_value}, IDF: {idf_value}")
        for token, tf_value, idf_value in low_tf_high_idf[:top_n]:
            print(f"Low TF, High IDF: Token: {token}, TF: {tf_value}, IDF: {idf_value}")

In [57]:
# Compute TF-IDF vectors for every document in the full corpus, then print the
# per-category analysis using the dataset's 'category' column as labels.
tfidf_vectors = calculate_tfidf(tokenized_text)
analyze_tfidf(tfidf_vectors, df['category'])

Top 10 words for category 'tech': [('uwb', 0.3286088141535323), ('p2p', 0.3120081500654701), ('posters', 0.2981444038226519), ('espn', 0.2946989956758954), ('raskin', 0.29269033650381043), ('argonaut', 0.2771294808725241), ('yoran', 0.2740448780302343), ('lifts', 0.2696920106425332), ('jacobsen', 0.26425756095772596), ('commodore', 0.25792569837241786)]
Top 10 words for category 'business': [('metlife', 0.536174761363502), ('rossignol', 0.5089866543603998), ('ssl', 0.502663838778283), ('nestle', 0.45787201156041624), ('wipro', 0.45338307027060826), ('7e7', 0.4446641650730966), ('kronor', 0.437305656431225), ('feta', 0.43564199360784533), ('peoplesoft', 0.42391317070301876), ('cocoa', 0.4166222807892076)]
Top 10 words for category 'sport': [('curbishley', 0.4972588512645381), ('ivanovic', 0.4878172275063507), ('cantona', 0.4861441098839878), ('soderling', 0.4587804877738298), ('solskjaer', 0.41260970670825853), ('newry', 0.408731707289412), ('goldfine', 0.4056585365579126), ('koubek', 0