In [68]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.model_selection import train_test_split
import csv 

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer


def clean_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    wordnet_lemmatizer = WordNetLemmatizer()

    words = [wordnet_lemmatizer.lemmatize(word) for word in words]

    # Join the remaining words back into a string
    cleaned_text = " ".join(words)
    
    return cleaned_text

def clean_csv(input_filename, output_filename, column, disable):
    with open(input_filename, 'r', encoding='utf-8') as input_file, open(output_filename, 'w', newline='', encoding='utf-8') as output_file:
        reader = csv.reader(input_file)
        writer = csv.writer(output_file)
        for row in reader:
            if not disable:
              row[column] = clean_text(row[column])
            writer.writerow(row)
  
def keep_columns(csv_file_path, output_file_path):
    f=pd.read_csv(csv_file_path)
    keep_col = ['text','Category']
    new_f = f[keep_col]
    new_f.to_csv(output_file_path, index=False)


def split_csv_file(csv_file_path, test_size=0.5, random_state=42):
    data = pd.read_csv(csv_file_path)
    
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)
    
    return train_data, test_data

def parse_train_data(filepath):
    fp = open(filepath)
    fp.read()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
dataset_path = "/content/sample_data/bbc_dataset.csv"
clean_path = "/content/sample_data/clean.csv"

train_path = "/content/sample_data/train.csv"
test_path = "/content/sample_data/test.csv"

keep_columns("/content/sample_data/bbc_train.csv",dataset_path)

clean_csv(dataset_path,clean_path, 0, disable = True)
train_data, test_data = split_csv_file(clean_path, test_size=0.2)

train_data.to_csv(train_path, index=False)


In [70]:
import math
import pandas as pd
from collections import Counter

def tf_icf_from_csv(csv_file):
    df = pd.read_csv(csv_file)

    # Step 1: Calculate Term Frequency (TF)
    docs = df['text'].tolist()
    tf = Counter([word for doc in docs for word in doc.split()])
    max_tf = max(tf.values()) # Max frequency in the document

    # Step 2: Calculate Inverse Class Frequency (ICF)
    classes = df.groupby('Category')['text'].apply(list).tolist()
    N = len(classes)
    df_term = {term: sum(1 for doc in docs if term in doc) for term in tf}
    icf = {term: math.log(N / df_term[term]) if df_term[term] > 0 else 0 for term in tf}

    # Step 3: Calculate TF-ICF score
    tf_icf = {term: (tf[term] / max_tf) * icf[term] for term in tf}

    return tf_icf

#tf_icf = tf_icf_from_csv(train_path)

In [71]:
tf_icf

{'willis': 0.001371548512498185,
 'sue': -0.017184001147528976,
 'movie': -0.10185119643272382,
 'injury': -0.09815095573129315,
 'actor': -0.1552780548461333,
 'bruce': -0.004300319761496483,
 'suing': -0.0015358284862487441,
 'revolution': -0.0046074854587462325,
 'studio': -0.03522891408484407,
 'said': -5.177279038170981,
 'suffered': -0.022379200175667174,
 'making': -0.12478950278688296,
 'tear': -0.0028729582669728216,
 'sun': -0.03216673959031825,
 'seeking': -0.016257039213640686,
 'medical': -0.008789223879960464,
 'expense': -0.0036564923397835914,
 'hit': -0.22323191961523892,
 'head': -0.10585997188824556,
 'firework': 0.0005467128471802835,
 'filming': -0.00032635123531137456,
 '2002': -0.06047414788192976,
 'produced': -0.022379200175667174,
 'firm': -0.5693341930062104,
 'lawsuit': -0.01214507110477292,
 'star': -0.2710061346742598,
 'endured': -0.0017535401697557247,
 'mental': -0.0056791077186877665,
 'physical': -0.008024943542572598,
 'result': -0.16337971491134462,

In [72]:
import math
import pandas as pd
from collections import Counter

def naive_bayes_tf_icf_train(csv_file):
    # Step 1: Calculate the TF-ICF scores for each term in the training set
    tf_icf_scores = tf_icf_from_csv(csv_file)

    # Step 2: Calculate the class prior probabilities
    df = pd.read_csv(csv_file)
    prior = df['Category'].value_counts(normalize=True).to_dict()

    # Step 3: Calculate the class conditional probabilities
    cond_prob = {}
    for c in prior.keys():
        docs_c = df.loc[df['text'] == c, 'Category'].tolist()
        tf_c = Counter([word for doc in docs_c for word in doc.split()])
        num_c = sum(tf_c.values())
        cond_prob[c] = {}
        for term in tf_icf_scores.keys():
            if term in tf_c:
                tf_icf = tf_icf_scores[term]
                tf = tf_c[term]
                prob = (tf + 1) / (num_c + len(tf_c))
                cond_prob[c][term] = prob * tf_icf
            else:
                cond_prob[c][term] = 0

    return prior, cond_prob

In [73]:
prior, cond_prob = naive_bayes_tf_icf_train(train_path)

In [74]:
prior

{'sport': 0.23741610738255034,
 'business': 0.21895973154362416,
 'entertainment': 0.19043624161073824,
 'politics': 0.18288590604026847,
 'tech': 0.17030201342281878}

In [75]:
import sys
def test_naive_bayes_tf_icf(doc, prior, cond_prob):
    # Step 4: Calculate the posterior probability for each class
    posterior = {}
    for c in prior:
        posterior[c] = math.log(prior[c])
        for term in doc:
            if term in cond_prob[c] :
                posterior[c] += math.log(cond_prob[c][term]+ sys.float_info.epsilon)

    # Step 5: Predict the class with the highest posterior probability
    return max(posterior, key=posterior.get)


In [76]:
score = 0
total = 0
for index, test_case in test_data.iterrows():
  total += 1
  result = test_naive_bayes_tf_icf(test_case['text'], prior, cond_prob)
  if result == test_case['Category']:
    score += 1

print("% correct =", score/total*100)

% correct = 21.140939597315437
