# Data Exploration: Begin by exploring the dataset. What are the different topics/categories 
present in the dataset? What is the distribution of articles across these topics?

In [7]:
# Import necessary libraries
import pandas as pd

# Load the dataset
dataset_path = "/Users/anubhavshroti/Desktop/AI Solutions/NLP day 2 dataset abcnews-date-text.csv"  # Replace with the actual path
df = pd.read_csv(dataset_path)

# Data Exploration
column_names = df.columns
print("Column names in the dataset:", column_names)

# Assuming the first column contains the categories
category_column = column_names[0]
print("\nCategories present in the dataset:")
print(df[category_column].unique())

print("\nDistribution of articles across topics:")
print(df[category_column].value_counts())


Column names in the dataset: Index(['publish_date', 'headline_text'], dtype='object')

Categories present in the dataset:
[20030219 20030220 20030221 ... 20211229 20211230 20211231]

Distribution of articles across topics:
publish_date
20120824    384
20130412    383
20110222    380
20120814    379
20130514    378
           ... 
20210605      6
20211023      5
20210515      5
20210806      1
20170209      1
Name: count, Length: 6882, dtype: int64


# Bag-of-Words (BoW): Implement a Bag-of-Words (BoW) model using CountVectorizer 
or TF-IDF to transform the text data into numerical features. Discuss the advantages and 
limitations of BoW in this context. Apply both unigram and bigram techniques and 
compare their effects on classification accuracy

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
dataset_path = "/Users/anubhavshroti/Desktop/AI Solutions/NLP day 2 dataset abcnews-date-text.csv"  # Replace with the actual path
df = pd.read_csv(dataset_path)

# Create a temporary 'category' column with a placeholder value for demonstration
df['category'] = 'placeholder_category'

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['headline_text'], df['category'], test_size=0.2, random_state=42)

# Bag-of-Words (BoW) with unigram
vectorizer = CountVectorizer()
X_train_bow_unigram = vectorizer.fit_transform(X_train)
X_test_bow_unigram = vectorizer.transform(X_test)

# Bag-of-Words (BoW) with bigram
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))
X_train_bow_bigram = vectorizer_bigram.fit_transform(X_train)
X_test_bow_bigram = vectorizer_bigram.transform(X_test)

# TF-IDF with unigram
tfidf_vectorizer_unigram = TfidfVectorizer()
X_train_tfidf_unigram = tfidf_vectorizer_unigram.fit_transform(X_train)
X_test_tfidf_unigram = tfidf_vectorizer_unigram.transform(X_test)

# TF-IDF with bigram
tfidf_vectorizer_bigram = TfidfVectorizer(ngram_range=(2, 2))
X_train_tfidf_bigram = tfidf_vectorizer_bigram.fit_transform(X_train)
X_test_tfidf_bigram = tfidf_vectorizer_bigram.transform(X_test)

# Classification using Naive Bayes
clf = MultinomialNB()

# Function to train and evaluate the classifier
def train_and_evaluate(X_train, X_test, y_train, y_test, vectorizer_type):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy for {vectorizer_type}: {acc:.4f}")

# Evaluate unigram BoW
train_and_evaluate(X_train_bow_unigram, X_test_bow_unigram, y_train, y_test, "BoW (Unigram)")

# Evaluate bigram BoW
train_and_evaluate(X_train_bow_bigram, X_test_bow_bigram, y_train, y_test, "BoW (Bigram)")

# Evaluate unigram TF-IDF
train_and_evaluate(X_train_tfidf_unigram, X_test_tfidf_unigram, y_train, y_test, "TF-IDF (Unigram)")

# Evaluate bigram TF-IDF
train_and_evaluate(X_train_tfidf_bigram, X_test_tfidf_bigram, y_train, y_test, "TF-IDF (Bigram)")



Accuracy for BoW (Unigram): 1.0000

Accuracy for BoW (Bigram): 1.0000

Accuracy for TF-IDF (Unigram): 1.0000

Accuracy for TF-IDF (Bigram): 1.0000


# N-grams: Explore the use of N-grams (bi-grams, tri-grams) in feature engineering. How do 
different N-gram ranges impact the performance of the classification model?

In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
dataset_path = "/Users/anubhavshroti/Desktop/AI Solutions/NLP day 2 dataset abcnews-date-text.csv"  # Replace with the actual path
df = pd.read_csv(dataset_path)

# Create a temporary 'category' column with a placeholder value for demonstration
df['category'] = 'placeholder_category'

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['headline_text'], df['category'], test_size=0.2, random_state=42)

# Function to train and evaluate the classifier with different N-gram ranges
def train_and_evaluate_ngram(X_train, X_test, y_train, y_test, ngram_range):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X_train_ngram = vectorizer.fit_transform(X_train)
    X_test_ngram = vectorizer.transform(X_test)

    clf = MultinomialNB()
    clf.fit(X_train_ngram, y_train)
    y_pred = clf.predict(X_test_ngram)
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy for {ngram_range}-gram: {acc:.4f}")

# Evaluate unigram
train_and_evaluate_ngram(X_train, X_test, y_train, y_test, (1, 1))

# Evaluate bigram
train_and_evaluate_ngram(X_train, X_test, y_train, y_test, (2, 2))

# Evaluate trigram
train_and_evaluate_ngram(X_train, X_test, y_train, y_test, (3, 3))

# Evaluate a combination of unigram, bigram, and trigram
train_and_evaluate_ngram(X_train, X_test, y_train, y_test, (1, 3))



Accuracy for (1, 1)-gram: 1.0000

Accuracy for (2, 2)-gram: 1.0000

Accuracy for (3, 3)-gram: 1.0000

Accuracy for (1, 3)-gram: 1.0000


# TF-IDF: Apply TF-IDF (Term Frequency-Inverse Document Frequency) to the text data. 
Describe how TF-IDF works and its significance in capturing the importance of words 
across documents. Compare the results of TF-IDF with the BoW approach.

In [13]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
dataset_path = "/Users/anubhavshroti/Desktop/AI Solutions/NLP day 2 dataset abcnews-date-text.csv"  # Replace with the actual path
df = pd.read_csv(dataset_path)

# Create a temporary 'category' column with a placeholder value for demonstration
df['category'] = 'placeholder_category'

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['headline_text'], df['category'], test_size=0.2, random_state=42)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Bag-of-Words (BoW)
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Classification using Naive Bayes
clf = MultinomialNB()

# Function to train and evaluate the classifier
def train_and_evaluate(X_train, X_test, y_train, y_test, vectorizer_type):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy for {vectorizer_type}: {acc:.4f}")

# Evaluate TF-IDF
train_and_evaluate(X_train_tfidf, X_test_tfidf, y_train, y_test, "TF-IDF")

# Evaluate Bag-of-Words
train_and_evaluate(X_train_bow, X_test_bow, y_train, y_test, "Bag-of-Words")



Accuracy for TF-IDF: 1.0000

Accuracy for Bag-of-Words: 1.0000


# One-Hot Encoding: Investigate the application of One-Hot Encoding to encode categorical 
variables or labels. Can One-Hot Encoding be used directly for text classification? Why or 
why not?

In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
dataset_path = "/Users/anubhavshroti/Desktop/AI Solutions/NLP day 2 dataset abcnews-date-text.csv"  # Replace with the actual path
df = pd.read_csv(dataset_path)

# Create a temporary 'category' column with a placeholder value for demonstration
df['category'] = 'placeholder_category'

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['headline_text'], df['category'], test_size=0.2, random_state=42)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Bag-of-Words (BoW)
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Classification using Naive Bayes
clf = MultinomialNB()

# Function to train and evaluate the classifier
def train_and_evaluate(X_train, X_test, y_train, y_test, vectorizer_type):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy for {vectorizer_type}: {acc:.4f}")

# Evaluate TF-IDF
train_and_evaluate(X_train_tfidf, X_test_tfidf, y_train, y_test, "TF-IDF")

# Evaluate Bag-of-Words
train_and_evaluate(X_train_bow, X_test_bow, y_train, y_test, "Bag-of-Words")



Accuracy for TF-IDF: 1.0000

Accuracy for Bag-of-Words: 1.0000
