In [1]:
# Creating example documents
doc_1 = "A whopping 96.5 percent of water on Earth is in our oceans, covering 71 percent of the surface of our planet. And at any given time, about 0.001 percent is floating above us in the atmosphere. If all of that water fell as rain at once, the whole planet would get about 1 inch of rain."

doc_2 = "One-third of your life is spent sleeping. Sleeping 7-9 hours each night should help your body heal itself, activate the immune system, and give your heart a break. Beyond that--sleep experts are still trying to learn more about what happens once we fall asleep."

doc_3 = "A newborn baby is 78 percent water. Adults are 55-60 percent water. Water is involved in just about everything our body does."

doc_4 = "While still in high school, a student went 264.4 hours without sleep, for which he won first place in the 10th Annual Great San Diego Science Fair in 1964."

doc_5 = "We experience water in all three states: solid ice, liquid water, and gas water vapor."

# Create corpus
corpus = [doc_1, doc_2, doc_3, doc_4, doc_5]

In [2]:
# Code source: https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')  
nltk.download('omw-1.4')  
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# remove stopwords, punctuation, and normalize the corpus
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

clean_corpus = [clean(doc).split() for doc in corpus]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
print(clean_corpus)

[['whopping', '965', 'percent', 'water', 'earth', 'ocean', 'covering', '71', 'percent', 'surface', 'planet', 'given', 'time', '0001', 'percent', 'floating', 'u', 'atmosphere', 'water', 'fell', 'rain', 'once', 'whole', 'planet', 'would', 'get', '1', 'inch', 'rain'], ['onethird', 'life', 'spent', 'sleeping', 'sleeping', '79', 'hour', 'night', 'help', 'body', 'heal', 'itself', 'activate', 'immune', 'system', 'give', 'heart', 'break', 'beyond', 'thatsleep', 'expert', 'still', 'trying', 'learn', 'happens', 'fall', 'asleep'], ['newborn', 'baby', '78', 'percent', 'water', 'adult', '5560', 'percent', 'water', 'water', 'involved', 'everything', 'body', 'doe'], ['still', 'high', 'school', 'student', 'went', '2644', 'hour', 'without', 'sleep', 'first', 'place', '10th', 'annual', 'great', 'san', 'diego', 'science', 'fair', '1964'], ['experience', 'water', 'three', 'state', 'solid', 'ice', 'liquid', 'water', 'gas', 'water', 'vapor']]


In [None]:
from gensim import corpora

# Creating document-term matrix 
dictionary = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]

In [None]:
print(doc_term_matrix)

In [None]:
from gensim.models import LdaModel

# LDA model
lda = LdaModel(doc_term_matrix, num_topics=3, id2word = dictionary)

# Results
print(lda.print_topics(num_topics=3, num_words=3))

"""
[
(0, '0.071*"water" + 0.025*"state" + 0.025*"three"'), 
(1, '0.030*"still" + 0.028*"hour" + 0.026*"sleeping"'), 
(2, '0.073*"percent" + 0.069*"water" + 0.031*"rain"')
]
"""

In [None]:
from gensim import corpora
from gensim.models import LdaModel
from pprint import pprint

# Example news articles dataset
news_articles = [
    "President Biden signed an executive order aimed at promoting competition in the US economy. The order includes measures to tackle corporate consolidation, improve competition in labor markets, and reduce barriers to entry for small businesses.",
    
    "Scientists have discovered a new species of butterfly in the Amazon rainforest. The butterfly, named Morpho helenor, has iridescent blue wings and is the first new species of butterfly found in the region in over a decade.",
    
    "The stock market experienced a sharp decline today, with major indices falling by over 2%. Analysts attribute the drop to concerns over rising inflation and the Federal Reserve's plans to tighten monetary policy.",
    
    "A new study suggests that regular exercise may help to reduce the risk of developing Alzheimer's disease. The study, published in the Journal of Neurology, found that people who engaged in physical activity had lower levels of amyloid-beta, a protein associated with Alzheimer's.",
    
    "Tesla CEO Elon Musk announced plans to build a new Gigafactory in Texas. The factory, which will produce batteries for electric vehicles, is expected to create thousands of jobs and further expand Tesla's manufacturing capacity."
]

# Tokenize the news articles (split into words)
tokenized_articles = [article.lower().split() for article in news_articles]

# Create a dictionary from the tokenized articles
dictionary = corpora.Dictionary(tokenized_articles)

# Create a corpus where each document is represented as a bag-of-words
corpus = [dictionary.doc2bow(article) for article in tokenized_articles]

# Specify the number of topics
num_topics = 3

# Build the LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

# Print the topics and their top words
pprint(lda_model.print_topics())


In [None]:
# New article
new_article = "A study reveals the benefits of meditation for mental health. Researchers found that regular meditation practice can reduce stress and anxiety levels."

# Tokenize the new article
tokenized_new_article = new_article.lower().split()

# Convert the tokenized new article into bag-of-words representation
new_article_bow = dictionary.doc2bow(tokenized_new_article)

# Infer the topic distribution for the new article
topic_distribution = lda_model.get_document_topics(new_article_bow)

# Assign the new article to the topic with the highest probability
max_prob_topic = max(topic_distribution, key=lambda x: x[1])

# Print the assigned topic
print("Assigned Topic:", max_prob_topic[0])


In [None]:
from gensim import corpora
from gensim.models import LdaModel

# Example news articles dataset
news_articles = [
    "President Biden signed an executive order aimed at promoting competition in the US economy. The order includes measures to tackle corporate consolidation, improve competition in labor markets, and reduce barriers to entry for small businesses.",
    "Scientists have discovered a new species of butterfly in the Amazon rainforest. The butterfly, named Morpho helenor, has iridescent blue wings and is the first new species of butterfly found in the region in over a decade.",
    "The stock market experienced a sharp decline today, with major indices falling by over 2%. Analysts attribute the drop to concerns over rising inflation and the Federal Reserve's plans to tighten monetary policy.",
    "A new study suggests that regular exercise may help to reduce the risk of developing Alzheimer's disease. The study, published in the Journal of Neurology, found that people who engaged in physical activity had lower levels of amyloid-beta, a protein associated with Alzheimer's.",
    "Tesla CEO Elon Musk announced plans to build a new Gigafactory in Texas. The factory, which will produce batteries for electric vehicles, is expected to create thousands of jobs and further expand Tesla's manufacturing capacity."
]

# Tokenize the news articles
tokenized_articles = [article.lower().split() for article in news_articles]

# Create a dictionary from the tokenized articles
dictionary = corpora.Dictionary(tokenized_articles)

# Create a bag-of-words representation of each article
corpus = [dictionary.doc2bow(article) for article in tokenized_articles]

# Train an LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, passes=10)

# Define the labels for each topic
topic_labels = {
    0: "politics",
    1: "economy",
    2: "tech"
}

# Assign topics to articles and output the results
for i, article_bow in enumerate(corpus):
    topic_distribution = lda_model.get_document_topics(article_bow)
    max_prob_topic = max(topic_distribution, key=lambda x: x[1])[0]
    assigned_topic = topic_labels[max_prob_topic]
    
    print(f"Article {i+1}: Topic - {assigned_topic}")
    print("Top 5 words:", [word for word, prob in lda_model.show_topic(max_prob_topic, topn=5)])
    print("Article:", news_articles[i])
    print()


In [None]:
from gensim import corpora, models

# Assume `documents` is your list of news articles
texts = [[word for word in document.lower().split()] for document in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Convert dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

# Apply LDA
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Extract top 5 words for each topic
topics = lda.print_topics(num_words=5)

for topic in topics:
    print(topic)


In [None]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, stem_text
from gensim import corpora, models

# Load the CSV file
file_path = 'D://APU Degree//Degree Sem 4//Investigations in Data Analytics//Assignment Work Done//3 IR Report//Dataset//Articles.csv'
articles_df = pd.read_csv(file_path)

# Define a preprocessing function that applies multiple preprocessing steps
def preprocess_text(text):
    custom_filters = [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, stem_text]
    return preprocess_string(text, custom_filters)

# Apply preprocessing to each article
articles_df['processed_text'] = articles_df['Article text'].apply(preprocess_text)

# Create a dictionary from the processed text
dictionary = corpora.Dictionary(articles_df['processed_text'])

# Filter out extremes to remove too frequent and too rare words
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in articles_df['processed_text']]

# Running LDA model
num_topics = 5
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# Display the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


In [None]:
def preprocess_and_categorize_new_document(new_document):
    # Preprocess the new document
    processed_doc = preprocess_text(new_document)
    
    # Convert to document-term matrix
    doc_bow = dictionary.doc2bow(processed_doc)
    
    # Use the LDA model to get the topic distribution
    doc_topics = lda_model.get_document_topics(doc_bow)
    
    # Sort the topics by probability
    doc_topics_sorted = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    
    # Get the most dominant topic
    dominant_topic = doc_topics_sorted[0] if doc_topics_sorted else None
    
    return dominant_topic

# Example usage with a new document
new_document = "The latest international news, featuring top stories from around the world and breaking news, as it happens."
dominant_topic = preprocess_and_categorize_new_document(new_document)

# Print the most dominant topic and its probability
print(f"Dominant Topic: {dominant_topic}")


In [None]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, stem_text
from gensim import corpora, models

# Load the CSV file
file_path = 'D://APU Degree//Degree Sem 4//Investigations in Data Analytics//Assignment Work Done//3 IR Report//Dataset//Articles.csv'
articles_df = pd.read_csv(file_path)

# Define a preprocessing function
def preprocess_text(text):
    custom_filters = [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, stem_text]
    return preprocess_string(text, custom_filters)

# Apply preprocessing
articles_df['processed_text'] = articles_df['Article text'].apply(preprocess_text)

# Create a dictionary
dictionary = corpora.Dictionary(articles_df['processed_text'])
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in articles_df['processed_text']]

# Run LDA
num_topics = 5
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Function to preprocess and categorize new documents
def preprocess_and_categorize_new_document(new_document):
    processed_doc = preprocess_text(new_document)
    doc_bow = dictionary.doc2bow(processed_doc)
    doc_topics = lda_model.get_document_topics(doc_bow)
    doc_topics_sorted = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    dominant_topic = doc_topics_sorted[0] if doc_topics_sorted else None
    return dominant_topic

# Example of categorizing a new document
new_document = "The latest international news, featuring top stories from around the world and breaking news, as it happens."
dominant_topic = preprocess_and_categorize_new_document(new_document)

# Print the most dominant topic and its probability
print(f"Dominant Topic: {dominant_topic}")


In [None]:
import pandas as pd

# Assuming your dataset is loaded into a DataFrame df
# Replace 'path_to_your_dataset.csv' with your actual file path or use pd.read_excel() for Excel files
# df = pd.read_csv('path_to_your_dataset.csv')

# Example DataFrame creation (replace with your actual data loading code)
data = {
    'text': ['cnn breaking news', 'some text with cnn mentioned', 'no cnn here'],
    'headline': ['headline 1', 'headline 2', 'headline 3'],
    'category': ['news', 'news', 'other']
}
df = pd.DataFrame(data)

# Function to remove 'cnn' from the 'text' column
def remove_cnn(text):
    return text.replace('cnn', '')

# Apply the function to the 'text' column
df['text'] = df['text'].apply(remove_cnn)

# Display the modified DataFrame
print(df)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Sample headlines
headlines = [
    "Stocks soar to record highs",
    "Company reports disappointing earnings",
    "New product launch receives mixed reviews",
    "Economic outlook remains uncertain",
    "Innovative technology set to disrupt industry",
    "Market volatility causes investor anxiety",
    "Company announces major breakthrough",
    "Natural disaster impacts local economy",
    "Positive quarter boosts investor confidence",
    "Regulatory changes create market uncertainty"
]

# Create a DataFrame
df = pd.DataFrame(headlines, columns=['headline'])

# Preprocess text
def preprocess_text(text):
    return text.lower()

df['headline'] = df['headline'].apply(preprocess_text)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['headline'])

# Apply K-means clustering
num_clusters = 3  # We aim for 3 clusters: positive, neutral, negative
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# Function to create word cloud for each cluster
def plot_word_cloud(cluster_num):
    text = " ".join(df[df['cluster'] == cluster_num]['headline'].values)
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for Cluster {cluster_num}")
    plt.show()

# Plot word clouds for each cluster
for cluster_num in range(num_clusters):
    plot_word_cloud(cluster_num)

# Manually label clusters based on word clouds
cluster_labels = {0: 'positive', 1: 'neutral', 2: 'negative'}
df['sentiment'] = df['cluster'].map(cluster_labels)

# Display the labeled data
print(df[['headline', 'sentiment']])


In [7]:
#install needed packages
!pip install snorkel
!pip install textblob
#import libraries and modules
import io
import pandas as pd
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Collecting snorkel
  Using cached snorkel-0.10.0-py3-none-any.whl (103 kB)
Installing collected packages: snorkel
Successfully installed snorkel-0.10.0






[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# #uplaod the data from your local directory
# uploaded = files.upload()
# # store the dataset as a Pandas Dataframe
# df = pd.read_csv(io.BytesIO(uploaded['data.csv']))
# #conduct some data cleaning
# df = df.drop(['publish_date', 'Unnamed: 2'], axis=1)
# df = df.rename(columns = {'headline_text': 'text'})
# df['text'] = df['text'].astype(str)
# #check the data info
# df.info()


# load the dataset
df = pd.read_csv('Cleaned_News_Articles_Final2.csv')
df['headline'] = df['headline'].astype(str)


In [9]:
#define constants to represent the class labels :positive, negative, and abstain
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1
#define function which looks into the input words to represent a proper label
def keyword_lookup(x, keywords, label):  
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN
#define function which assigns a correct label
def make_keyword_lf(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))
#resource: https://www.snorkel.org/use-cases/01-spam-tutorial#3-writing-more-labeling-functions
#these two lists can be further extended 
"""positive news might contain the following words' """
keyword_positive = make_keyword_lf(keywords=['boosts', 'great', 'develops', 'promising', 'ambitious', 'delighted', 'record', 'win', 'breakthrough', 'recover', 'achievement', 'peace', 'party', 'hope', 'flourish', 'respect', 'partnership', 'champion', 'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 'complete', 'assured' ])
"""negative news might contain the following words"""
keyword_negative = make_keyword_lf(keywords=['war','solidiers', 'turmoil', 'injur','trouble', 'aggressive', 'killed', 'coup', 'evasion', 'strike', 'troops', 'dismisses', 'attacks', 'defeat', 'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 'accusations', 'victims',  'death', 'unrest', 'fraud', 'dispute', 'destruction', 'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 'poison', 'unfair', 'unhealthy'
                                              ], label=NEGATIVE)

In [10]:
#set up a preprocessor function to determine polarity & subjectivity using textlob pretrained classifier 
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x
#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.6 else ABSTAIN
#find subjectivity 
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

In [11]:
#combine all the labeling functions 
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity ]
#apply the lfs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)
#apply the label model
label_model = LabelModel(cardinality=2, verbose=True)
#fit on the data
label_model.fit(L_snorkel)
#predict and create the labels
df["label"] = label_model.predict(L=L_snorkel)

100%|██████████| 41795/41795 [02:48<00:00, 248.44it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=2.663]
  9%|▉         | 9/100 [00:00<00:01, 88.25epoch/s]INFO:root:[10 epochs]: TRAIN:[loss=0.923]
INFO:root:[20 epochs]: TRAIN:[loss=0.341]
INFO:root:[30 epochs]: TRAIN:[loss=0.006]
INFO:root:[40 epochs]: TRAIN:[loss=0.044]
INFO:root:[50 epochs]: TRAIN:[loss=0.014]
INFO:root:[60 epochs]: TRAIN:[loss=0.000]
INFO:root:[70 epochs]: TRAIN:[loss=0.002]
 71%|███████   | 71/100 [00:00<00:00, 396.19epoch/s]INFO:root:[80 epochs]: TRAIN:[loss=0.001]
INFO:root:[90 epochs]: TRAIN:[loss=0.000]
100%|██████████| 100/100 [00:00<00:00, 406.53epoch/s]
INFO:root:Finished Training


In [12]:
df.to_csv("test.csv", index=False)

In [13]:
# print each unique counts of the column (target values)
print("Unique count in category column:")
print(df['label'].value_counts())

Unique count in category column:
 0    37700
 1     3349
-1      746
Name: label, dtype: int64


In [None]:
#Filtering out unlabeled data points
df= df.loc[df.label.isin([0,1]), :]
#find the label counts 
df['label'].value_counts()

In [2]:
# Import necessary libraries
!pip install vaderSentiment

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Sample data (replace this with your actual data)
data = {
    'text': [
        'I love this product! It works great and is very affordable.',
        'This is the worst experience I have ever had.',
        'The product is okay, not too bad but not great either.',
        'Absolutely fantastic! Exceeded my expectations.',
        'I am not sure how I feel about this, it’s just average.',
        'Terrible! I will never buy this again.'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Define a function to classify sentiment
def classify_sentiment(text):
    # Get the sentiment scores
    scores = analyzer.polarity_scores(text)
    # Get the compound score
    compound = scores['compound']
    # Classify the sentiment based on the compound score
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the sentiment classification function to the DataFrame
df['sentiment'] = df['text'].apply(classify_sentiment)

# Display the DataFrame with sentiment labels
print(df)


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
                                              0.0/126.0 kB ? eta -:--:--
     -------------------------------------  122.9/126.0 kB 2.4 MB/s eta 0:00:01
     -------------------------------------- 126.0/126.0 kB 1.8 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
                                                text sentiment
0  I love this product! It works great and is ver...  positive
1      This is the worst experience I have ever had.  negative
2  The product is okay, not too bad but not great...  negative
3    Absolutely fantastic! Exceeded my expectations.  positive
4  I am not sure how I feel about this, it’s just...  negative
5             Terrible! I will never buy this again.  negative




In [5]:
print(df['sentiment'].value_counts())

positive    37192
neutral      2546
negative     2057
Name: sentiment, dtype: int64


In [12]:
import nltk
nltk.download('words')
from nltk.corpus import words
from nltk.tokenize import word_tokenize
nltk.download('punkt')

import pandas as pd

# load the dataset
df = pd.read_csv('aug_final.csv')

# tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization to the text column in the DataFrame
df['text'] = df['text'].apply(tokenize_text)

# Remove any non-English words
english_words = set(words.words()) 
def remove_non_english(tokens):
    english_tokens = []
    for word in tokens:
        if word in english_words:
            english_tokens.append(word)
        else:
            english_tokens.append('')
    return [token for token in english_tokens if token != '']
df['text'] = df['text'].apply(remove_non_english)

df['text'] = df['text'].apply(lambda tokens: ' '.join(tokens))

df.to_csv("aug_final.csv", index=False)


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ChokJoe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import pandas as pd

# load dataset
df = pd.read_csv('augted_test.csv')  

# select 10000 data for each sentiment
lower_limit = 100
upper_limit = 1000

# grp the data by sentiment and perform downsampling within each grp
downsampled_data = df.groupby('category').apply(lambda x: x.sample(n=min(upper_limit, max(lower_limit, len(x))), random_state=42))

downsampled_data = downsampled_data.reset_index(drop=True)

downsampled_data.to_csv('balanced_category_test.csv', index=False)
downsampled_data['category'].value_counts()

  downsampled_data = df.groupby('category').apply(lambda x: x.sample(n=min(upper_limit, max(lower_limit, len(x))), random_state=42))


category
business         1000
entertainment    1000
health           1000
news             1000
opinions         1000
politics         1000
sport            1000
us               1000
world            1000
weather           570
Name: count, dtype: int64

In [None]:
import pandas as pd

# load dataset
df = pd.read_csv('augted_test.csv') 
