In [None]:
import pandas as pd
import nltk
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support

# Load the sentence polarity data
nltk.download('sentence_polarity')
from nltk.corpus import sentence_polarity

# Load the sentence polarity dataset
sentences = list(sentence_polarity.sents())
random.shuffle(sentences)

# Create a list of documents with the sentence and label
documents = [(sent, 'pos' if cat=='pos' else 'neg') for cat in sentence_polarity.categories()
            for sent in sentence_polarity.sents(categories=cat)]

# Get the most common words as features
all_words = [word.lower() for (sent,cat) in documents for word in sent]
all_word_freq = nltk.FreqDist(all_words)
word_features = list(all_word_freq.keys())[:2000]


# this list of negation words includes some "approximate negators" like hardly and rarely
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']


# Define document features using bag of words
def document_features(document, word_features):
    document_words = set([word.lower() for word in document])
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features



def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features



# Extract features for each document
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]
labels = [c for (d,c) in featuresets]
features = [d for (d,c) in featuresets]




# Extract features for each document
# define the feature sets
NOT_featuresets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in documents]
labels_stopWord = [c for (d,c) in NOT_featuresets]
features_stopWord = [d for (d,c) in NOT_featuresets]


# K-fold cross validation
kfold = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
accuracies = []
for train_idx, test_idx in kfold.split(features, labels):

    # Split data
    train_features, test_features = [features[i] for i in train_idx], [features[i] for i in test_idx]
    train_labels, test_labels = [labels[i] for i in train_idx], [labels[i] for i in test_idx]

    # Train classifier
    train_set = list(zip(train_features, train_labels))
    test_set = list(zip(test_features, test_labels))
    model = nltk.NaiveBayesClassifier.train(train_set)

    # Predict on test set
    predictions = model.classify_many(test_features)

    # Compute accuracy
    accuracy = nltk.classify.accuracy(model, test_set)
    accuracies.append(accuracy)

    # Map string labels to numbers
    label_map = {'pos': 1, 'neg': 0}

    y_true = [label_map[y] for y in test_labels]
    y_pred = [label_map[y] for y in predictions]

    # Evaluate scores
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

# Print average scores
print("Average Accuracy of Unigram:", sum(accuracies)/len(accuracies))
print("Average Precision of Unigram:", sum(precisions)/len(precisions))
print("Average Recall of Unigram:", sum(recalls)/len(recalls))
print("Average F1 Score of Unigram:", sum(f1s)/len(f1s))






# K-fold cross validation
kfold = KFold(n_splits=5)
precisions = []
recalls = []
f1s = []
accuracies = []
for train_idx, test_idx in kfold.split(features_stopWord, labels_stopWord):

    # Split data
    train_features, test_features = [features_stopWord[i] for i in train_idx], [features_stopWord[i] for i in test_idx]
    train_labels, test_labels = [labels_stopWord[i] for i in train_idx], [labels_stopWord[i] for i in test_idx]

    # Train classifier
    train_set = list(zip(train_features, train_labels))
    test_set = list(zip(test_features, test_labels))
    model = nltk.NaiveBayesClassifier.train(train_set)

    # Predict on test set
    predictions = model.classify_many(test_features)

    # Compute accuracy
    accuracy = nltk.classify.accuracy(model, test_set)
    accuracies.append(accuracy)

    # Map string labels to numbers
    label_map = {'pos': 1, 'neg': 0}

    y_true = [label_map[y] for y in test_labels]
    y_pred = [label_map[y] for y in predictions]

    # Evaluate scores
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

# Print average scores
print("Average Accuracy of Negation:", sum(accuracies)/len(accuracies))
print("Average Precision of Negation:", sum(precisions)/len(precisions))
print("Average Recall of Negation:", sum(recalls)/len(recalls))
print("Average F1 Score of Negation:", sum(f1s)/len(f1s))






# Load and preprocess Fake and True data
from google.colab import drive
drive.mount("/drive", force_remount=True)
import csv
import pandas as pd
# Read true data
real_news = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/True.csv')
# Read fake data
fake_news = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/Fake.csv')



# Analyze first 50 fake news articles for negation
fake_analysis_negation = []
for index, row in fake_news[:50].iterrows():
    pos_count = 0
    neg_count = 0
    for sentence in row['text'].split('.'):
        if model.classify(NOT_features(sentence.split(), word_features, negationwords)) == 'pos':
            pos_count += 1
        else:
            neg_count += 1
    fake_analysis_negation.append([row['text'], pos_count, neg_count])



# Write Negation unigram fake news analysis to CSV
header = ['text', 'the number of positive sentences in text', 'the number of negative sentences in text']
with open('fake_analysis_negation.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(fake_analysis_negation)




# Analyze first 50 real news articles for negation
real_analysis_negation = []
for index, row in real_news[:50].iterrows():
    pos_count = 0
    neg_count = 0
    for sentence in row['text'].split('.'):
        if model.classify(NOT_features(sentence.split(), word_features , negationwords)) == 'pos':
            pos_count += 1
        else:
            neg_count += 1
    real_analysis_negation.append([row['text'], pos_count, neg_count])



# Write negation real news analysis to CSV
with open('real_analysis_negation.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(header)
   writer.writerows(real_analysis_negation)




[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Average Accuracy of Unigram: 0.6160308526162185
Average Precision of Unigram: 0.5376824034334764
Average Recall of Unigram: 0.40304847277439687
Average F1 Score of Unigram: 0.4534600715617726


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Average Accuracy of Negation: 0.7354302398914935
Average Precision of Negation: 0.5453947368421053
Average Recall of Negation: 0.48671135251472186
Average F1 Score of Negation: 0.511063721852422
Mounted at /drive


Based on comparing the evaluation metrics between the unigram and negation models, we can make a few observations:
The negation model has higher accuracy than the unigram model (0.735 vs 0.616). Adding explicit negation features improved accuracy.
Precision is slightly higher for unigrams, while recall is better for negation. This indicates negation reduced some false positives but at the cost of fewer true positives.
The F1 score, which balances precision and recall, is higher for the negation model compared to unigrams (0.511 vs 0.453). The overall F1 improvement shows the benefits of modeling negation outweigh the precision drop.

In summary, the negation model leads to large gains in accuracy and modest gains in F1 score. This suggests explicitly handling negation sentiment reversals is beneficial for this dataset, despite a small reduction in precision. The accuracy and F1 improvements show the advantages of adding negation features compared to just unigram bag-of-words.

------------------------------------------------------------

I chose to use the sentence polarity corpus from NLTK for this sentiment analysis assignment for a few key reasons:

It provides a nicely preprocessed sentiment dataset for training and evaluation. The sentences are labeled with positive/negative sentiment tags.

The sentences come from movie reviews so they represent realistic language use and opinions versus synthetic data.

It has a good size for experimentation - over 10,000 sentences split between train and test sets. Large enough for meaningful results.

The domain of movie reviews is relevant for common sentiment analysis applications around entertainment, product reviews, etc.

Some key advantages this dataset provides:

Realistic language examples for training sentiment classifiers.

Cleanly labeled data for supervised learning.

Domain-specific text from movie reviews.

Relevance to common sentiment analysis applications.

The sentence polarity corpus has been a great way to quickly get up and running with core sentiment analysis techniques. The dataset quality and characteristics make it very suitable for this type of analysis.

In [None]:


# Load CSV results
fake_negation = pd.read_csv('/content/fake_analysis_negation.csv')
real_negation = pd.read_csv('/content/real_analysis_negation.csv')


# Compare positive and negative sentences
print("Average Pos Sentences:")
print("Fake:", fake_negation['the number of positive sentences in text'].mean())
print("Real:", real_negation['the number of positive sentences in text'].mean())

print("\nAverage Neg Sentences:")
print("Fake:", fake_negation['the number of negative sentences in text'].mean())
print("Real:", real_negation['the number of negative sentences in text'].mean())

# Explanation and discussion



Average Pos Sentences:
Fake: 8.42
Real: 9.3

Average Neg Sentences:
Fake: 13.48
Real: 16.66


Based on the output showing the average positive and negative sentences in the fake vs real news CSVs, we can make a few observations:
The real news contains more positive sentences on average than the fake news (9.3 vs 8.42)
The real news contains significantly more negative sentences on average compared to the fake news (16.66 vs 13.48).
This suggests:
The fake news articles use less positive sentiment language overall compared to real news. This could indicate fake news tends to be more neutral in tone.
The fake news articles contain much less negative sentiment language on average. The lower amount of negative sentences could suggest fake news avoids strong negative language.
The combination of less positive and fewer negative sentences makes the fake news sentiment more neutral compared to real news.
So in summary, the fake news sentiment appears more neutral and avoided extremes of positive or negative sentiment based on this analysis. This could help fake news avoid seeming too over-the-top or emotional compared to real reporting.

In [None]:
# Top 20 rows of the csv files

df1 = pd.read_csv('/content/fake_analysis_negation.csv')
print(df1.head(20))

                                                 text  \
0   Donald Trump just couldn t wish all Americans ...   
1   House Intelligence Committee Chairman Devin Nu...   
2   On Friday, it was revealed that former Milwauk...   
3   On Christmas day, Donald Trump announced that ...   
4   Pope Francis used his annual Christmas Day mes...   
5   The number of cases of cops brutalizing and ki...   
6   Donald Trump spent a good portion of his day a...   
7   In the wake of yet another court decision that...   
8   Many people have raised the alarm regarding th...   
9   Just when you might have thought we d get a br...   
10  A centerpiece of Donald Trump s campaign, and ...   
11  Republicans are working overtime trying to sel...   
12  Republicans have had seven years to come up wi...   
13  The media has been talking all day about Trump...   
14  Abigail Disney is an heiress with brass ovarie...   
15  Donald Trump just signed the GOP tax scam into...   
16  A new animatronic figure in

In [None]:
df2 = pd.read_csv('/content/real_analysis_negation.csv')
print(df2.head(20))

                                                 text  \
0   WASHINGTON (Reuters) - The head of a conservat...   
1   WASHINGTON (Reuters) - Transgender people will...   
2   WASHINGTON (Reuters) - The special counsel inv...   
3   WASHINGTON (Reuters) - Trump campaign adviser ...   
4   SEATTLE/WASHINGTON (Reuters) - President Donal...   
5   WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...   
6   WEST PALM BEACH, Fla (Reuters) - President Don...   
7   The following statements were posted to the ve...   
8   The following statements were posted to the ve...   
9   WASHINGTON (Reuters) - Alabama Secretary of St...   
10  (Reuters) - Alabama officials on Thursday cert...   
11  NEW YORK/WASHINGTON (Reuters) - The new U.S. t...   
12  The following statements were posted to the ve...   
13  The following statements were posted to the ve...   
14   (In Dec. 25 story, in second paragraph, corre...   
15  (Reuters) - A lottery drawing to settle a tied...   
16  WASHINGTON (Reuters) - A Ge

------------------------------------------------------------


Unigram Model
This model uses a bag-of-words representation with unigram features.
The features are individual words from the sentences.
Each sentence is represented by a feature vector indicating the presence/absence of the top 2000 most frequent words.
No preprocessing is done - all words are used as features as-is.

Negation Model
This model builds on top of the unigram features.
Additional negation features are added to capture negation.
Negation words like "not", "no", etc. are defined.
When a negation word appears, the next word is marked as negated.
So "not good" would have the "good" feature negated.
This aims to handle negation more explicitly.

The negation model should help improve performance by better encoding sentiment reversing negation words.

------------------------------------------------------------



There is no data pre-processing tasks to be conducted in Sentiment Analysis. Words should not be convereted into lowercase to preserve capitalized words. Removal od non-words also should not be done to preserve punctuation.Removal of stop words can be done separately as a feature set itself. However, tokenization should be done.

------------------------------------------------------------



Here are some key things I learned from working on this sentiment analysis assignment:

Preprocessing like normalization and handling negation is very important for sentiment analysis. My initial bag-of-words model would have benefited from more preprocessing.

Adding features like negation handling can significantly improve model performance for sentiment classification, despite using a simple Naive Bayes classifier.

Validation techniques like k-fold cross-validation are essential for properly evaluating and comparing models.
Sentiment analysis provides an interesting way to compare qualities like positive/negative language in different text sources like real vs fake news.

There are lots of options for enhancing sentiment classifiers like utilizing lexicons, handling stop words, handling negation scope, etc. Lots of room to experiment.
Sentiment analysis has many practical applications in fields like marketing, social media monitoring, and reviewing systems. It can provide business insights.

There are some key challenges like understanding context, sarcasm, ambiguous language that require more advanced techniques.

Overall, it was great hands-on practice for sentiment analysis techniques. I have a much better understanding now of building, evaluating, and applying these types of models.
