**Sentiment Polarity - Classifier Building**


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive
import nltk
nltk.download('words')
drive.mount("/drive", force_remount=True)
import csv
import nltk
nltk.download('stopwords')
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from textblob import TextBlob
import re

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Mounted at /drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/Tweets.csv')
# I chose the Twitter airline data for it's relevance to real-world application. Analyzing sentiment in airline-related tweets can be useful for airlines to understand customer opinions and improve their services.
# With this dataset I can explore different features, preprocessing techniques, and classifiers to fine-tune the sentiment analysis model. This dataset allows me to experiment with various approaches in NLP.

In [9]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [10]:
data.shape

(14640, 15)

In [11]:
#Data pre-processing
data = data.drop_duplicates()
#data = data.dropna() # Removing duplicates and handling missing values
data_required=data[["text","airline_sentiment"]]

In [56]:
# Text Preprocessing
# Cleaning the text data by removing special characters, URLs, mentions, and other noise.
# Tokenize, apply lowercasing, and removing stop words.

In [12]:
documents=[]
for _,row in data_required.iterrows():
    documents.append((re.findall('[a-z]+', row["text"].lower()),row["airline_sentiment"]))

In [13]:
documents[0]

(['virginamerica', 'what', 'dhepburn', 'said'], 'neutral')

In [14]:
all_words_list_baseline = [word for (sent,cat) in documents for word in sent]

In [15]:
#Filter non words and stop words
nltk_words = list(set(nltk.corpus.words.words()))
nltk_words=[x.lower() for x in nltk_words]
nltk_words=set(nltk_words)
stopwords=nltk.corpus.stopwords.words('english')
stopwords=[x.lower() for x in stopwords]
stopwords=stopwords+["united","plane","flight"]
stopwords=set(stopwords)

all_words_list_baseline_filtered=[]
for word in all_words_list_baseline:
    if word not in stopwords and word in nltk_words:
        all_words_list_baseline_filtered.append(word)

In [16]:
all_words_list_baseline_filtered

['said',
 'plus',
 'added',
 'experience',
 'tacky',
 'today',
 'must',
 'mean',
 'need',
 'take',
 'another',
 'trip',
 'really',
 'aggressive',
 'blast',
 'obnoxious',
 'entertainment',
 'little',
 'recourse',
 'really',
 'big',
 'bad',
 'thing',
 'seriously',
 'would',
 'pay',
 'really',
 'bad',
 'thing',
 'flying',
 'yes',
 'nearly',
 'every',
 'time',
 'fly',
 'ear',
 'worm',
 'go',
 'away',
 'really',
 'prime',
 'opportunity',
 'men',
 'without',
 'parody',
 'well',
 'amazing',
 'hour',
 'early',
 'good',
 'know',
 'suicide',
 'second',
 'leading',
 'cause',
 'death',
 'among',
 'teens',
 'pretty',
 'graphics',
 'much',
 'better',
 'minimal',
 'iconography',
 'great',
 'deal',
 'already',
 'thinking',
 'trip',
 'australia',
 'even',
 'gone',
 'st',
 'trip',
 'yet',
 'p',
 'flying',
 'fabulous',
 'seductive',
 'skies',
 'u',
 'take',
 'stress',
 'away',
 'travel',
 'thanks',
 'schedule',
 'still',
 'excited',
 'first',
 'cross',
 'country',
 'lax',
 'nothing',
 'great',
 'virgin',

In [17]:
all_words_freq = nltk.FreqDist(all_words_list_baseline_filtered)
word_items = all_words_freq.most_common(4000)
word_features_baseline = [word for (word,count) in word_items]

In [18]:
word_items

[('get', 1342),
 ('thanks', 1078),
 ('service', 965),
 ('help', 873),
 ('time', 792),
 ('customer', 755),
 ('us', 705),
 ('hold', 641),
 ('thank', 604),
 ('still', 584),
 ('one', 571),
 ('please', 568),
 ('would', 560),
 ('need', 555),
 ('back', 525),
 ('gate', 518),
 ('call', 515),
 ('bag', 491),
 ('hour', 486),
 ('got', 466),
 ('today', 429),
 ('phone', 429),
 ('like', 429),
 ('late', 425),
 ('fly', 391),
 ('waiting', 369),
 ('airport', 362),
 ('u', 359),
 ('know', 358),
 ('way', 355),
 ('trying', 348),
 ('day', 342),
 ('great', 335),
 ('going', 325),
 ('wait', 323),
 ('never', 321),
 ('flying', 315),
 ('change', 315),
 ('make', 315),
 ('weather', 313),
 ('go', 312),
 ('tomorrow', 310),
 ('last', 305),
 ('really', 302),
 ('check', 301),
 ('delay', 301),
 ('good', 297),
 ('home', 296),
 ('even', 294),
 ('aa', 291),
 ('people', 288),
 ('w', 280),
 ('seat', 276),
 ('another', 272),
 ('new', 270),
 ('told', 270),
 ('want', 269),
 ('first', 268),
 ('take', 266),
 ('see', 265),
 ('luggage'

In [19]:
word_features_baseline

['get',
 'thanks',
 'service',
 'help',
 'time',
 'customer',
 'us',
 'hold',
 'thank',
 'still',
 'one',
 'please',
 'would',
 'need',
 'back',
 'gate',
 'call',
 'bag',
 'hour',
 'got',
 'today',
 'phone',
 'like',
 'late',
 'fly',
 'waiting',
 'airport',
 'u',
 'know',
 'way',
 'trying',
 'day',
 'great',
 'going',
 'wait',
 'never',
 'flying',
 'change',
 'make',
 'weather',
 'go',
 'tomorrow',
 'last',
 'really',
 'check',
 'delay',
 'good',
 'home',
 'even',
 'aa',
 'people',
 'w',
 'seat',
 'another',
 'new',
 'told',
 'want',
 'first',
 'take',
 'see',
 'luggage',
 'agent',
 'ticket',
 'love',
 'could',
 'ever',
 'number',
 'getting',
 'worst',
 'lost',
 'yes',
 'due',
 'someone',
 'travel',
 'next',
 'work',
 'min',
 'let',
 'days',
 'crew',
 'much',
 'baggage',
 'flighted',
 'two',
 'trip',
 'made',
 'right',
 'response',
 'experience',
 'sitting',
 'best',
 'sent',
 'staff',
 'bad',
 'boarding',
 'better',
 'long',
 'book',
 'said',
 'already',
 'booked',
 'line',
 'left',
 

In [20]:
#Function to get features
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features


In [21]:
#Create feature set
featuresets = [(document_features(d, word_features_baseline), c) for (d, c) in documents]

In [22]:
featuresets

Output hidden; open in https://colab.research.google.com to view.

In [23]:
len(featuresets)

14604

In [24]:
# Split the data into a training set and a test set
X = [features for (features, label) in featuresets]
y = [label for (features, label) in featuresets]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
from sklearn.feature_extraction import DictVectorizer

# Create a DictVectorizer to convert the feature dictionaries to a numeric format
vectorizer = DictVectorizer(sparse=False)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize and train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)



In [26]:
X = np.array([list(x.values()) for x in X])
cv_scores = cross_val_score(classifier, X, y, cv=5, scoring='accuracy')

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/Tweets.csv')  # Replace with your dataset file path

# Data preprocessing
data['text'] = data['text'].str.replace(r'http\S+|www\S+|https\S+', '', case=False)
data['text'] = data['text'].str.replace('[^a-zA-Z]', ' ')
data['text'] = data['text'].str.lower()

# Label mapping
sentiment_mapping = {'negative': 0, 'neutral': 2, 'positive': 1}
data['label'] = data['airline_sentiment'].map(sentiment_mapping)

# Split the data into training and testing sets
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text vectorization
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Model training
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# Model evaluation
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


  data['text'] = data['text'].str.replace(r'http\S+|www\S+|https\S+', '', case=False)
  data['text'] = data['text'].str.replace('[^a-zA-Z]', ' ')


Accuracy: 0.7940573770491803
              precision    recall  f1-score   support

           0       0.80      0.96      0.87      1889
           1       0.84      0.59      0.69       459
           2       0.71      0.41      0.52       580

    accuracy                           0.79      2928
   macro avg       0.78      0.65      0.70      2928
weighted avg       0.79      0.79      0.78      2928



**Sentiment Polarity – Analysis of Fake and Real “text”**

In [36]:
import pandas as pd
from textblob import TextBlob
import re
from tqdm import tqdm
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
fake_data = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/Fake.csv')
real_data = pd.read_csv('/drive/My Drive/Colab Notebooks/IST 664/2023/True.csv')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
# Display the first 20 rows of each dataset
print("First 20 rows of Fake.csv:")
print(fake_data.head(20))

print("First 20 rows of Real.csv:")
print(real_data.head(20))

First 20 rows of Fake.csv:
                                                title  \
0    Donald Trump Sends Out Embarrassing New Year’...   
1    Drunk Bragging Trump Staffer Started Russian ...   
2    Sheriff David Clarke Becomes An Internet Joke...   
3    Trump Is So Obsessed He Even Has Obama’s Name...   
4    Pope Francis Just Called Out Donald Trump Dur...   
5    Racist Alabama Cops Brutalize Black Boy While...   
6    Fresh Off The Golf Course, Trump Lashes Out A...   
7    Trump Said Some INSANELY Racist Stuff Inside ...   
8    Former CIA Director Slams Trump Over UN Bully...   
9    WATCH: Brand-New Pro-Trump Ad Features So Muc...   
10   Papa John’s Founder Retires, Figures Out Raci...   
11   WATCH: Paul Ryan Just Told Us He Doesn’t Care...   
12   Bad News For Trump — Mitch McConnell Says No ...   
13   WATCH: Lindsey Graham Trashes Media For Portr...   
14   Heiress To Disney Empire Knows GOP Scammed Us...   
15   Tone Deaf Trump: Congrats Rep. Scalise On Los...   
16  

In [53]:
analysis_results = pd.DataFrame(columns=["text", "the number of positive sentences in text", "the number of negative sentences in text"])
#Initializing a DataFrame for analysis results

We used a Bag-of-Words (BoW) approach for feature extraction, which is a simple yet effective method for converting text data into numerical features.
We converted the text data into a BoW representation where each word was treated as a feature. This means that the features are binary, indicating the presence or absence of specific words in the text.
We applied CountVectorizer to create a matrix where rows represent articles, and columns represent words. The value in each cell of the matrix indicates the count of a specific word in the respective article.


In [54]:
# Defining a  function to classify sentences
def classify_sentences(text):
    blob = TextBlob(text)
    positive_count = 0
    negative_count = 0
    for sentence in blob.sentences:
        sentiment = sentence.sentiment.polarity
        if sentiment > 0:
            positive_count += 1
        elif sentiment < 0:
            negative_count += 1
    return positive_count, negative_count

Pre-processing was done by removing stopwords, handled negations, distinguished between phrases like "not good" and "good."
Experimented with using sentiment lexicons that contain scores or counts of words with sentiment values.

In [55]:
# Processing fake news articles
for i in range(50):
    text = fake_data.loc[i, "text"]
    positive_count, negative_count = classify_sentences(text)
    analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)

  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)


In [56]:
# Processing real news articles
for i in range(50):
    text = real_data.loc[i, "text"]
    positive_count, negative_count = classify_sentences(text)
    analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)


  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)
  analysis_results = analysis_results.append({"text": text, "the number of positive sentences in text": positive_count, "the number of negative sentences in text": negative_count}, ignore_index=True)


In [57]:
analysis_results.to_csv("sentiment_analysis_results.csv", index=False)
analysis_results = pd.read_csv("sentiment_analysis_results.csv")
## Load the analysis results from the CSV file

In [58]:
fake_positive_count = analysis_results[analysis_results.index < 50]["the number of positive sentences in text"].sum()
fake_negative_count = analysis_results[analysis_results.index < 50]["the number of negative sentences in text"].sum()
# Calculate the total counts of positive and negative sentences for fake news articles

We examine whether fake news content contains more positive or negative sentences. The code calculates the percentage of positive and negative sentences in both fake and real news articles. You can draw conclusions based on these percentages.

In [59]:
real_positive_count = analysis_results[analysis_results.index >= 50]["the number of positive sentences in text"].sum()
real_negative_count = analysis_results[analysis_results.index >= 50]["the number of negative sentences in text"].sum()
# Calculate the total counts of positive and negative sentences for real news articles

In [60]:
total_fake_sentences = len(analysis_results[analysis_results.index < 50])
total_real_sentences = len(analysis_results[analysis_results.index >= 50])
#Calculating the percentages

In [61]:
percentage_positive_fake = (fake_positive_count / total_fake_sentences) * 100
percentage_negative_fake = (fake_negative_count / total_fake_sentences) * 100

In [62]:
percentage_positive_real = (real_positive_count / total_real_sentences) * 100
percentage_negative_real = (real_negative_count / total_real_sentences) * 100

In [63]:
print(f"Percentage of positive sentences in fake news articles: {percentage_positive_fake:.2f}%")
print(f"Percentage of negative sentences in fake news articles: {percentage_negative_fake:.2f}%")
print(f"Percentage of positive sentences in real news articles: {percentage_positive_real:.2f}%")
print(f"Percentage of negative sentences in real news articles: {percentage_negative_real:.2f}%")
## Comparing the percentages obtained to draw conclusions about whether fake news content tends to be more positive or negative

Percentage of positive sentences in fake news articles: 582.00%
Percentage of negative sentences in fake news articles: 384.00%
Percentage of positive sentences in real news articles: 664.00%
Percentage of negative sentences in real news articles: 356.00%


Among the analyzed fake news articles, 582% of the sentences are classified as positive.
384% of the sentences in fake news articles are classified as negative.
Among the analyzed real news articles, 664% of the sentences are classified as positive.
356% of the sentences in real news articles are classified as negative

In [64]:
if percentage_positive_fake > percentage_positive_real:
    print("Fake news articles tend to contain more positive sentences.")
elif percentage_positive_fake < percentage_positive_real:
    print("Real news articles tend to contain more positive sentences.")
else:
    print("The sentiment distribution is similar between fake and real news articles.")

Real news articles tend to contain more positive sentences.


On completing this sentiment analysis, I have learnt the following:
The importance of data preprocessing: Cleaning and preparing  text data is crucial for accurate sentiment analysis.
The choice of features: Experimenting with different features and preprocessing techniques can significantly impact the results.
Sentiment analysis as a tool is used for understanding the emotional tone of text data, which has applications in various fields.
I also understood the impact of fake news on sentiment.
