In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
import matplotlib.pyplot as plt

In [2]:
# Load the dataset
# Assuming 'df' is the DataFrame containing the fake news dataset
# Replace 'your_dataset.csv' with the actual file path or method to load your dataset
df = pd.read_csv('twcs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811774 entries, 0 to 2811773
Data columns (total 7 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   tweet_id                 int64  
 1   author_id                object 
 2   inbound                  bool   
 3   created_at               object 
 4   text                     object 
 5   response_tweet_id        object 
 6   in_response_to_tweet_id  float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 131.4+ MB


In [4]:
df.describe()

Unnamed: 0,tweet_id,in_response_to_tweet_id
count,2811774.0,2017439.0
mean,1504565.0,1463141.0
std,861645.0,866573.0
min,1.0,1.0
25%,760165.2,715510.5
50%,1507772.0,1439805.0
75%,2253296.0,2220646.0
max,2987950.0,2987950.0


In [5]:
df.isnull().sum()

tweet_id                         0
author_id                        0
inbound                          0
created_at                       0
text                             0
response_tweet_id          1040629
in_response_to_tweet_id     794335
dtype: int64

In [6]:
# Handling missing values
# Remove rows with any missing values
df.dropna(inplace=True)

In [7]:
# Remove duplicates
df.drop_duplicates(inplace=True)

In [13]:
# Text preprocessing
stop_words = set(stopwords.words('english'))

In [14]:
def preprocess_text(text):
    # Function to clean and preprocess text
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and symbols
    text = ' '.join([word for word in text.split() if word.isalnum()])
    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [15]:
# Apply text preprocessing to 'title' and 'text' columns
df['text_cleaned'] = df['text'].apply(preprocess_text)

In [16]:
# Sentiment analysis using TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    # Get polarity score
    return analysis.sentiment.polarity

In [17]:
# Apply sentiment analysis to 'text' column
df['text_sentiment'] = df['text_cleaned'].apply(get_sentiment)

In [18]:
# Classify sentiments into positive, negative, or neutral
def get_sentiment_category(score):
    if score > 0.1:
        return 'Positive'
    elif score < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

In [19]:
# Apply sentiment category classification
df['sentiment_category'] = df['text_sentiment'].apply(get_sentiment_category)

In [20]:
# Display results of sentiment categories
print(df['sentiment_category'].value_counts())

Neutral     592025
Positive    253629
Negative    131156
Name: sentiment_category, dtype: int64


In [None]:
# Visualize sentiment distribution
plt.figure(figsize=(8, 6))
df['sentiment_category'].value_counts().plot(kind='bar', color='skyblue')
plt.title('Sentiment Category Distribution')
plt.xlabel('Sentiment Category')
plt.ylabel('Frequency')
plt.show()


In [21]:
# Save DataFrame to Excel file with sentiment scores and categories
df.to_excel('sentiment_analysis_results.xlsx', index=False)